tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/lexer.c ADDED
@@ -0,0 +1,3825 @@
1
+ /* lexer.c -- Lexer for html parser
2
+
3
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: arnaud02 $
9
+ $Date: 2008/03/22 21:06:55 $
10
+ $Revision: 1.194 $
11
+
12
+ */
13
+
14
+ /*
15
+ Given a file stream fp it returns a sequence of tokens.
16
+
17
+ GetToken(fp) gets the next token
18
+ UngetToken(fp) provides one level undo
19
+
20
+ The tags include an attribute list:
21
+
22
+ - linked list of attribute/value nodes
23
+ - each node has 2 NULL-terminated strings.
24
+ - entities are replaced in attribute values
25
+
26
+ white space is compacted if not in preformatted mode
27
+ If not in preformatted mode then leading white space
28
+ is discarded and subsequent white space sequences
29
+ compacted to single space characters.
30
+
31
+ If XmlTags is no then tag names and attribute names
32
+ are both folded to lower case.
33
+
34
+ Not yet done:
35
+ - Doctype subset and marked sections
36
+ */
37
+
38
+ #include "tidy-int.h"
39
+ #include "lexer.h"
40
+ #include "parser.h"
41
+ #include "entities.h"
42
+ #include "streamio.h"
43
+ #include "message.h"
44
+ #include "tmbstr.h"
45
+ #include "clean.h"
46
+ #include "utf8.h"
47
+ #include "streamio.h"
48
+
49
+ /* Forward references
50
+ */
51
+ /* swallows closing '>' */
52
+ static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
53
+
54
+ static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty,
55
+ Node **asp, Node **php );
56
+
57
+ static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
58
+ Bool *isempty, int *pdelim );
59
+
60
+ static Node *ParseDocTypeDecl(TidyDocImpl* doc);
61
+
62
+ static void AddAttrToList( AttVal** list, AttVal* av );
63
+
64
+ /* used to classify characters for lexical purposes */
65
/* Look up the lexmap classification bits for c. Any value that falls
** outside the 128-entry table once converted to unsigned yields 0.
** The argument is parenthesized so that an expression argument is
** converted and compared as a whole. Note that c is evaluated twice,
** so arguments with side effects must be avoided.
*/
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
66
+ static uint lexmap[128];
67
+
68
+ #define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
69
+ #define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
70
+
71
+ static struct _doctypes
72
+ {
73
+ uint score;
74
+ uint vers;
75
+ ctmbstr name;
76
+ ctmbstr fpi;
77
+ ctmbstr si;
78
+ } const W3C_Doctypes[] =
79
+ {
80
+ { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML 2.0//EN", NULL, },
81
+ { 2, HT20, "HTML 2.0", "-//IETF//DTD HTML//EN", NULL, },
82
+ { 2, HT20, "HTML 2.0", "-//W3C//DTD HTML 2.0//EN", NULL, },
83
+ { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2//EN", NULL, },
84
+ { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Final//EN", NULL, },
85
+ { 1, HT32, "HTML 3.2", "-//W3C//DTD HTML 3.2 Draft//EN", NULL, },
86
+ { 6, H40S, "HTML 4.0 Strict", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd" },
87
+ { 8, H40T, "HTML 4.0 Transitional", "-//W3C//DTD HTML 4.0 Transitional//EN", "http://www.w3.org/TR/REC-html40/loose.dtd" },
88
+ { 7, H40F, "HTML 4.0 Frameset", "-//W3C//DTD HTML 4.0 Frameset//EN", "http://www.w3.org/TR/REC-html40/frameset.dtd" },
89
+ { 3, H41S, "HTML 4.01 Strict", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd" },
90
+ { 5, H41T, "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd" },
91
+ { 4, H41F, "HTML 4.01 Frameset", "-//W3C//DTD HTML 4.01 Frameset//EN", "http://www.w3.org/TR/html4/frameset.dtd" },
92
+ { 9, X10S, "XHTML 1.0 Strict", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd" },
93
+ { 11, X10T, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd" },
94
+ { 10, X10F, "XHTML 1.0 Frameset", "-//W3C//DTD XHTML 1.0 Frameset//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd" },
95
+ { 12, XH11, "XHTML 1.1", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd" },
96
+ { 13, XB10, "XHTML Basic 1.0", "-//W3C//DTD XHTML Basic 1.0//EN", "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd" },
97
+
98
+ /* reminder to add XHTML Print 1.0 support, see http://www.w3.org/TR/xhtml-print */
99
+ #if 0
100
+ { 14, XP10, "XHTML Print 1.0", "-//W3C//DTD XHTML-Print 1.0//EN", "http://www.w3.org/MarkUp/DTD/xhtml-print10.dtd" },
101
+ { 14, XP10, "XHTML Print 1.0", "-//PWG//DTD XHTML-Print 1.0//EN", "http://www.xhtml-print.org/xhtml-print/xhtml-print10.dtd" },
102
+ #endif
103
+ /* final entry */
104
+ { 0, 0, NULL, NULL, NULL }
105
+ };
106
+
107
+ int TY_(HTMLVersion)(TidyDocImpl* doc)
108
+ {
109
+ uint i;
110
+ uint j = 0;
111
+ uint score = 0;
112
+ uint vers = doc->lexer->versions;
113
+ uint dtver = doc->lexer->doctype;
114
+ TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
115
+ Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
116
+ !cfgBool(doc, TidyHtmlOut);
117
+ Bool html4 = dtmode == TidyDoctypeStrict || dtmode == TidyDoctypeLoose || VERS_FROM40 & dtver;
118
+
119
+ for (i = 0; W3C_Doctypes[i].name; ++i)
120
+ {
121
+ if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
122
+ (html4 && !(VERS_FROM40 & W3C_Doctypes[i].vers)))
123
+ continue;
124
+
125
+ if (vers & W3C_Doctypes[i].vers &&
126
+ (W3C_Doctypes[i].score < score || !score))
127
+ {
128
+ score = W3C_Doctypes[i].score;
129
+ j = i;
130
+ }
131
+ }
132
+
133
+ if (score)
134
+ return W3C_Doctypes[j].vers;
135
+
136
+ return VERS_UNKNOWN;
137
+ }
138
+
139
+ static ctmbstr GetFPIFromVers(uint vers)
140
+ {
141
+ uint i;
142
+
143
+ for (i = 0; W3C_Doctypes[i].name; ++i)
144
+ if (W3C_Doctypes[i].vers == vers)
145
+ return W3C_Doctypes[i].fpi;
146
+
147
+ return NULL;
148
+ }
149
+
150
+ static ctmbstr GetSIFromVers(uint vers)
151
+ {
152
+ uint i;
153
+
154
+ for (i = 0; W3C_Doctypes[i].name; ++i)
155
+ if (W3C_Doctypes[i].vers == vers)
156
+ return W3C_Doctypes[i].si;
157
+
158
+ return NULL;
159
+ }
160
+
161
+ static ctmbstr GetNameFromVers(uint vers)
162
+ {
163
+ uint i;
164
+
165
+ for (i = 0; W3C_Doctypes[i].name; ++i)
166
+ if (W3C_Doctypes[i].vers == vers)
167
+ return W3C_Doctypes[i].name;
168
+
169
+ return NULL;
170
+ }
171
+
172
+ static uint GetVersFromFPI(ctmbstr fpi)
173
+ {
174
+ uint i;
175
+
176
+ for (i = 0; W3C_Doctypes[i].name; ++i)
177
+ if (TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
178
+ return W3C_Doctypes[i].vers;
179
+
180
+ return 0;
181
+ }
182
+
183
+ /* everything is allowed in proprietary version of HTML */
184
+ /* this is handled here rather than in the tag/attr dicts */
185
+ void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
186
+ {
187
+ doc->lexer->versions &= (vers | VERS_PROPRIETARY);
188
+ }
189
+
190
+ Bool TY_(IsWhite)(uint c)
191
+ {
192
+ uint map = MAP(c);
193
+
194
+ return (map & white)!=0;
195
+ }
196
+
197
+ Bool TY_(IsNewline)(uint c)
198
+ {
199
+ uint map = MAP(c);
200
+ return (map & newline)!=0;
201
+ }
202
+
203
+ Bool TY_(IsDigit)(uint c)
204
+ {
205
+ uint map;
206
+
207
+ map = MAP(c);
208
+
209
+ return (map & digit)!=0;
210
+ }
211
+
212
+ static Bool IsDigitHex(uint c)
213
+ {
214
+ uint map;
215
+
216
+ map = MAP(c);
217
+
218
+ return (map & digithex)!=0;
219
+ }
220
+
221
+ Bool TY_(IsLetter)(uint c)
222
+ {
223
+ uint map;
224
+
225
+ map = MAP(c);
226
+
227
+ return (map & letter)!=0;
228
+ }
229
+
230
+ Bool TY_(IsNamechar)(uint c)
231
+ {
232
+ uint map = MAP(c);
233
+ return (map & namechar)!=0;
234
+ }
235
+
236
+ Bool TY_(IsXMLLetter)(uint c)
237
+ {
238
+ return ((c >= 0x41 && c <= 0x5a) ||
239
+ (c >= 0x61 && c <= 0x7a) ||
240
+ (c >= 0xc0 && c <= 0xd6) ||
241
+ (c >= 0xd8 && c <= 0xf6) ||
242
+ (c >= 0xf8 && c <= 0xff) ||
243
+ (c >= 0x100 && c <= 0x131) ||
244
+ (c >= 0x134 && c <= 0x13e) ||
245
+ (c >= 0x141 && c <= 0x148) ||
246
+ (c >= 0x14a && c <= 0x17e) ||
247
+ (c >= 0x180 && c <= 0x1c3) ||
248
+ (c >= 0x1cd && c <= 0x1f0) ||
249
+ (c >= 0x1f4 && c <= 0x1f5) ||
250
+ (c >= 0x1fa && c <= 0x217) ||
251
+ (c >= 0x250 && c <= 0x2a8) ||
252
+ (c >= 0x2bb && c <= 0x2c1) ||
253
+ c == 0x386 ||
254
+ (c >= 0x388 && c <= 0x38a) ||
255
+ c == 0x38c ||
256
+ (c >= 0x38e && c <= 0x3a1) ||
257
+ (c >= 0x3a3 && c <= 0x3ce) ||
258
+ (c >= 0x3d0 && c <= 0x3d6) ||
259
+ c == 0x3da ||
260
+ c == 0x3dc ||
261
+ c == 0x3de ||
262
+ c == 0x3e0 ||
263
+ (c >= 0x3e2 && c <= 0x3f3) ||
264
+ (c >= 0x401 && c <= 0x40c) ||
265
+ (c >= 0x40e && c <= 0x44f) ||
266
+ (c >= 0x451 && c <= 0x45c) ||
267
+ (c >= 0x45e && c <= 0x481) ||
268
+ (c >= 0x490 && c <= 0x4c4) ||
269
+ (c >= 0x4c7 && c <= 0x4c8) ||
270
+ (c >= 0x4cb && c <= 0x4cc) ||
271
+ (c >= 0x4d0 && c <= 0x4eb) ||
272
+ (c >= 0x4ee && c <= 0x4f5) ||
273
+ (c >= 0x4f8 && c <= 0x4f9) ||
274
+ (c >= 0x531 && c <= 0x556) ||
275
+ c == 0x559 ||
276
+ (c >= 0x561 && c <= 0x586) ||
277
+ (c >= 0x5d0 && c <= 0x5ea) ||
278
+ (c >= 0x5f0 && c <= 0x5f2) ||
279
+ (c >= 0x621 && c <= 0x63a) ||
280
+ (c >= 0x641 && c <= 0x64a) ||
281
+ (c >= 0x671 && c <= 0x6b7) ||
282
+ (c >= 0x6ba && c <= 0x6be) ||
283
+ (c >= 0x6c0 && c <= 0x6ce) ||
284
+ (c >= 0x6d0 && c <= 0x6d3) ||
285
+ c == 0x6d5 ||
286
+ (c >= 0x6e5 && c <= 0x6e6) ||
287
+ (c >= 0x905 && c <= 0x939) ||
288
+ c == 0x93d ||
289
+ (c >= 0x958 && c <= 0x961) ||
290
+ (c >= 0x985 && c <= 0x98c) ||
291
+ (c >= 0x98f && c <= 0x990) ||
292
+ (c >= 0x993 && c <= 0x9a8) ||
293
+ (c >= 0x9aa && c <= 0x9b0) ||
294
+ c == 0x9b2 ||
295
+ (c >= 0x9b6 && c <= 0x9b9) ||
296
+ (c >= 0x9dc && c <= 0x9dd) ||
297
+ (c >= 0x9df && c <= 0x9e1) ||
298
+ (c >= 0x9f0 && c <= 0x9f1) ||
299
+ (c >= 0xa05 && c <= 0xa0a) ||
300
+ (c >= 0xa0f && c <= 0xa10) ||
301
+ (c >= 0xa13 && c <= 0xa28) ||
302
+ (c >= 0xa2a && c <= 0xa30) ||
303
+ (c >= 0xa32 && c <= 0xa33) ||
304
+ (c >= 0xa35 && c <= 0xa36) ||
305
+ (c >= 0xa38 && c <= 0xa39) ||
306
+ (c >= 0xa59 && c <= 0xa5c) ||
307
+ c == 0xa5e ||
308
+ (c >= 0xa72 && c <= 0xa74) ||
309
+ (c >= 0xa85 && c <= 0xa8b) ||
310
+ c == 0xa8d ||
311
+ (c >= 0xa8f && c <= 0xa91) ||
312
+ (c >= 0xa93 && c <= 0xaa8) ||
313
+ (c >= 0xaaa && c <= 0xab0) ||
314
+ (c >= 0xab2 && c <= 0xab3) ||
315
+ (c >= 0xab5 && c <= 0xab9) ||
316
+ c == 0xabd ||
317
+ c == 0xae0 ||
318
+ (c >= 0xb05 && c <= 0xb0c) ||
319
+ (c >= 0xb0f && c <= 0xb10) ||
320
+ (c >= 0xb13 && c <= 0xb28) ||
321
+ (c >= 0xb2a && c <= 0xb30) ||
322
+ (c >= 0xb32 && c <= 0xb33) ||
323
+ (c >= 0xb36 && c <= 0xb39) ||
324
+ c == 0xb3d ||
325
+ (c >= 0xb5c && c <= 0xb5d) ||
326
+ (c >= 0xb5f && c <= 0xb61) ||
327
+ (c >= 0xb85 && c <= 0xb8a) ||
328
+ (c >= 0xb8e && c <= 0xb90) ||
329
+ (c >= 0xb92 && c <= 0xb95) ||
330
+ (c >= 0xb99 && c <= 0xb9a) ||
331
+ c == 0xb9c ||
332
+ (c >= 0xb9e && c <= 0xb9f) ||
333
+ (c >= 0xba3 && c <= 0xba4) ||
334
+ (c >= 0xba8 && c <= 0xbaa) ||
335
+ (c >= 0xbae && c <= 0xbb5) ||
336
+ (c >= 0xbb7 && c <= 0xbb9) ||
337
+ (c >= 0xc05 && c <= 0xc0c) ||
338
+ (c >= 0xc0e && c <= 0xc10) ||
339
+ (c >= 0xc12 && c <= 0xc28) ||
340
+ (c >= 0xc2a && c <= 0xc33) ||
341
+ (c >= 0xc35 && c <= 0xc39) ||
342
+ (c >= 0xc60 && c <= 0xc61) ||
343
+ (c >= 0xc85 && c <= 0xc8c) ||
344
+ (c >= 0xc8e && c <= 0xc90) ||
345
+ (c >= 0xc92 && c <= 0xca8) ||
346
+ (c >= 0xcaa && c <= 0xcb3) ||
347
+ (c >= 0xcb5 && c <= 0xcb9) ||
348
+ c == 0xcde ||
349
+ (c >= 0xce0 && c <= 0xce1) ||
350
+ (c >= 0xd05 && c <= 0xd0c) ||
351
+ (c >= 0xd0e && c <= 0xd10) ||
352
+ (c >= 0xd12 && c <= 0xd28) ||
353
+ (c >= 0xd2a && c <= 0xd39) ||
354
+ (c >= 0xd60 && c <= 0xd61) ||
355
+ (c >= 0xe01 && c <= 0xe2e) ||
356
+ c == 0xe30 ||
357
+ (c >= 0xe32 && c <= 0xe33) ||
358
+ (c >= 0xe40 && c <= 0xe45) ||
359
+ (c >= 0xe81 && c <= 0xe82) ||
360
+ c == 0xe84 ||
361
+ (c >= 0xe87 && c <= 0xe88) ||
362
+ c == 0xe8a ||
363
+ c == 0xe8d ||
364
+ (c >= 0xe94 && c <= 0xe97) ||
365
+ (c >= 0xe99 && c <= 0xe9f) ||
366
+ (c >= 0xea1 && c <= 0xea3) ||
367
+ c == 0xea5 ||
368
+ c == 0xea7 ||
369
+ (c >= 0xeaa && c <= 0xeab) ||
370
+ (c >= 0xead && c <= 0xeae) ||
371
+ c == 0xeb0 ||
372
+ (c >= 0xeb2 && c <= 0xeb3) ||
373
+ c == 0xebd ||
374
+ (c >= 0xec0 && c <= 0xec4) ||
375
+ (c >= 0xf40 && c <= 0xf47) ||
376
+ (c >= 0xf49 && c <= 0xf69) ||
377
+ (c >= 0x10a0 && c <= 0x10c5) ||
378
+ (c >= 0x10d0 && c <= 0x10f6) ||
379
+ c == 0x1100 ||
380
+ (c >= 0x1102 && c <= 0x1103) ||
381
+ (c >= 0x1105 && c <= 0x1107) ||
382
+ c == 0x1109 ||
383
+ (c >= 0x110b && c <= 0x110c) ||
384
+ (c >= 0x110e && c <= 0x1112) ||
385
+ c == 0x113c ||
386
+ c == 0x113e ||
387
+ c == 0x1140 ||
388
+ c == 0x114c ||
389
+ c == 0x114e ||
390
+ c == 0x1150 ||
391
+ (c >= 0x1154 && c <= 0x1155) ||
392
+ c == 0x1159 ||
393
+ (c >= 0x115f && c <= 0x1161) ||
394
+ c == 0x1163 ||
395
+ c == 0x1165 ||
396
+ c == 0x1167 ||
397
+ c == 0x1169 ||
398
+ (c >= 0x116d && c <= 0x116e) ||
399
+ (c >= 0x1172 && c <= 0x1173) ||
400
+ c == 0x1175 ||
401
+ c == 0x119e ||
402
+ c == 0x11a8 ||
403
+ c == 0x11ab ||
404
+ (c >= 0x11ae && c <= 0x11af) ||
405
+ (c >= 0x11b7 && c <= 0x11b8) ||
406
+ c == 0x11ba ||
407
+ (c >= 0x11bc && c <= 0x11c2) ||
408
+ c == 0x11eb ||
409
+ c == 0x11f0 ||
410
+ c == 0x11f9 ||
411
+ (c >= 0x1e00 && c <= 0x1e9b) ||
412
+ (c >= 0x1ea0 && c <= 0x1ef9) ||
413
+ (c >= 0x1f00 && c <= 0x1f15) ||
414
+ (c >= 0x1f18 && c <= 0x1f1d) ||
415
+ (c >= 0x1f20 && c <= 0x1f45) ||
416
+ (c >= 0x1f48 && c <= 0x1f4d) ||
417
+ (c >= 0x1f50 && c <= 0x1f57) ||
418
+ c == 0x1f59 ||
419
+ c == 0x1f5b ||
420
+ c == 0x1f5d ||
421
+ (c >= 0x1f5f && c <= 0x1f7d) ||
422
+ (c >= 0x1f80 && c <= 0x1fb4) ||
423
+ (c >= 0x1fb6 && c <= 0x1fbc) ||
424
+ c == 0x1fbe ||
425
+ (c >= 0x1fc2 && c <= 0x1fc4) ||
426
+ (c >= 0x1fc6 && c <= 0x1fcc) ||
427
+ (c >= 0x1fd0 && c <= 0x1fd3) ||
428
+ (c >= 0x1fd6 && c <= 0x1fdb) ||
429
+ (c >= 0x1fe0 && c <= 0x1fec) ||
430
+ (c >= 0x1ff2 && c <= 0x1ff4) ||
431
+ (c >= 0x1ff6 && c <= 0x1ffc) ||
432
+ c == 0x2126 ||
433
+ (c >= 0x212a && c <= 0x212b) ||
434
+ c == 0x212e ||
435
+ (c >= 0x2180 && c <= 0x2182) ||
436
+ (c >= 0x3041 && c <= 0x3094) ||
437
+ (c >= 0x30a1 && c <= 0x30fa) ||
438
+ (c >= 0x3105 && c <= 0x312c) ||
439
+ (c >= 0xac00 && c <= 0xd7a3) ||
440
+ (c >= 0x4e00 && c <= 0x9fa5) ||
441
+ c == 0x3007 ||
442
+ (c >= 0x3021 && c <= 0x3029) ||
443
+ (c >= 0x4e00 && c <= 0x9fa5) ||
444
+ c == 0x3007 ||
445
+ (c >= 0x3021 && c <= 0x3029));
446
+ }
447
+
448
+ Bool TY_(IsXMLNamechar)(uint c)
449
+ {
450
+ return (TY_(IsXMLLetter)(c) ||
451
+ c == '.' || c == '_' ||
452
+ c == ':' || c == '-' ||
453
+ (c >= 0x300 && c <= 0x345) ||
454
+ (c >= 0x360 && c <= 0x361) ||
455
+ (c >= 0x483 && c <= 0x486) ||
456
+ (c >= 0x591 && c <= 0x5a1) ||
457
+ (c >= 0x5a3 && c <= 0x5b9) ||
458
+ (c >= 0x5bb && c <= 0x5bd) ||
459
+ c == 0x5bf ||
460
+ (c >= 0x5c1 && c <= 0x5c2) ||
461
+ c == 0x5c4 ||
462
+ (c >= 0x64b && c <= 0x652) ||
463
+ c == 0x670 ||
464
+ (c >= 0x6d6 && c <= 0x6dc) ||
465
+ (c >= 0x6dd && c <= 0x6df) ||
466
+ (c >= 0x6e0 && c <= 0x6e4) ||
467
+ (c >= 0x6e7 && c <= 0x6e8) ||
468
+ (c >= 0x6ea && c <= 0x6ed) ||
469
+ (c >= 0x901 && c <= 0x903) ||
470
+ c == 0x93c ||
471
+ (c >= 0x93e && c <= 0x94c) ||
472
+ c == 0x94d ||
473
+ (c >= 0x951 && c <= 0x954) ||
474
+ (c >= 0x962 && c <= 0x963) ||
475
+ (c >= 0x981 && c <= 0x983) ||
476
+ c == 0x9bc ||
477
+ c == 0x9be ||
478
+ c == 0x9bf ||
479
+ (c >= 0x9c0 && c <= 0x9c4) ||
480
+ (c >= 0x9c7 && c <= 0x9c8) ||
481
+ (c >= 0x9cb && c <= 0x9cd) ||
482
+ c == 0x9d7 ||
483
+ (c >= 0x9e2 && c <= 0x9e3) ||
484
+ c == 0xa02 ||
485
+ c == 0xa3c ||
486
+ c == 0xa3e ||
487
+ c == 0xa3f ||
488
+ (c >= 0xa40 && c <= 0xa42) ||
489
+ (c >= 0xa47 && c <= 0xa48) ||
490
+ (c >= 0xa4b && c <= 0xa4d) ||
491
+ (c >= 0xa70 && c <= 0xa71) ||
492
+ (c >= 0xa81 && c <= 0xa83) ||
493
+ c == 0xabc ||
494
+ (c >= 0xabe && c <= 0xac5) ||
495
+ (c >= 0xac7 && c <= 0xac9) ||
496
+ (c >= 0xacb && c <= 0xacd) ||
497
+ (c >= 0xb01 && c <= 0xb03) ||
498
+ c == 0xb3c ||
499
+ (c >= 0xb3e && c <= 0xb43) ||
500
+ (c >= 0xb47 && c <= 0xb48) ||
501
+ (c >= 0xb4b && c <= 0xb4d) ||
502
+ (c >= 0xb56 && c <= 0xb57) ||
503
+ (c >= 0xb82 && c <= 0xb83) ||
504
+ (c >= 0xbbe && c <= 0xbc2) ||
505
+ (c >= 0xbc6 && c <= 0xbc8) ||
506
+ (c >= 0xbca && c <= 0xbcd) ||
507
+ c == 0xbd7 ||
508
+ (c >= 0xc01 && c <= 0xc03) ||
509
+ (c >= 0xc3e && c <= 0xc44) ||
510
+ (c >= 0xc46 && c <= 0xc48) ||
511
+ (c >= 0xc4a && c <= 0xc4d) ||
512
+ (c >= 0xc55 && c <= 0xc56) ||
513
+ (c >= 0xc82 && c <= 0xc83) ||
514
+ (c >= 0xcbe && c <= 0xcc4) ||
515
+ (c >= 0xcc6 && c <= 0xcc8) ||
516
+ (c >= 0xcca && c <= 0xccd) ||
517
+ (c >= 0xcd5 && c <= 0xcd6) ||
518
+ (c >= 0xd02 && c <= 0xd03) ||
519
+ (c >= 0xd3e && c <= 0xd43) ||
520
+ (c >= 0xd46 && c <= 0xd48) ||
521
+ (c >= 0xd4a && c <= 0xd4d) ||
522
+ c == 0xd57 ||
523
+ c == 0xe31 ||
524
+ (c >= 0xe34 && c <= 0xe3a) ||
525
+ (c >= 0xe47 && c <= 0xe4e) ||
526
+ c == 0xeb1 ||
527
+ (c >= 0xeb4 && c <= 0xeb9) ||
528
+ (c >= 0xebb && c <= 0xebc) ||
529
+ (c >= 0xec8 && c <= 0xecd) ||
530
+ (c >= 0xf18 && c <= 0xf19) ||
531
+ c == 0xf35 ||
532
+ c == 0xf37 ||
533
+ c == 0xf39 ||
534
+ c == 0xf3e ||
535
+ c == 0xf3f ||
536
+ (c >= 0xf71 && c <= 0xf84) ||
537
+ (c >= 0xf86 && c <= 0xf8b) ||
538
+ (c >= 0xf90 && c <= 0xf95) ||
539
+ c == 0xf97 ||
540
+ (c >= 0xf99 && c <= 0xfad) ||
541
+ (c >= 0xfb1 && c <= 0xfb7) ||
542
+ c == 0xfb9 ||
543
+ (c >= 0x20d0 && c <= 0x20dc) ||
544
+ c == 0x20e1 ||
545
+ (c >= 0x302a && c <= 0x302f) ||
546
+ c == 0x3099 ||
547
+ c == 0x309a ||
548
+ (c >= 0x30 && c <= 0x39) ||
549
+ (c >= 0x660 && c <= 0x669) ||
550
+ (c >= 0x6f0 && c <= 0x6f9) ||
551
+ (c >= 0x966 && c <= 0x96f) ||
552
+ (c >= 0x9e6 && c <= 0x9ef) ||
553
+ (c >= 0xa66 && c <= 0xa6f) ||
554
+ (c >= 0xae6 && c <= 0xaef) ||
555
+ (c >= 0xb66 && c <= 0xb6f) ||
556
+ (c >= 0xbe7 && c <= 0xbef) ||
557
+ (c >= 0xc66 && c <= 0xc6f) ||
558
+ (c >= 0xce6 && c <= 0xcef) ||
559
+ (c >= 0xd66 && c <= 0xd6f) ||
560
+ (c >= 0xe50 && c <= 0xe59) ||
561
+ (c >= 0xed0 && c <= 0xed9) ||
562
+ (c >= 0xf20 && c <= 0xf29) ||
563
+ c == 0xb7 ||
564
+ c == 0x2d0 ||
565
+ c == 0x2d1 ||
566
+ c == 0x387 ||
567
+ c == 0x640 ||
568
+ c == 0xe46 ||
569
+ c == 0xec6 ||
570
+ c == 0x3005 ||
571
+ (c >= 0x3031 && c <= 0x3035) ||
572
+ (c >= 0x309d && c <= 0x309e) ||
573
+ (c >= 0x30fc && c <= 0x30fe));
574
+ }
575
+
576
+ #if 0
577
+ Bool IsLower(uint c)
578
+ {
579
+ uint map = MAP(c);
580
+
581
+ return (map & lowercase)!=0;
582
+ }
583
+ #endif
584
+
585
+ Bool TY_(IsUpper)(uint c)
586
+ {
587
+ uint map = MAP(c);
588
+
589
+ return (map & uppercase)!=0;
590
+ }
591
+
592
+ uint TY_(ToLower)(uint c)
593
+ {
594
+ uint map = MAP(c);
595
+
596
+ if (map & uppercase)
597
+ c += 'a' - 'A';
598
+
599
+ return c;
600
+ }
601
+
602
+ uint TY_(ToUpper)(uint c)
603
+ {
604
+ uint map = MAP(c);
605
+
606
+ if (map & lowercase)
607
+ c += (uint) ('A' - 'a' );
608
+
609
+ return c;
610
+ }
611
+
612
+ #if 0
613
+ char FoldCase( TidyDocImpl* doc, tmbchar c, Bool tocaps )
614
+ {
615
+ if ( !cfgBool(doc, TidyXmlTags) )
616
+ {
617
+ if ( tocaps )
618
+ {
619
+ c = (tmbchar) ToUpper(c);
620
+ }
621
+ else /* force to lower case */
622
+ {
623
+ c = (tmbchar) ToLower(c);
624
+ }
625
+ }
626
+ return c;
627
+ }
628
+ #endif
629
+
630
+ /*
631
+ return last character in string
632
+ this is useful when trailing quotemark
633
+ is missing on an attribute
634
+ */
635
+ static tmbchar LastChar( tmbstr str )
636
+ {
637
+ if ( str && *str )
638
+ {
639
+ int n = TY_(tmbstrlen)(str);
640
+ return str[n-1];
641
+ }
642
+ return 0;
643
+ }
644
+
645
+ /*
646
+ node->type is one of these:
647
+
648
+ #define TextNode 1
649
+ #define StartTag 2
650
+ #define EndTag 3
651
+ #define StartEndTag 4
652
+ */
653
+
654
+ Lexer* TY_(NewLexer)( TidyDocImpl* doc )
655
+ {
656
+ Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
657
+
658
+ if ( lexer != NULL )
659
+ {
660
+ TidyClearMemory( lexer, sizeof(Lexer) );
661
+
662
+ lexer->allocator = doc->allocator;
663
+ lexer->lines = 1;
664
+ lexer->columns = 1;
665
+ lexer->state = LEX_CONTENT;
666
+
667
+ lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
668
+ lexer->doctype = VERS_UNKNOWN;
669
+ lexer->root = &doc->root;
670
+ }
671
+ return lexer;
672
+ }
673
+
674
+ static Bool EndOfInput( TidyDocImpl* doc )
675
+ {
676
+ assert( doc->docIn != NULL );
677
+ return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) );
678
+ }
679
+
680
+ void TY_(FreeLexer)( TidyDocImpl* doc )
681
+ {
682
+ Lexer *lexer = doc->lexer;
683
+ if ( lexer )
684
+ {
685
+ TY_(FreeStyles)( doc );
686
+
687
+ /* See GetToken() */
688
+ if ( lexer->pushed || lexer->itoken )
689
+ {
690
+ if (lexer->pushed)
691
+ TY_(FreeNode)( doc, lexer->itoken );
692
+ TY_(FreeNode)( doc, lexer->token );
693
+ }
694
+
695
+ while ( lexer->istacksize > 0 )
696
+ TY_(PopInline)( doc, NULL );
697
+
698
+ TidyDocFree( doc, lexer->istack );
699
+ TidyDocFree( doc, lexer->lexbuf );
700
+ TidyDocFree( doc, lexer );
701
+ doc->lexer = NULL;
702
+ }
703
+ }
704
+
705
+ /* Lexer uses bigger memory chunks than pprint as
706
+ ** it must hold the entire input document, not just
707
+ ** the last line or three.
708
+ */
709
+ static void AddByte( Lexer *lexer, tmbchar ch )
710
+ {
711
+ if ( lexer->lexsize + 2 >= lexer->lexlength )
712
+ {
713
+ tmbstr buf = NULL;
714
+ uint allocAmt = lexer->lexlength;
715
+ while ( lexer->lexsize + 2 >= allocAmt )
716
+ {
717
+ if ( allocAmt == 0 )
718
+ allocAmt = 8192;
719
+ else
720
+ allocAmt *= 2;
721
+ }
722
+ buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
723
+ if ( buf )
724
+ {
725
+ TidyClearMemory( buf + lexer->lexlength,
726
+ allocAmt - lexer->lexlength );
727
+ lexer->lexbuf = buf;
728
+ lexer->lexlength = allocAmt;
729
+ }
730
+ }
731
+
732
+ lexer->lexbuf[ lexer->lexsize++ ] = ch;
733
+ lexer->lexbuf[ lexer->lexsize ] = '\0'; /* debug */
734
+ }
735
+
736
+ static void ChangeChar( Lexer *lexer, tmbchar c )
737
+ {
738
+ if ( lexer->lexsize > 0 )
739
+ {
740
+ lexer->lexbuf[ lexer->lexsize-1 ] = c;
741
+ }
742
+ }
743
+
744
+ /* store character c as UTF-8 encoded byte stream */
745
+ void TY_(AddCharToLexer)( Lexer *lexer, uint c )
746
+ {
747
+ int i, err, count = 0;
748
+ tmbchar buf[10] = {0};
749
+
750
+ err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
751
+ if (err)
752
+ {
753
+ #if 0 && defined(_DEBUG)
754
+ fprintf( stderr, "lexer UTF-8 encoding error for U+%x : ", c );
755
+ #endif
756
+ /* replacement character 0xFFFD encoded as UTF-8 */
757
+ buf[0] = (byte) 0xEF;
758
+ buf[1] = (byte) 0xBF;
759
+ buf[2] = (byte) 0xBD;
760
+ count = 3;
761
+ }
762
+
763
+ for ( i = 0; i < count; ++i )
764
+ AddByte( lexer, buf[i] );
765
+ }
766
+
767
+ static void AddStringToLexer( Lexer *lexer, ctmbstr str )
768
+ {
769
+ uint c;
770
+
771
+ /* Many (all?) compilers will sign-extend signed chars (the default) when
772
+ ** converting them to unsigned integer values. We must cast our char to
773
+ ** unsigned char before assigning it to prevent this from happening.
774
+ */
775
+ while( 0 != (c = (unsigned char) *str++ ))
776
+ TY_(AddCharToLexer)( lexer, c );
777
+ }
778
+
779
+
780
+ static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
781
+ {
782
+ lexer->lines = doc->docIn->curline;
783
+ lexer->columns = doc->docIn->curcol;
784
+ }
785
+
786
+ /*
787
+ No longer attempts to insert missing ';' for unknown
788
+ entities unless one was present already, since this
789
+ gives unexpected results.
790
+
791
+ For example: <a href="something.htm?foo&bar&fred">
792
+ was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
793
+ rather than: <a href="something.htm?foo&amp;bar&amp;fred">
794
+
795
+ My thanks to Maurice Buxton for spotting this.
796
+
797
+ Also Randy Waki pointed out the following case for the
798
+ 04 Aug 00 version (bug #433012):
799
+
800
+ For example: <a href="something.htm?id=1&lang=en">
801
+ was tidied to: <a href="something.htm?id=1&lang;=en">
802
+ rather than: <a href="something.htm?id=1&amp;lang=en">
803
+
804
+ where "lang" is a known entity (#9001), but browsers would
805
+ misinterpret "&lang;" because it had a value > 256.
806
+
807
+ So the case of an apparently known entity with a value > 256 and
808
+ missing a semicolon is handled specially.
809
+
810
+ "ParseEntity" is also a bit of a misnomer - it handles entities and
811
+ numeric character references. Invalid NCR's are now reported.
812
+ */
813
+ static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
814
+ {
815
+ typedef enum
816
+ {
817
+ ENT_default,
818
+ ENT_numdec,
819
+ ENT_numhex
820
+ } ENTState;
821
+
822
+ typedef Bool (*ENTfn)(uint);
823
+ const ENTfn entFn[] = {
824
+ TY_(IsNamechar),
825
+ TY_(IsDigit),
826
+ IsDigitHex
827
+ };
828
+ uint start;
829
+ ENTState entState = ENT_default;
830
+ uint charRead = 0;
831
+ Bool semicolon = no, found = no;
832
+ Bool isXml = cfgBool( doc, TidyXmlTags );
833
+ Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
834
+ uint c, ch, startcol, entver = 0;
835
+ Lexer* lexer = doc->lexer;
836
+
837
+ start = lexer->lexsize - 1; /* to start at "&" */
838
+ startcol = doc->docIn->curcol - 1;
839
+
840
+ while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
841
+ {
842
+ if ( c == ';' )
843
+ {
844
+ semicolon = yes;
845
+ break;
846
+ }
847
+ ++charRead;
848
+
849
+ if (charRead == 1 && c == '#')
850
+ {
851
+ #if SUPPORT_ASIAN_ENCODINGS
852
+ if ( !cfgBool(doc, TidyNCR) ||
853
+ cfg(doc, TidyInCharEncoding) == BIG5 ||
854
+ cfg(doc, TidyInCharEncoding) == SHIFTJIS )
855
+ {
856
+ TY_(UngetChar)('#', doc->docIn);
857
+ return;
858
+ }
859
+ #endif
860
+ TY_(AddCharToLexer)( lexer, c );
861
+ entState = ENT_numdec;
862
+ continue;
863
+ }
864
+ else if (charRead == 2 && entState == ENT_numdec
865
+ && (c == 'x' || (!isXml && c == 'X')) )
866
+ {
867
+ TY_(AddCharToLexer)( lexer, c );
868
+ entState = ENT_numhex;
869
+ continue;
870
+ }
871
+
872
+ if ( entFn[entState](c) )
873
+ {
874
+ TY_(AddCharToLexer)( lexer, c );
875
+ continue;
876
+ }
877
+
878
+ /* otherwise put it back */
879
+ TY_(UngetChar)( c, doc->docIn );
880
+ break;
881
+ }
882
+
883
+ /* make sure entity is NULL terminated */
884
+ lexer->lexbuf[lexer->lexsize] = '\0';
885
+
886
+ /* Should contrain version to XML/XHTML if &apos;
887
+ ** is encountered. But this is not possible with
888
+ ** Tidy's content model bit mask.
889
+ */
890
+ if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0
891
+ && !cfgBool(doc, TidyXmlOut)
892
+ && !lexer->isvoyager
893
+ && !cfgBool(doc, TidyXhtmlOut) )
894
+ TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
895
+
896
+ /* Lookup entity code and version
897
+ */
898
+ found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
899
+
900
+ /* deal with unrecognized or invalid entities */
901
+ /* #433012 - fix by Randy Waki 17 Feb 01 */
902
+ /* report invalid NCR's - Terry Teague 01 Sep 01 */
903
+ if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
904
+ {
905
+ /* set error position just before offending character */
906
+ SetLexerLocus( doc, lexer );
907
+ lexer->columns = startcol;
908
+
909
+ if (lexer->lexsize > start + 1)
910
+ {
911
+ if (ch >= 128 && ch <= 159)
912
+ {
913
+ /* invalid numeric character reference */
914
+
915
+ uint c1 = 0;
916
+ int replaceMode = DISCARDED_CHAR;
917
+
918
+ if ( TY_(ReplacementCharEncoding) == WIN1252 )
919
+ c1 = TY_(DecodeWin1252)( ch );
920
+ else if ( TY_(ReplacementCharEncoding) == MACROMAN )
921
+ c1 = TY_(DecodeMacRoman)( ch );
922
+
923
+ if ( c1 )
924
+ replaceMode = REPLACED_CHAR;
925
+
926
+ if ( c != ';' ) /* issue warning if not terminated by ';' */
927
+ TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
928
+ lexer->lexbuf+start, c );
929
+
930
+ TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
931
+
932
+ if ( c1 )
933
+ {
934
+ /* make the replacement */
935
+ lexer->lexsize = start;
936
+ TY_(AddCharToLexer)( lexer, c1 );
937
+ semicolon = no;
938
+ }
939
+ else
940
+ {
941
+ /* discard */
942
+ lexer->lexsize = start;
943
+ semicolon = no;
944
+ }
945
+
946
+ }
947
+ else
948
+ TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
949
+ lexer->lexbuf+start, ch );
950
+
951
+ if (semicolon)
952
+ TY_(AddCharToLexer)( lexer, ';' );
953
+ }
954
+ else /* naked & */
955
+ TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
956
+ lexer->lexbuf+start, ch );
957
+ }
958
+ else
959
+ {
960
+ if ( c != ';' ) /* issue warning if not terminated by ';' */
961
+ {
962
+ /* set error position just before offending chararcter */
963
+ SetLexerLocus( doc, lexer );
964
+ lexer->columns = startcol;
965
+ TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
966
+ }
967
+
968
+ if (preserveEntities)
969
+ TY_(AddCharToLexer)( lexer, ';' );
970
+ else
971
+ {
972
+ lexer->lexsize = start;
973
+ if ( ch == 160 && (mode == Preformatted) )
974
+ ch = ' ';
975
+ TY_(AddCharToLexer)( lexer, ch );
976
+
977
+ if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
978
+ AddStringToLexer( lexer, "amp;" );
979
+ }
980
+
981
+ /* Detect extended vs. basic entities */
982
+ TY_(ConstrainVersion)( doc, entver );
983
+ }
984
+ }
985
+
986
+ static tmbchar ParseTagName( TidyDocImpl* doc )
987
+ {
988
+ Lexer *lexer = doc->lexer;
989
+ uint c = lexer->lexbuf[ lexer->txtstart ];
990
+ Bool xml = cfgBool(doc, TidyXmlTags);
991
+
992
+ /* fold case of first character in buffer */
993
+ if (!xml && TY_(IsUpper)(c))
994
+ lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c);
995
+
996
+ while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
997
+ {
998
+ if ((!xml && !TY_(IsNamechar)(c)) ||
999
+ (xml && !TY_(IsXMLNamechar)(c)))
1000
+ break;
1001
+
1002
+ /* fold case of subsequent characters */
1003
+ if (!xml && TY_(IsUpper)(c))
1004
+ c = TY_(ToLower)(c);
1005
+
1006
+ TY_(AddCharToLexer)(lexer, c);
1007
+ }
1008
+
1009
+ lexer->txtend = lexer->lexsize;
1010
+ return (tmbchar) c;
1011
+ }
1012
+
1013
+ /*
1014
+ Used for elements and text nodes
1015
+ element name is NULL for text nodes
1016
+ start and end are offsets into lexbuf
1017
+ which contains the textual content of
1018
+ all elements in the parse tree.
1019
+
1020
+ parent and content allow traversal
1021
+ of the parse tree in any direction.
1022
+ attributes are represented as a linked
1023
+ list of AttVal nodes which hold the
1024
+ strings for attribute/value pairs.
1025
+ */
1026
+
1027
+
1028
+ Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer)
1029
+ {
1030
+ Node* node = (Node*) TidyAlloc( allocator, sizeof(Node) );
1031
+ TidyClearMemory( node, sizeof(Node) );
1032
+ if ( lexer )
1033
+ {
1034
+ node->line = lexer->lines;
1035
+ node->column = lexer->columns;
1036
+ }
1037
+ node->type = TextNode;
1038
+ return node;
1039
+ }
1040
+
1041
+ /* used to clone heading nodes when split by an <HR> */
1042
+ Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element )
1043
+ {
1044
+ Lexer* lexer = doc->lexer;
1045
+ Node *node = TY_(NewNode)( lexer->allocator, lexer );
1046
+
1047
+ node->start = lexer->lexsize;
1048
+ node->end = lexer->lexsize;
1049
+
1050
+ if ( element )
1051
+ {
1052
+ node->parent = element->parent;
1053
+ node->type = element->type;
1054
+ node->closed = element->closed;
1055
+ node->implicit = element->implicit;
1056
+ node->tag = element->tag;
1057
+ node->element = TY_(tmbstrdup)( doc->allocator, element->element );
1058
+ node->attributes = TY_(DupAttrs)( doc, element->attributes );
1059
+ }
1060
+ return node;
1061
+ }
1062
+
1063
+ /* free node's attributes */
1064
+ void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
1065
+ {
1066
+ while ( node->attributes )
1067
+ {
1068
+ AttVal *av = node->attributes;
1069
+
1070
+ if ( av->attribute )
1071
+ {
1072
+ if ( (attrIsID(av) || attrIsNAME(av)) &&
1073
+ TY_(IsAnchorElement)(doc, node) )
1074
+ {
1075
+ TY_(RemoveAnchorByNode)( doc, node );
1076
+ }
1077
+ }
1078
+
1079
+ node->attributes = av->next;
1080
+ TY_(FreeAttribute)( doc, av );
1081
+ }
1082
+ }
1083
+
1084
+ /* doesn't repair attribute list linkage */
1085
+ void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av )
1086
+ {
1087
+ TY_(FreeNode)( doc, av->asp );
1088
+ TY_(FreeNode)( doc, av->php );
1089
+ TidyDocFree( doc, av->attribute );
1090
+ TidyDocFree( doc, av->value );
1091
+ TidyDocFree( doc, av );
1092
+ }
1093
+
1094
+ /* detach attribute from node
1095
+ */
1096
+ void TY_(DetachAttribute)( Node *node, AttVal *attr )
1097
+ {
1098
+ AttVal *av, *prev = NULL;
1099
+
1100
+ for ( av = node->attributes; av; av = av->next )
1101
+ {
1102
+ if ( av == attr )
1103
+ {
1104
+ if ( prev )
1105
+ prev->next = attr->next;
1106
+ else
1107
+ node->attributes = attr->next;
1108
+ break;
1109
+ }
1110
+ prev = av;
1111
+ }
1112
+ }
1113
+
1114
+ /* detach attribute from node then free it
1115
+ */
1116
+ void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr )
1117
+ {
1118
+ TY_(DetachAttribute)( node, attr );
1119
+ TY_(FreeAttribute)( doc, attr );
1120
+ }
1121
+
1122
+ /*
1123
+ Free document nodes by iterating through peers and recursing
1124
+ through children. Set next to NULL before calling TY_(FreeNode)()
1125
+ to avoid freeing peer nodes. Doesn't patch up prev/next links.
1126
+ */
1127
+ void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
1128
+ {
1129
+ while ( node )
1130
+ {
1131
+ Node* next = node->next;
1132
+
1133
+ TY_(FreeAttrs)( doc, node );
1134
+ TY_(FreeNode)( doc, node->content );
1135
+ TidyDocFree( doc, node->element );
1136
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
1137
+ if (node->otext)
1138
+ TidyDocFree(doc, node->otext);
1139
+ #endif
1140
+ if (RootNode != node->type)
1141
+ TidyDocFree( doc, node );
1142
+ else
1143
+ node->content = NULL;
1144
+
1145
+ node = next;
1146
+ }
1147
+ }
1148
+
1149
#ifdef TIDY_STORE_ORIGINAL_TEXT
/* Move the original input text collected in doc->docIn->otextbuf onto
** node->otext, keeping back the last `count` characters for the next
** token. With count == 0 the whole buffer is handed to the node.
** Does nothing when text storing is off, when count is not smaller
** than the collected length, or when the collecting buffer is empty.
*/
void StoreOriginalTextInToken(TidyDocImpl* doc, Node* node, uint count)
{
    if (!doc->storeText
        || count >= doc->docIn->otextlen
        || !doc->docIn->otextsize)
        return;

    if (count == 0)
    {
        /* nothing to hold back: the node takes over the buffer */
        node->otext = doc->docIn->otextbuf;
        doc->docIn->otextbuf = NULL;
        doc->docIn->otextlen = 0;
        doc->docIn->otextsize = 0;
    }
    else
    {
        uint total = doc->docIn->otextlen;
        uint keep = total - count;
        tmbstr head = (tmbstr)TidyDocAlloc(doc, keep + 1);
        tmbstr tail = (tmbstr)TidyDocAlloc(doc, count + 1);
        uint k;

        /* head: the part that belongs to this token */
        for (k = 0; k < keep; ++k)
            head[k] = doc->docIn->otextbuf[k];
        head[keep] = 0;

        /* tail: the last `count` characters, kept for the next token */
        for (k = 0; k < count; ++k)
            tail[k] = doc->docIn->otextbuf[keep + k];
        tail[count] = 0;

        TidyDocFree(doc, doc->docIn->otextbuf);
        node->otext = head;
        doc->docIn->otextbuf = tail;
        doc->docIn->otextlen = count;
        doc->docIn->otextsize = count + 1;
    }
}
#endif
1195
+
1196
+ Node* TY_(TextToken)( Lexer *lexer )
1197
+ {
1198
+ Node *node = TY_(NewNode)( lexer->allocator, lexer );
1199
+ node->start = lexer->txtstart;
1200
+ node->end = lexer->txtend;
1201
+ return node;
1202
+ }
1203
+
1204
+ /* used for creating preformatted text from Word2000 */
1205
+ Node *TY_(NewLineNode)( Lexer *lexer )
1206
+ {
1207
+ Node *node = TY_(NewNode)( lexer->allocator, lexer );
1208
+ node->start = lexer->lexsize;
1209
+ TY_(AddCharToLexer)( lexer, (uint)'\n' );
1210
+ node->end = lexer->lexsize;
1211
+ return node;
1212
+ }
1213
+
1214
+ /* used for adding a &nbsp; for Word2000 */
1215
+ Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt )
1216
+ {
1217
+ Node *node = TY_(NewNode)( lexer->allocator, lexer );
1218
+ node->start = lexer->lexsize;
1219
+ AddStringToLexer( lexer, txt );
1220
+ node->end = lexer->lexsize;
1221
+ return node;
1222
+ }
1223
+
1224
+ static Node* TagToken( TidyDocImpl* doc, NodeType type )
1225
+ {
1226
+ Lexer* lexer = doc->lexer;
1227
+ Node* node = TY_(NewNode)( lexer->allocator, lexer );
1228
+ node->type = type;
1229
+ node->element = TY_(tmbstrndup)( doc->allocator,
1230
+ lexer->lexbuf + lexer->txtstart,
1231
+ lexer->txtend - lexer->txtstart );
1232
+ node->start = lexer->txtstart;
1233
+ node->end = lexer->txtstart;
1234
+
1235
+ if ( type == StartTag || type == StartEndTag || type == EndTag )
1236
+ TY_(FindTag)(doc, node);
1237
+
1238
+ return node;
1239
+ }
1240
+
1241
+ static Node* NewToken(TidyDocImpl* doc, NodeType type)
1242
+ {
1243
+ Lexer* lexer = doc->lexer;
1244
+ Node* node = TY_(NewNode)(lexer->allocator, lexer);
1245
+ node->type = type;
1246
+ node->start = lexer->txtstart;
1247
+ node->end = lexer->txtend;
1248
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
1249
+ StoreOriginalTextInToken(doc, node, 0);
1250
+ #endif
1251
+ return node;
1252
+ }
1253
+
1254
/* Shorthands for NewToken(), one per non-element node type */
#define CommentToken(doc)  NewToken(doc, CommentTag)
#define DocTypeToken(doc)  NewToken(doc, DocTypeTag)
#define PIToken(doc)       NewToken(doc, ProcInsTag)
#define AspToken(doc)      NewToken(doc, AspTag)
#define JsteToken(doc)     NewToken(doc, JsteTag)
#define PhpToken(doc)      NewToken(doc, PhpTag)
#define XmlDeclToken(doc)  NewToken(doc, XmlDecl)
#define SectionToken(doc)  NewToken(doc, SectionTag)
#define CDATAToken(doc)    NewToken(doc, CDATATag)
1263
+
1264
+ void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str )
1265
+ {
1266
+ byte c;
1267
+ while(0 != (c = *str++) )
1268
+ TY_(AddCharToLexer)( lexer, c );
1269
+ }
1270
+
1271
+ /*
1272
+ void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1273
+ {
1274
+ byte c;
1275
+ int ix;
1276
+
1277
+ for ( ix=0; ix < len && (c = *str++); ++ix )
1278
+ TY_(AddCharToLexer)(lexer, c);
1279
+ }
1280
+ */
1281
+
1282
+ /* find doctype element */
1283
+ Node *TY_(FindDocType)( TidyDocImpl* doc )
1284
+ {
1285
+ Node* node;
1286
+ for ( node = (doc ? doc->root.content : NULL);
1287
+ node && node->type != DocTypeTag;
1288
+ node = node->next )
1289
+ /**/;
1290
+ return node;
1291
+ }
1292
+
1293
+ /* find parent container element */
1294
+ Node* TY_(FindContainer)( Node* node )
1295
+ {
1296
+ for ( node = (node ? node->parent : NULL);
1297
+ node && TY_(nodeHasCM)(node, CM_INLINE);
1298
+ node = node->parent )
1299
+ /**/;
1300
+
1301
+ return node;
1302
+ }
1303
+
1304
+
1305
+ /* find html element */
1306
+ Node *TY_(FindHTML)( TidyDocImpl* doc )
1307
+ {
1308
+ Node *node;
1309
+ for ( node = (doc ? doc->root.content : NULL);
1310
+ node && !nodeIsHTML(node);
1311
+ node = node->next )
1312
+ /**/;
1313
+
1314
+ return node;
1315
+ }
1316
+
1317
+ /* find XML Declaration */
1318
+ Node *TY_(FindXmlDecl)(TidyDocImpl* doc)
1319
+ {
1320
+ Node *node;
1321
+ for ( node = (doc ? doc->root.content : NULL);
1322
+ node && !(node->type == XmlDecl);
1323
+ node = node->next )
1324
+ /**/;
1325
+
1326
+ return node;
1327
+ }
1328
+
1329
+
1330
+ Node *TY_(FindHEAD)( TidyDocImpl* doc )
1331
+ {
1332
+ Node *node = TY_(FindHTML)( doc );
1333
+
1334
+ if ( node )
1335
+ {
1336
+ for ( node = node->content;
1337
+ node && !nodeIsHEAD(node);
1338
+ node = node->next )
1339
+ /**/;
1340
+ }
1341
+
1342
+ return node;
1343
+ }
1344
+
1345
+ Node *TY_(FindTITLE)(TidyDocImpl* doc)
1346
+ {
1347
+ Node *node = TY_(FindHEAD)(doc);
1348
+
1349
+ if (node)
1350
+ for (node = node->content;
1351
+ node && !nodeIsTITLE(node);
1352
+ node = node->next) {}
1353
+
1354
+ return node;
1355
+ }
1356
+
1357
+ Node *TY_(FindBody)( TidyDocImpl* doc )
1358
+ {
1359
+ Node *node = ( doc ? doc->root.content : NULL );
1360
+
1361
+ while ( node && !nodeIsHTML(node) )
1362
+ node = node->next;
1363
+
1364
+ if (node == NULL)
1365
+ return NULL;
1366
+
1367
+ node = node->content;
1368
+ while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
1369
+ node = node->next;
1370
+
1371
+ if ( node && nodeIsFRAMESET(node) )
1372
+ {
1373
+ node = node->content;
1374
+ while ( node && !nodeIsNOFRAMES(node) )
1375
+ node = node->next;
1376
+
1377
+ if ( node )
1378
+ {
1379
+ node = node->content;
1380
+ while ( node && !nodeIsBODY(node) )
1381
+ node = node->next;
1382
+ }
1383
+ }
1384
+
1385
+ return node;
1386
+ }
1387
+
1388
+ /* add meta element for Tidy */
1389
+ Bool TY_(AddGenerator)( TidyDocImpl* doc )
1390
+ {
1391
+ AttVal *attval;
1392
+ Node *node;
1393
+ Node *head = TY_(FindHEAD)( doc );
1394
+ tmbchar buf[256];
1395
+
1396
+ if (head)
1397
+ {
1398
+ #ifdef PLATFORM_NAME
1399
+ TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for "PLATFORM_NAME" (vers %s), see www.w3.org",
1400
+ tidyReleaseDate());
1401
+ #else
1402
+ TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy (vers %s), see www.w3.org", tidyReleaseDate());
1403
+ #endif
1404
+
1405
+ for ( node = head->content; node; node = node->next )
1406
+ {
1407
+ if ( nodeIsMETA(node) )
1408
+ {
1409
+ attval = TY_(AttrGetById)(node, TidyAttr_NAME);
1410
+
1411
+ if (AttrValueIs(attval, "generator"))
1412
+ {
1413
+ attval = TY_(AttrGetById)(node, TidyAttr_CONTENT);
1414
+
1415
+ if (AttrHasValue(attval) &&
1416
+ TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0)
1417
+ {
1418
+ /* update the existing content to reflect the */
1419
+ /* actual version of Tidy currently being used */
1420
+
1421
+ TidyDocFree(doc, attval->value);
1422
+ attval->value = TY_(tmbstrdup)(doc->allocator, buf);
1423
+ return no;
1424
+ }
1425
+ }
1426
+ }
1427
+ }
1428
+
1429
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1430
+ {
1431
+ node = TY_(InferredTag)(doc, TidyTag_META);
1432
+ TY_(AddAttribute)( doc, node, "name", "generator" );
1433
+ TY_(AddAttribute)( doc, node, "content", buf );
1434
+ TY_(InsertNodeAtStart)( head, node );
1435
+ return yes;
1436
+ }
1437
+ }
1438
+
1439
+ return no;
1440
+ }
1441
+
1442
+ /* examine <!DOCTYPE> to identify version */
1443
+ static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
1444
+ {
1445
+ AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC");
1446
+ uint vers;
1447
+
1448
+ if (!fpi || !fpi->value)
1449
+ return VERS_UNKNOWN;
1450
+
1451
+ vers = GetVersFromFPI(fpi->value);
1452
+
1453
+ if (VERS_XHTML & vers)
1454
+ {
1455
+ TY_(SetOptionBool)(doc, TidyXmlOut, yes);
1456
+ TY_(SetOptionBool)(doc, TidyXhtmlOut, yes);
1457
+ doc->lexer->isvoyager = yes;
1458
+ }
1459
+
1460
+ /* todo: add a warning if case does not match? */
1461
+ TidyDocFree(doc, fpi->value);
1462
+ fpi->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(vers));
1463
+
1464
+ return vers;
1465
+ }
1466
+
1467
+ /* return guessed version */
1468
+ uint TY_(ApparentVersion)( TidyDocImpl* doc )
1469
+ {
1470
+ if ((doc->lexer->doctype == XH11 ||
1471
+ doc->lexer->doctype == XB10) &&
1472
+ (doc->lexer->versions & doc->lexer->doctype))
1473
+ return doc->lexer->doctype;
1474
+ else
1475
+ return TY_(HTMLVersion)(doc);
1476
+ }
1477
+
1478
+ ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) )
1479
+ {
1480
+ ctmbstr name = GetNameFromVers(vers);
1481
+
1482
+ /* this test has moved to ReportMarkupVersion() in localize.c, for localization reasons */
1483
+ /*
1484
+ if (!name)
1485
+ name = "HTML Proprietary";
1486
+ */
1487
+
1488
+ return name;
1489
+ }
1490
+
1491
+ Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc )
1492
+ {
1493
+ Bool isXhtml = doc->lexer->isvoyager;
1494
+ Node* doctype;
1495
+
1496
+ /* Do not warn in XHTML mode */
1497
+ if ( isXhtml )
1498
+ return no;
1499
+
1500
+ /* Do not warn if emitted doctype is proprietary */
1501
+ if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL )
1502
+ return no;
1503
+
1504
+ /* Do not warn if no SI is possible */
1505
+ if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL )
1506
+ return no;
1507
+
1508
+ if ( (doctype = TY_(FindDocType)( doc )) != NULL
1509
+ && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL )
1510
+ return yes;
1511
+
1512
+ return no;
1513
+ }
1514
+
1515
+
1516
+ /* Put DOCTYPE declaration between the
1517
+ ** <?xml version "1.0" ... ?> declaration, if any,
1518
+ ** and the <html> tag. Should also work for any comments,
1519
+ ** etc. that may precede the <html> tag.
1520
+ */
1521
+
1522
+ static Node* NewDocTypeNode( TidyDocImpl* doc )
1523
+ {
1524
+ Node* doctype = NULL;
1525
+ Node* html = TY_(FindHTML)( doc );
1526
+
1527
+ if ( !html )
1528
+ return NULL;
1529
+
1530
+ doctype = TY_(NewNode)( doc->allocator, NULL );
1531
+ doctype->type = DocTypeTag;
1532
+ TY_(InsertNodeBeforeElement)(html, doctype);
1533
+ return doctype;
1534
+ }
1535
+
1536
+ Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
1537
+ {
1538
+ Lexer *lexer = doc->lexer;
1539
+ Node *doctype = TY_(FindDocType)( doc );
1540
+ TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
1541
+ ctmbstr pub = "PUBLIC";
1542
+ ctmbstr sys = "SYSTEM";
1543
+
1544
+ lexer->versionEmitted = TY_(ApparentVersion)( doc );
1545
+
1546
+ if (dtmode == TidyDoctypeOmit)
1547
+ {
1548
+ if (doctype)
1549
+ TY_(DiscardElement)(doc, doctype);
1550
+ return yes;
1551
+ }
1552
+
1553
+ if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
1554
+ return no;
1555
+
1556
+ if (!doctype)
1557
+ {
1558
+ doctype = NewDocTypeNode(doc);
1559
+ doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
1560
+ }
1561
+ else
1562
+ {
1563
+ doctype->element = TY_(tmbstrtolower)(doctype->element);
1564
+ }
1565
+
1566
+ switch(dtmode)
1567
+ {
1568
+ case TidyDoctypeStrict:
1569
+ /* XHTML 1.0 Strict */
1570
+ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1571
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1572
+ lexer->versionEmitted = X10S;
1573
+ break;
1574
+ case TidyDoctypeLoose:
1575
+ /* XHTML 1.0 Transitional */
1576
+ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1577
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1578
+ lexer->versionEmitted = X10T;
1579
+ break;
1580
+ case TidyDoctypeUser:
1581
+ /* user defined document type declaration */
1582
+ TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype));
1583
+ TY_(RepairAttrValue)(doc, doctype, sys, "");
1584
+ break;
1585
+ case TidyDoctypeAuto:
1586
+ if (lexer->versions & XH11 && lexer->doctype == XH11)
1587
+ {
1588
+ if (!TY_(GetAttrByName)(doctype, sys))
1589
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1590
+ lexer->versionEmitted = XH11;
1591
+ return yes;
1592
+ }
1593
+ else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
1594
+ {
1595
+ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11));
1596
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1597
+ lexer->versionEmitted = XH11;
1598
+ }
1599
+ else if (lexer->versions & XB10 && lexer->doctype == XB10)
1600
+ {
1601
+ if (!TY_(GetAttrByName)(doctype, sys))
1602
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10));
1603
+ lexer->versionEmitted = XB10;
1604
+ return yes;
1605
+ }
1606
+ else if (lexer->versions & VERS_HTML40_STRICT)
1607
+ {
1608
+ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1609
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1610
+ lexer->versionEmitted = X10S;
1611
+ }
1612
+ else if (lexer->versions & VERS_FRAMESET)
1613
+ {
1614
+ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F));
1615
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F));
1616
+ lexer->versionEmitted = X10F;
1617
+ }
1618
+ else if (lexer->versions & VERS_LOOSE)
1619
+ {
1620
+ TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1621
+ TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1622
+ lexer->versionEmitted = X10T;
1623
+ }
1624
+ else
1625
+ {
1626
+ if (doctype)
1627
+ TY_(DiscardElement)(doc, doctype);
1628
+ return no;
1629
+ }
1630
+ break;
1631
+ case TidyDoctypeOmit:
1632
+ assert(0);
1633
+ break;
1634
+ }
1635
+
1636
+ return no;
1637
+ }
1638
+
1639
+ /* fixup doctype if missing */
1640
+ Bool TY_(FixDocType)( TidyDocImpl* doc )
1641
+ {
1642
+ Lexer* lexer = doc->lexer;
1643
+ Node* doctype = TY_(FindDocType)( doc );
1644
+ uint dtmode = cfg( doc, TidyDoctypeMode );
1645
+ uint guessed = VERS_UNKNOWN;
1646
+ Bool hadSI = no;
1647
+
1648
+ if (dtmode == TidyDoctypeAuto &&
1649
+ lexer->versions & lexer->doctype &&
1650
+ !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
1651
+ && TY_(FindDocType)(doc))
1652
+ {
1653
+ lexer->versionEmitted = lexer->doctype;
1654
+ return yes;
1655
+ }
1656
+
1657
+ if (dtmode == TidyDoctypeOmit)
1658
+ {
1659
+ if (doctype)
1660
+ TY_(DiscardElement)( doc, doctype );
1661
+ lexer->versionEmitted = TY_(ApparentVersion)( doc );
1662
+ return yes;
1663
+ }
1664
+
1665
+ if (cfgBool(doc, TidyXmlOut))
1666
+ return yes;
1667
+
1668
+ if (doctype)
1669
+ hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL;
1670
+
1671
+ if ((dtmode == TidyDoctypeStrict ||
1672
+ dtmode == TidyDoctypeLoose) && doctype)
1673
+ {
1674
+ TY_(DiscardElement)(doc, doctype);
1675
+ doctype = NULL;
1676
+ }
1677
+
1678
+ switch (dtmode)
1679
+ {
1680
+ case TidyDoctypeStrict:
1681
+ guessed = H41S;
1682
+ break;
1683
+ case TidyDoctypeLoose:
1684
+ guessed = H41T;
1685
+ break;
1686
+ case TidyDoctypeAuto:
1687
+ guessed = TY_(HTMLVersion)(doc);
1688
+ break;
1689
+ }
1690
+
1691
+ lexer->versionEmitted = guessed;
1692
+ if (guessed == VERS_UNKNOWN)
1693
+ return no;
1694
+
1695
+ if (doctype)
1696
+ {
1697
+ doctype->element = TY_(tmbstrtolower)(doctype->element);
1698
+ }
1699
+ else
1700
+ {
1701
+ doctype = NewDocTypeNode(doc);
1702
+ doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
1703
+ }
1704
+
1705
+ TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
1706
+
1707
+ if (hadSI)
1708
+ TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
1709
+
1710
+ return yes;
1711
+ }
1712
+
1713
+ /* ensure XML document starts with <?xml version="1.0"?> */
1714
+ /* add encoding attribute if not using ASCII or UTF-8 output */
1715
+ Bool TY_(FixXmlDecl)( TidyDocImpl* doc )
1716
+ {
1717
+ Node* xml;
1718
+ AttVal *version, *encoding;
1719
+ Lexer*lexer = doc->lexer;
1720
+ Node* root = &doc->root;
1721
+
1722
+ if ( root->content && root->content->type == XmlDecl )
1723
+ {
1724
+ xml = root->content;
1725
+ }
1726
+ else
1727
+ {
1728
+ xml = TY_(NewNode)(lexer->allocator, lexer);
1729
+ xml->type = XmlDecl;
1730
+ if ( root->content )
1731
+ TY_(InsertNodeBeforeElement)(root->content, xml);
1732
+ else
1733
+ root->content = xml;
1734
+ }
1735
+
1736
+ version = TY_(GetAttrByName)(xml, "version");
1737
+ encoding = TY_(GetAttrByName)(xml, "encoding");
1738
+
1739
+ /*
1740
+ We need to insert a check if declared encoding
1741
+ and output encoding mismatch and fix the XML
1742
+ declaration accordingly!!!
1743
+ */
1744
+
1745
+ if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
1746
+ {
1747
+ ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
1748
+ if ( enc )
1749
+ TY_(AddAttribute)( doc, xml, "encoding", enc );
1750
+ }
1751
+
1752
+ if ( version == NULL )
1753
+ TY_(AddAttribute)( doc, xml, "version", "1.0" );
1754
+ return yes;
1755
+ }
1756
+
1757
+ Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id)
1758
+ {
1759
+ Lexer *lexer = doc->lexer;
1760
+ Node *node = TY_(NewNode)( lexer->allocator, lexer );
1761
+ const Dict* dict = TY_(LookupTagDef)(id);
1762
+
1763
+ assert( dict != NULL );
1764
+
1765
+ node->type = StartTag;
1766
+ node->implicit = yes;
1767
+ node->element = TY_(tmbstrdup)(doc->allocator, dict->name);
1768
+ node->tag = dict;
1769
+ node->start = lexer->txtstart;
1770
+ node->end = lexer->txtend;
1771
+
1772
+ return node;
1773
+ }
1774
+
1775
+ static Bool ExpectsContent(Node *node)
1776
+ {
1777
+ if (node->type != StartTag)
1778
+ return no;
1779
+
1780
+ /* unknown element? */
1781
+ if (node->tag == NULL)
1782
+ return yes;
1783
+
1784
+ if (node->tag->model & CM_EMPTY)
1785
+ return no;
1786
+
1787
+ return yes;
1788
+ }
1789
+
1790
+ /*
1791
+ create a text node for the contents of
1792
+ a CDATA element like style or script
1793
+ which ends with </foo> for some foo.
1794
+ */
1795
+
1796
/* Scanner states used by GetCDATA() */
typedef enum
{
    CDATA_INTERMEDIATE,   /* plain content, not inside a tag name */
    CDATA_STARTTAG,       /* '<' + letter seen, reading the name */
    CDATA_ENDTAG          /* '<' + '/' + letter seen, reading the name */
} CDATAState;
1802
+
1803
+ static Node *GetCDATA( TidyDocImpl* doc, Node *container )
1804
+ {
1805
+ Lexer* lexer = doc->lexer;
1806
+ uint start = 0;
1807
+ int nested = 0;
1808
+ CDATAState state = CDATA_INTERMEDIATE;
1809
+ uint i;
1810
+ Bool isEmpty = yes;
1811
+ Bool matches = no;
1812
+ uint c;
1813
+ Bool hasSrc = TY_(AttrGetById)(container, TidyAttr_SRC) != NULL;
1814
+
1815
+ SetLexerLocus( doc, lexer );
1816
+ lexer->waswhite = no;
1817
+ lexer->txtstart = lexer->txtend = lexer->lexsize;
1818
+
1819
+ /* seen start tag, look for matching end tag */
1820
+ while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
1821
+ {
1822
+ TY_(AddCharToLexer)(lexer, c);
1823
+ lexer->txtend = lexer->lexsize;
1824
+
1825
+ if (state == CDATA_INTERMEDIATE)
1826
+ {
1827
+ if (c != '<')
1828
+ {
1829
+ if (isEmpty && !TY_(IsWhite)(c))
1830
+ isEmpty = no;
1831
+ continue;
1832
+ }
1833
+
1834
+ c = TY_(ReadChar)(doc->docIn);
1835
+
1836
+ if (TY_(IsLetter)(c))
1837
+ {
1838
+ /* <head><script src=foo><meta name=foo content=bar>*/
1839
+ if (hasSrc && isEmpty && nodeIsSCRIPT(container))
1840
+ {
1841
+ /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
1842
+ lexer->lexsize = lexer->txtstart;
1843
+ TY_(UngetChar)(c, doc->docIn);
1844
+ TY_(UngetChar)('<', doc->docIn);
1845
+ return NULL;
1846
+ }
1847
+ TY_(AddCharToLexer)(lexer, c);
1848
+ start = lexer->lexsize - 1;
1849
+ state = CDATA_STARTTAG;
1850
+ }
1851
+ else if (c == '/')
1852
+ {
1853
+ TY_(AddCharToLexer)(lexer, c);
1854
+
1855
+ c = TY_(ReadChar)(doc->docIn);
1856
+
1857
+ if (!TY_(IsLetter)(c))
1858
+ {
1859
+ TY_(UngetChar)(c, doc->docIn);
1860
+ continue;
1861
+ }
1862
+ TY_(UngetChar)(c, doc->docIn);
1863
+
1864
+ start = lexer->lexsize;
1865
+ state = CDATA_ENDTAG;
1866
+ }
1867
+ else if (c == '\\')
1868
+ {
1869
+ /* recognize document.write("<script><\/script>") */
1870
+ TY_(AddCharToLexer)(lexer, c);
1871
+
1872
+ c = TY_(ReadChar)(doc->docIn);
1873
+
1874
+ if (c != '/')
1875
+ {
1876
+ TY_(UngetChar)(c, doc->docIn);
1877
+ continue;
1878
+ }
1879
+
1880
+ TY_(AddCharToLexer)(lexer, c);
1881
+ c = TY_(ReadChar)(doc->docIn);
1882
+
1883
+ if (!TY_(IsLetter)(c))
1884
+ {
1885
+ TY_(UngetChar)(c, doc->docIn);
1886
+ continue;
1887
+ }
1888
+ TY_(UngetChar)(c, doc->docIn);
1889
+
1890
+ start = lexer->lexsize;
1891
+ state = CDATA_ENDTAG;
1892
+ }
1893
+ else
1894
+ {
1895
+ TY_(UngetChar)(c, doc->docIn);
1896
+ }
1897
+ }
1898
+ /* '<' + Letter found */
1899
+ else if (state == CDATA_STARTTAG)
1900
+ {
1901
+ if (TY_(IsLetter)(c))
1902
+ continue;
1903
+
1904
+ matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
1905
+ TY_(tmbstrlen)(container->element)) == 0;
1906
+ if (matches)
1907
+ nested++;
1908
+
1909
+ state = CDATA_INTERMEDIATE;
1910
+ }
1911
+ /* '<' + '/' + Letter found */
1912
+ else if (state == CDATA_ENDTAG)
1913
+ {
1914
+ if (TY_(IsLetter)(c))
1915
+ continue;
1916
+
1917
+ matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
1918
+ TY_(tmbstrlen)(container->element)) == 0;
1919
+
1920
+ if (isEmpty && !matches)
1921
+ {
1922
+ /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
1923
+
1924
+ for (i = lexer->lexsize - 1; i >= start; --i)
1925
+ TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
1926
+ TY_(UngetChar)('/', doc->docIn);
1927
+ TY_(UngetChar)('<', doc->docIn);
1928
+ break;
1929
+ }
1930
+
1931
+ if (matches && nested-- <= 0)
1932
+ {
1933
+ for (i = lexer->lexsize - 1; i >= start; --i)
1934
+ TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
1935
+ TY_(UngetChar)('/', doc->docIn);
1936
+ TY_(UngetChar)('<', doc->docIn);
1937
+ lexer->lexsize -= (lexer->lexsize - start) + 2;
1938
+ break;
1939
+ }
1940
+ else if (lexer->lexbuf[start - 2] != '\\')
1941
+ {
1942
+ /* if the end tag is not already escaped using backslash */
1943
+ SetLexerLocus( doc, lexer );
1944
+ lexer->columns -= 3;
1945
+ TY_(ReportError)(doc, NULL, NULL, BAD_CDATA_CONTENT);
1946
+
1947
+ /* if javascript insert backslash before / */
1948
+ if (TY_(IsJavaScript)(container))
1949
+ {
1950
+ for (i = lexer->lexsize; i > start-1; --i)
1951
+ lexer->lexbuf[i] = lexer->lexbuf[i-1];
1952
+
1953
+ lexer->lexbuf[start-1] = '\\';
1954
+ lexer->lexsize++;
1955
+ }
1956
+ }
1957
+ state = CDATA_INTERMEDIATE;
1958
+ }
1959
+ }
1960
+ if (isEmpty)
1961
+ lexer->lexsize = lexer->txtstart = lexer->txtend;
1962
+ else
1963
+ lexer->txtend = lexer->lexsize;
1964
+
1965
+ if (c == EndOfStream)
1966
+ TY_(ReportError)(doc, container, NULL, MISSING_ENDTAG_FOR );
1967
+
1968
+ /* this was disabled for some reason... */
1969
+ #if 0
1970
+ if (lexer->txtend > lexer->txtstart)
1971
+ return TextToken(lexer);
1972
+ else
1973
+ return NULL;
1974
+ #else
1975
+ return TY_(TextToken)(lexer);
1976
+ #endif
1977
+ }
1978
+
1979
+ void TY_(UngetToken)( TidyDocImpl* doc )
1980
+ {
1981
+ doc->lexer->pushed = yes;
1982
+ }
1983
+
1984
/* CondReturnTextNode(doc, skip): if the lexer holds pending text
** (txtend > txtstart), turn it into a text token, store it in
** lexer->token and return it from the enclosing function. When
** original text is stored, `skip` characters of it are held back.
** Expands to a bare if-statement that uses a local named `lexer`.
*/
#ifdef TIDY_STORE_ORIGINAL_TEXT
#define CondReturnTextNode(doc, skip)                          \
    if (lexer->txtend > lexer->txtstart)                       \
    {                                                          \
        lexer->token = TY_(TextToken)(lexer);                  \
        StoreOriginalTextInToken(doc, lexer->token, skip);     \
        return lexer->token;                                   \
    }
#else
#define CondReturnTextNode(doc, skip)                          \
    if (lexer->txtend > lexer->txtstart)                       \
    {                                                          \
        lexer->token = TY_(TextToken)(lexer);                  \
        return lexer->token;                                   \
    }
#endif
2000
+
2001
+ /*
2002
+ modes for GetToken()
2003
+
2004
+ MixedContent -- for elements which don't accept PCDATA
2005
+ Preformatted -- white space preserved as is
2006
+ IgnoreMarkup -- for CDATA elements such as script, style
2007
+ */
2008
+ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
2009
+
2010
+ Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
2011
+ {
2012
+ Lexer* lexer = doc->lexer;
2013
+
2014
+ if (lexer->pushed || lexer->itoken)
2015
+ {
2016
+ /* Deal with previously returned duplicate inline token */
2017
+ if (lexer->itoken)
2018
+ {
2019
+ /* itoken rejected */
2020
+ if (lexer->pushed)
2021
+ {
2022
+ lexer->pushed = no;
2023
+ return lexer->itoken;
2024
+ }
2025
+ /* itoken has been accepted */
2026
+ lexer->itoken = NULL;
2027
+ }
2028
+
2029
+ /* duplicate inlines in preference to pushed text nodes when appropriate */
2030
+ lexer->pushed = no;
2031
+ if (lexer->token->type != TextNode
2032
+ || !(lexer->insert || lexer->inode))
2033
+ return lexer->token;
2034
+ return lexer->itoken = TY_(InsertedToken)( doc );
2035
+ }
2036
+
2037
+ assert( !(lexer->pushed || lexer->itoken) );
2038
+
2039
+ /* at start of block elements, unclosed inline
2040
+ elements are inserted into the token stream */
2041
+ if (lexer->insert || lexer->inode)
2042
+ return lexer->token = TY_(InsertedToken)( doc );
2043
+
2044
+ if (mode == CdataContent)
2045
+ {
2046
+ assert( lexer->parent != NULL );
2047
+ return GetCDATA(doc, lexer->parent);
2048
+ }
2049
+
2050
+ return GetTokenFromStream( doc, mode );
2051
+ }
2052
+
2053
+ static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
2054
+ {
2055
+ Lexer* lexer = doc->lexer;
2056
+ uint c, badcomment = 0;
2057
+ Bool isempty = no;
2058
+ AttVal *attributes = NULL;
2059
+
2060
+ /* Lexer->token must be set on return. Nullify it for safety. */
2061
+ lexer->token = NULL;
2062
+
2063
+ SetLexerLocus( doc, lexer );
2064
+ lexer->waswhite = no;
2065
+
2066
+ lexer->txtstart = lexer->txtend = lexer->lexsize;
2067
+
2068
+ while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2069
+ {
2070
+ if (lexer->insertspace)
2071
+ {
2072
+ TY_(AddCharToLexer)(lexer, ' ');
2073
+ lexer->waswhite = yes;
2074
+ lexer->insertspace = no;
2075
+ }
2076
+
2077
+ if (c == 160 && (mode == Preformatted))
2078
+ c = ' ';
2079
+
2080
+ TY_(AddCharToLexer)(lexer, c);
2081
+
2082
+ switch (lexer->state)
2083
+ {
2084
+ case LEX_CONTENT: /* element content */
2085
+
2086
+ /*
2087
+ Discard white space if appropriate. Its cheaper
2088
+ to do this here rather than in parser methods
2089
+ for elements that don't have mixed content.
2090
+ */
2091
+ if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace)
2092
+ && lexer->lexsize == lexer->txtstart + 1)
2093
+ {
2094
+ --(lexer->lexsize);
2095
+ lexer->waswhite = no;
2096
+ SetLexerLocus( doc, lexer );
2097
+ continue;
2098
+ }
2099
+
2100
+ if (c == '<')
2101
+ {
2102
+ lexer->state = LEX_GT;
2103
+ continue;
2104
+ }
2105
+
2106
+ if (TY_(IsWhite)(c))
2107
+ {
2108
+ /* was previous character white? */
2109
+ if (lexer->waswhite)
2110
+ {
2111
+ if (mode != Preformatted && mode != IgnoreMarkup)
2112
+ {
2113
+ --(lexer->lexsize);
2114
+ SetLexerLocus( doc, lexer );
2115
+ }
2116
+ }
2117
+ else /* prev character wasn't white */
2118
+ {
2119
+ lexer->waswhite = yes;
2120
+
2121
+ if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2122
+ ChangeChar(lexer, ' ');
2123
+ }
2124
+
2125
+ continue;
2126
+ }
2127
+ else if (c == '&' && mode != IgnoreMarkup)
2128
+ ParseEntity( doc, mode );
2129
+
2130
+ /* this is needed to avoid trimming trailing whitespace */
2131
+ if (mode == IgnoreWhitespace)
2132
+ mode = MixedContent;
2133
+
2134
+ lexer->waswhite = no;
2135
+ continue;
2136
+
2137
+ case LEX_GT: /* < */
2138
+
2139
+ /* check for endtag */
2140
+ if (c == '/')
2141
+ {
2142
+ if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2143
+ {
2144
+ TY_(UngetChar)(c, doc->docIn);
2145
+ continue;
2146
+ }
2147
+
2148
+ TY_(AddCharToLexer)(lexer, c);
2149
+
2150
+ if (TY_(IsLetter)(c))
2151
+ {
2152
+ lexer->lexsize -= 3;
2153
+ lexer->txtend = lexer->lexsize;
2154
+ TY_(UngetChar)(c, doc->docIn);
2155
+ lexer->state = LEX_ENDTAG;
2156
+ lexer->lexbuf[lexer->lexsize] = '\0'; /* debug */
2157
+ doc->docIn->curcol -= 2;
2158
+
2159
+ /* if some text before the </ return it now */
2160
+ if (lexer->txtend > lexer->txtstart)
2161
+ {
2162
+ /* trim space character before end tag */
2163
+ if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2164
+ {
2165
+ lexer->lexsize -= 1;
2166
+ lexer->txtend = lexer->lexsize;
2167
+ }
2168
+ lexer->token = TY_(TextToken)(lexer);
2169
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
2170
+ StoreOriginalTextInToken(doc, lexer->token, 3);
2171
+ #endif
2172
+ return lexer->token;
2173
+ }
2174
+
2175
+ continue; /* no text so keep going */
2176
+ }
2177
+
2178
+ /* otherwise treat as CDATA */
2179
+ lexer->waswhite = no;
2180
+ lexer->state = LEX_CONTENT;
2181
+ continue;
2182
+ }
2183
+
2184
+ if (mode == IgnoreMarkup)
2185
+ {
2186
+ /* otherwise treat as CDATA */
2187
+ lexer->waswhite = no;
2188
+ lexer->state = LEX_CONTENT;
2189
+ continue;
2190
+ }
2191
+
2192
+ /*
2193
+ look out for comments, doctype or marked sections
2194
+ this isn't quite right, but its getting there ...
2195
+ */
2196
+ if (c == '!')
2197
+ {
2198
+ c = TY_(ReadChar)(doc->docIn);
2199
+
2200
+ if (c == '-')
2201
+ {
2202
+ c = TY_(ReadChar)(doc->docIn);
2203
+
2204
+ if (c == '-')
2205
+ {
2206
+ lexer->state = LEX_COMMENT; /* comment */
2207
+ lexer->lexsize -= 2;
2208
+ lexer->txtend = lexer->lexsize;
2209
+
2210
+ CondReturnTextNode(doc, 4)
2211
+
2212
+ lexer->txtstart = lexer->lexsize;
2213
+ continue;
2214
+ }
2215
+
2216
+ TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2217
+ }
2218
+ else if (c == 'd' || c == 'D')
2219
+ {
2220
+ /* todo: check for complete "<!DOCTYPE" not just <!D */
2221
+
2222
+ uint skip = 0;
2223
+
2224
+ lexer->state = LEX_DOCTYPE; /* doctype */
2225
+ lexer->lexsize -= 2;
2226
+ lexer->txtend = lexer->lexsize;
2227
+ mode = IgnoreWhitespace;
2228
+
2229
+ /* skip until white space or '>' */
2230
+
2231
+ for (;;)
2232
+ {
2233
+ c = TY_(ReadChar)(doc->docIn);
2234
+ ++skip;
2235
+
2236
+ if (c == EndOfStream || c == '>')
2237
+ {
2238
+ TY_(UngetChar)(c, doc->docIn);
2239
+ break;
2240
+ }
2241
+
2242
+
2243
+ if (!TY_(IsWhite)(c))
2244
+ continue;
2245
+
2246
+ /* and skip to end of whitespace */
2247
+
2248
+ for (;;)
2249
+ {
2250
+ c = TY_(ReadChar)(doc->docIn);
2251
+ ++skip;
2252
+
2253
+ if (c == EndOfStream || c == '>')
2254
+ {
2255
+ TY_(UngetChar)(c, doc->docIn);
2256
+ break;
2257
+ }
2258
+
2259
+
2260
+ if (TY_(IsWhite)(c))
2261
+ continue;
2262
+
2263
+ TY_(UngetChar)(c, doc->docIn);
2264
+ break;
2265
+ }
2266
+
2267
+ break;
2268
+ }
2269
+
2270
+ CondReturnTextNode(doc, (skip + 3))
2271
+
2272
+ lexer->txtstart = lexer->lexsize;
2273
+ continue;
2274
+ }
2275
+ else if (c == '[')
2276
+ {
2277
+ /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2278
+ lexer->lexsize -= 2;
2279
+ lexer->state = LEX_SECTION;
2280
+ lexer->txtend = lexer->lexsize;
2281
+
2282
+ CondReturnTextNode(doc, 2)
2283
+
2284
+ lexer->txtstart = lexer->lexsize;
2285
+ continue;
2286
+ }
2287
+
2288
+
2289
+
2290
+ /* else swallow characters up to and including next '>' */
2291
+ while ((c = TY_(ReadChar)(doc->docIn)) != '>')
2292
+ {
2293
+ if (c == EndOfStream)
2294
+ {
2295
+ TY_(UngetChar)(c, doc->docIn);
2296
+ break;
2297
+ }
2298
+ }
2299
+
2300
+ lexer->lexsize -= 2;
2301
+ lexer->lexbuf[lexer->lexsize] = '\0';
2302
+ lexer->state = LEX_CONTENT;
2303
+ continue;
2304
+ }
2305
+
2306
+ /*
2307
+ processing instructions
2308
+ */
2309
+
2310
+ if (c == '?')
2311
+ {
2312
+ lexer->lexsize -= 2;
2313
+ lexer->state = LEX_PROCINSTR;
2314
+ lexer->txtend = lexer->lexsize;
2315
+
2316
+ CondReturnTextNode(doc, 2)
2317
+
2318
+ lexer->txtstart = lexer->lexsize;
2319
+ continue;
2320
+ }
2321
+
2322
+ /* Microsoft ASP's e.g. <% ... server-code ... %> */
2323
+ if (c == '%')
2324
+ {
2325
+ lexer->lexsize -= 2;
2326
+ lexer->state = LEX_ASP;
2327
+ lexer->txtend = lexer->lexsize;
2328
+
2329
+ CondReturnTextNode(doc, 2)
2330
+
2331
+ lexer->txtstart = lexer->lexsize;
2332
+ continue;
2333
+ }
2334
+
2335
+ /* Netscapes JSTE e.g. <# ... server-code ... #> */
2336
+ if (c == '#')
2337
+ {
2338
+ lexer->lexsize -= 2;
2339
+ lexer->state = LEX_JSTE;
2340
+ lexer->txtend = lexer->lexsize;
2341
+
2342
+ CondReturnTextNode(doc, 2)
2343
+
2344
+ lexer->txtstart = lexer->lexsize;
2345
+ continue;
2346
+ }
2347
+
2348
+ /* check for start tag */
2349
+ if (TY_(IsLetter)(c))
2350
+ {
2351
+ TY_(UngetChar)(c, doc->docIn); /* push back letter */
2352
+ TY_(UngetChar)('<', doc->docIn);
2353
+ lexer->lexsize -= 2; /* discard "<" + letter */
2354
+ lexer->txtend = lexer->lexsize;
2355
+ lexer->state = LEX_STARTTAG; /* ready to read tag name */
2356
+
2357
+ CondReturnTextNode(doc, 2)
2358
+
2359
+ /* lexer->txtstart = lexer->lexsize; missing here? */
2360
+ continue; /* no text so keep going */
2361
+ }
2362
+
2363
+ /* fix for bug 762102 */
2364
+ if (c == '&')
2365
+ {
2366
+ TY_(UngetChar)(c, doc->docIn);
2367
+ --(lexer->lexsize);
2368
+ }
2369
+
2370
+ /* otherwise treat as CDATA */
2371
+ lexer->state = LEX_CONTENT;
2372
+ lexer->waswhite = no;
2373
+ continue;
2374
+
2375
+ case LEX_ENDTAG: /* </letter */
2376
+ lexer->txtstart = lexer->lexsize - 1;
2377
+ doc->docIn->curcol += 2;
2378
+ c = ParseTagName( doc );
2379
+ lexer->token = TagToken( doc, EndTag ); /* create endtag token */
2380
+ lexer->lexsize = lexer->txtend = lexer->txtstart;
2381
+
2382
+ /* skip to '>' */
2383
+ while ( c != '>' && c != EndOfStream )
2384
+ {
2385
+ c = TY_(ReadChar)(doc->docIn);
2386
+ }
2387
+
2388
+ if (c == EndOfStream)
2389
+ {
2390
+ TY_(FreeNode)( doc, lexer->token );
2391
+ continue;
2392
+ }
2393
+
2394
+ lexer->state = LEX_CONTENT;
2395
+ lexer->waswhite = no;
2396
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
2397
+ StoreOriginalTextInToken(doc, lexer->token, 0); /* hmm... */
2398
+ #endif
2399
+ return lexer->token; /* the endtag token */
2400
+
2401
+ case LEX_STARTTAG: /* first letter of tagname */
2402
+ c = TY_(ReadChar)(doc->docIn);
2403
+ ChangeChar(lexer, (tmbchar)c);
2404
+ lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2405
+ c = ParseTagName( doc );
2406
+ isempty = no;
2407
+ attributes = NULL;
2408
+ lexer->token = TagToken( doc, (isempty ? StartEndTag : StartTag) );
2409
+
2410
+ /* parse attributes, consuming closing ">" */
2411
+ if (c != '>')
2412
+ {
2413
+ if (c == '/')
2414
+ TY_(UngetChar)(c, doc->docIn);
2415
+
2416
+ attributes = ParseAttrs( doc, &isempty );
2417
+ }
2418
+
2419
+ if (isempty)
2420
+ lexer->token->type = StartEndTag;
2421
+
2422
+ lexer->token->attributes = attributes;
2423
+ lexer->lexsize = lexer->txtend = lexer->txtstart;
2424
+
2425
+ /* swallow newline following start tag */
2426
+ /* special check needed for CRLF sequence */
2427
+ /* this doesn't apply to empty elements */
2428
+ /* nor to preformatted content that needs escaping */
2429
+
2430
+ if ((mode != Preformatted && ExpectsContent(lexer->token))
2431
+ || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2432
+ {
2433
+ c = TY_(ReadChar)(doc->docIn);
2434
+
2435
+ if (c != '\n' && c != '\f')
2436
+ TY_(UngetChar)(c, doc->docIn);
2437
+
2438
+ lexer->waswhite = yes; /* to swallow leading whitespace */
2439
+ }
2440
+ else
2441
+ lexer->waswhite = no;
2442
+
2443
+ lexer->state = LEX_CONTENT;
2444
+ if (lexer->token->tag == NULL)
2445
+ TY_(ReportFatal)( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2446
+ else if ( !cfgBool(doc, TidyXmlTags) )
2447
+ {
2448
+ Node* curr = lexer->token;
2449
+ TY_(ConstrainVersion)( doc, curr->tag->versions );
2450
+
2451
+ if ( curr->tag->versions & VERS_PROPRIETARY )
2452
+ {
2453
+ if ( !cfgBool(doc, TidyMakeClean) ||
2454
+ ( !nodeIsNOBR(curr) && !nodeIsWBR(curr) ) )
2455
+ {
2456
+ TY_(ReportError)(doc, NULL, curr, PROPRIETARY_ELEMENT );
2457
+
2458
+ if ( nodeIsLAYER(curr) )
2459
+ doc->badLayout |= USING_LAYER;
2460
+ else if ( nodeIsSPACER(curr) )
2461
+ doc->badLayout |= USING_SPACER;
2462
+ else if ( nodeIsNOBR(curr) )
2463
+ doc->badLayout |= USING_NOBR;
2464
+ }
2465
+ }
2466
+
2467
+ TY_(RepairDuplicateAttributes)( doc, curr, no );
2468
+ } else
2469
+ TY_(RepairDuplicateAttributes)( doc, lexer->token, yes );
2470
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
2471
+ StoreOriginalTextInToken(doc, lexer->token, 0);
2472
+ #endif
2473
+ return lexer->token; /* return start tag */
2474
+
2475
+ case LEX_COMMENT: /* seen <!-- so look for --> */
2476
+
2477
+ if (c != '-')
2478
+ continue;
2479
+
2480
+ c = TY_(ReadChar)(doc->docIn);
2481
+ TY_(AddCharToLexer)(lexer, c);
2482
+
2483
+ if (c != '-')
2484
+ continue;
2485
+
2486
+ end_comment:
2487
+ c = TY_(ReadChar)(doc->docIn);
2488
+
2489
+ if (c == '>')
2490
+ {
2491
+ if (badcomment)
2492
+ TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2493
+
2494
+ /* do not store closing -- in lexbuf */
2495
+ lexer->lexsize -= 2;
2496
+ lexer->txtend = lexer->lexsize;
2497
+ lexer->lexbuf[lexer->lexsize] = '\0';
2498
+ lexer->state = LEX_CONTENT;
2499
+ lexer->waswhite = no;
2500
+ lexer->token = CommentToken(doc);
2501
+
2502
+ /* now look for a line break */
2503
+
2504
+ c = TY_(ReadChar)(doc->docIn);
2505
+
2506
+ if (c == '\n')
2507
+ lexer->token->linebreak = yes;
2508
+ else
2509
+ TY_(UngetChar)(c, doc->docIn);
2510
+
2511
+ return lexer->token;
2512
+ }
2513
+
2514
+ /* note position of first such error in the comment */
2515
+ if (!badcomment)
2516
+ {
2517
+ SetLexerLocus( doc, lexer );
2518
+ lexer->columns -= 3;
2519
+ }
2520
+
2521
+ badcomment++;
2522
+
2523
+ if ( cfgBool(doc, TidyFixComments) )
2524
+ lexer->lexbuf[lexer->lexsize - 2] = '=';
2525
+
2526
+ /* if '-' then look for '>' to end the comment */
2527
+ if (c == '-')
2528
+ {
2529
+ TY_(AddCharToLexer)(lexer, c);
2530
+ goto end_comment;
2531
+ }
2532
+
2533
+ /* otherwise continue to look for --> */
2534
+ lexer->lexbuf[lexer->lexsize - 1] = '=';
2535
+
2536
+ /* http://tidy.sf.net/bug/1266647 */
2537
+ TY_(AddCharToLexer)(lexer, c);
2538
+
2539
+ continue;
2540
+
2541
+ case LEX_DOCTYPE: /* seen <!d so look for '>' munging whitespace */
2542
+
2543
+ /* use ParseDocTypeDecl() to tokenize doctype declaration */
2544
+ TY_(UngetChar)(c, doc->docIn);
2545
+ lexer->lexsize -= 1;
2546
+ lexer->token = ParseDocTypeDecl(doc);
2547
+
2548
+ lexer->txtend = lexer->lexsize;
2549
+ lexer->lexbuf[lexer->lexsize] = '\0';
2550
+ lexer->state = LEX_CONTENT;
2551
+ lexer->waswhite = no;
2552
+
2553
+ /* make a note of the version named by the 1st doctype */
2554
+ if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
2555
+ lexer->doctype = FindGivenVersion(doc, lexer->token);
2556
+ return lexer->token;
2557
+
2558
+ case LEX_PROCINSTR: /* seen <? so look for '>' */
2559
+ /* check for PHP preprocessor instructions <?php ... ?> */
2560
+
2561
+ if (lexer->lexsize - lexer->txtstart == 3)
2562
+ {
2563
+ if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
2564
+ {
2565
+ lexer->state = LEX_PHP;
2566
+ continue;
2567
+ }
2568
+ }
2569
+
2570
+ if (lexer->lexsize - lexer->txtstart == 4)
2571
+ {
2572
+ if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
2573
+ TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3]))
2574
+ {
2575
+ lexer->state = LEX_XMLDECL;
2576
+ attributes = NULL;
2577
+ continue;
2578
+ }
2579
+ }
2580
+
2581
+ if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
2582
+ {
2583
+ if (c != '?')
2584
+ continue;
2585
+
2586
+ /* now look for '>' */
2587
+ c = TY_(ReadChar)(doc->docIn);
2588
+
2589
+ if (c == EndOfStream)
2590
+ {
2591
+ TY_(ReportError)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
2592
+ TY_(UngetChar)(c, doc->docIn);
2593
+ continue;
2594
+ }
2595
+
2596
+ TY_(AddCharToLexer)(lexer, c);
2597
+ }
2598
+
2599
+
2600
+ if (c != '>')
2601
+ continue;
2602
+
2603
+ lexer->lexsize -= 1;
2604
+
2605
+ if (lexer->lexsize)
2606
+ {
2607
+ uint i;
2608
+ Bool closed;
2609
+
2610
+ for (i = 0; i < lexer->lexsize - lexer->txtstart &&
2611
+ !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i)
2612
+ /**/;
2613
+
2614
+ closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
2615
+
2616
+ if (closed)
2617
+ lexer->lexsize -= 1;
2618
+
2619
+ lexer->txtstart += i;
2620
+ lexer->txtend = lexer->lexsize;
2621
+ lexer->lexbuf[lexer->lexsize] = '\0';
2622
+
2623
+ lexer->token = PIToken(doc);
2624
+ lexer->token->closed = closed;
2625
+ lexer->token->element = TY_(tmbstrndup)(doc->allocator,
2626
+ lexer->lexbuf +
2627
+ lexer->txtstart - i, i);
2628
+ }
2629
+ else
2630
+ {
2631
+ lexer->txtend = lexer->lexsize;
2632
+ lexer->lexbuf[lexer->lexsize] = '\0';
2633
+ lexer->token = PIToken(doc);
2634
+ }
2635
+
2636
+ lexer->state = LEX_CONTENT;
2637
+ lexer->waswhite = no;
2638
+ return lexer->token;
2639
+
2640
+ case LEX_ASP: /* seen <% so look for "%>" */
2641
+ if (c != '%')
2642
+ continue;
2643
+
2644
+ /* now look for '>' */
2645
+ c = TY_(ReadChar)(doc->docIn);
2646
+
2647
+
2648
+ if (c != '>')
2649
+ {
2650
+ TY_(UngetChar)(c, doc->docIn);
2651
+ continue;
2652
+ }
2653
+
2654
+ lexer->lexsize -= 1;
2655
+ lexer->txtend = lexer->lexsize;
2656
+ lexer->lexbuf[lexer->lexsize] = '\0';
2657
+ lexer->state = LEX_CONTENT;
2658
+ lexer->waswhite = no;
2659
+ return lexer->token = AspToken(doc);
2660
+
2661
+ case LEX_JSTE: /* seen <# so look for "#>" */
2662
+ if (c != '#')
2663
+ continue;
2664
+
2665
+ /* now look for '>' */
2666
+ c = TY_(ReadChar)(doc->docIn);
2667
+
2668
+
2669
+ if (c != '>')
2670
+ {
2671
+ TY_(UngetChar)(c, doc->docIn);
2672
+ continue;
2673
+ }
2674
+
2675
+ lexer->lexsize -= 1;
2676
+ lexer->txtend = lexer->lexsize;
2677
+ lexer->lexbuf[lexer->lexsize] = '\0';
2678
+ lexer->state = LEX_CONTENT;
2679
+ lexer->waswhite = no;
2680
+ return lexer->token = JsteToken(doc);
2681
+
2682
+ case LEX_PHP: /* seen "<?php" so look for "?>" */
2683
+ if (c != '?')
2684
+ continue;
2685
+
2686
+ /* now look for '>' */
2687
+ c = TY_(ReadChar)(doc->docIn);
2688
+
2689
+ if (c != '>')
2690
+ {
2691
+ TY_(UngetChar)(c, doc->docIn);
2692
+ continue;
2693
+ }
2694
+
2695
+ lexer->lexsize -= 1;
2696
+ lexer->txtend = lexer->lexsize;
2697
+ lexer->lexbuf[lexer->lexsize] = '\0';
2698
+ lexer->state = LEX_CONTENT;
2699
+ lexer->waswhite = no;
2700
+ return lexer->token = PhpToken(doc);
2701
+
2702
+ case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
2703
+
2704
+ if (TY_(IsWhite)(c) && c != '?')
2705
+ continue;
2706
+
2707
+ /* get pseudo-attribute */
2708
+ if (c != '?')
2709
+ {
2710
+ tmbstr name;
2711
+ Node *asp, *php;
2712
+ AttVal *av = NULL;
2713
+ int pdelim = 0;
2714
+ isempty = no;
2715
+
2716
+ TY_(UngetChar)(c, doc->docIn);
2717
+
2718
+ name = ParseAttribute( doc, &isempty, &asp, &php );
2719
+
2720
+ if (!name)
2721
+ {
2722
+ /* fix for http://tidy.sf.net/bug/788031 */
2723
+ lexer->lexsize -= 1;
2724
+ lexer->txtend = lexer->txtstart;
2725
+ lexer->lexbuf[lexer->txtend] = '\0';
2726
+ lexer->state = LEX_CONTENT;
2727
+ lexer->waswhite = no;
2728
+ lexer->token = XmlDeclToken(doc);
2729
+ lexer->token->attributes = attributes;
2730
+ return lexer->token;
2731
+ }
2732
+
2733
+ av = TY_(NewAttribute)(doc);
2734
+ av->attribute = name;
2735
+ av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
2736
+ av->delim = pdelim;
2737
+ av->dict = TY_(FindAttribute)( doc, av );
2738
+
2739
+ AddAttrToList( &attributes, av );
2740
+ /* continue; */
2741
+ }
2742
+
2743
+ /* now look for '>' */
2744
+ c = TY_(ReadChar)(doc->docIn);
2745
+
2746
+ if (c != '>')
2747
+ {
2748
+ TY_(UngetChar)(c, doc->docIn);
2749
+ continue;
2750
+ }
2751
+ lexer->lexsize -= 1;
2752
+ lexer->txtend = lexer->txtstart;
2753
+ lexer->lexbuf[lexer->txtend] = '\0';
2754
+ lexer->state = LEX_CONTENT;
2755
+ lexer->waswhite = no;
2756
+ lexer->token = XmlDeclToken(doc);
2757
+ lexer->token->attributes = attributes;
2758
+ return lexer->token;
2759
+
2760
+ case LEX_SECTION: /* seen "<![" so look for "]>" */
2761
+ if (c == '[')
2762
+ {
2763
+ if (lexer->lexsize == (lexer->txtstart + 6) &&
2764
+ TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
2765
+ {
2766
+ lexer->state = LEX_CDATA;
2767
+ lexer->lexsize -= 6;
2768
+ continue;
2769
+ }
2770
+ }
2771
+
2772
+ if (c != ']')
2773
+ continue;
2774
+
2775
+ /* now look for '>' */
2776
+ c = TY_(ReadChar)(doc->docIn);
2777
+
2778
+ if (c != '>')
2779
+ {
2780
+ TY_(UngetChar)(c, doc->docIn);
2781
+ continue;
2782
+ }
2783
+
2784
+ lexer->lexsize -= 1;
2785
+ lexer->txtend = lexer->lexsize;
2786
+ lexer->lexbuf[lexer->lexsize] = '\0';
2787
+ lexer->state = LEX_CONTENT;
2788
+ lexer->waswhite = no;
2789
+ return lexer->token = SectionToken(doc);
2790
+
2791
+ case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
2792
+ if (c != ']')
2793
+ continue;
2794
+
2795
+ /* now look for ']' */
2796
+ c = TY_(ReadChar)(doc->docIn);
2797
+
2798
+ if (c != ']')
2799
+ {
2800
+ TY_(UngetChar)(c, doc->docIn);
2801
+ continue;
2802
+ }
2803
+
2804
+ /* now look for '>' */
2805
+ c = TY_(ReadChar)(doc->docIn);
2806
+
2807
+ if (c != '>')
2808
+ {
2809
+ TY_(UngetChar)(c, doc->docIn);
2810
+ TY_(UngetChar)(']', doc->docIn);
2811
+ continue;
2812
+ }
2813
+
2814
+ lexer->lexsize -= 1;
2815
+ lexer->txtend = lexer->lexsize;
2816
+ lexer->lexbuf[lexer->lexsize] = '\0';
2817
+ lexer->state = LEX_CONTENT;
2818
+ lexer->waswhite = no;
2819
+ return lexer->token = CDATAToken(doc);
2820
+ }
2821
+ }
2822
+
2823
+ if (lexer->state == LEX_CONTENT) /* text string */
2824
+ {
2825
+ lexer->txtend = lexer->lexsize;
2826
+
2827
+ if (lexer->txtend > lexer->txtstart)
2828
+ {
2829
+ TY_(UngetChar)(c, doc->docIn);
2830
+
2831
+ if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
2832
+ {
2833
+ lexer->lexsize -= 1;
2834
+ lexer->txtend = lexer->lexsize;
2835
+ }
2836
+ lexer->token = TY_(TextToken)(lexer);
2837
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
2838
+ StoreOriginalTextInToken(doc, lexer->token, 0); /* ? */
2839
+ #endif
2840
+ return lexer->token;
2841
+ }
2842
+ }
2843
+ else if (lexer->state == LEX_COMMENT) /* comment */
2844
+ {
2845
+ if (c == EndOfStream)
2846
+ TY_(ReportError)(doc, NULL, NULL, MALFORMED_COMMENT );
2847
+
2848
+ lexer->txtend = lexer->lexsize;
2849
+ lexer->lexbuf[lexer->lexsize] = '\0';
2850
+ lexer->state = LEX_CONTENT;
2851
+ lexer->waswhite = no;
2852
+ return lexer->token = CommentToken(doc);
2853
+ }
2854
+
2855
+ return NULL;
2856
+ }
2857
+
2858
+ static void MapStr( ctmbstr str, uint code )
2859
+ {
2860
+ while ( *str )
2861
+ {
2862
+ uint i = (byte) *str++;
2863
+ lexmap[i] |= code;
2864
+ }
2865
+ }
2866
+
2867
+ void TY_(InitMap)(void)
2868
+ {
2869
+ MapStr("\r\n\f", newline|white);
2870
+ MapStr(" \t", white);
2871
+ MapStr("-.:_", namechar);
2872
+ MapStr("0123456789", digit|digithex|namechar);
2873
+ MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
2874
+ MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
2875
+ MapStr("abcdefABCDEF", digithex);
2876
+ }
2877
+
2878
+ /*
2879
+ parser for ASP within start tags
2880
+
2881
+ Some people use ASP for to customize attributes
2882
+ Tidy isn't really well suited to dealing with ASP
2883
+ This is a workaround for attributes, but won't
2884
+ deal with the case where the ASP is used to tailor
2885
+ the attribute value. Here is an example of a work
2886
+ around for using ASP in attribute values:
2887
+
2888
+ href='<%=rsSchool.Fields("ID").Value%>'
2889
+
2890
+ where the ASP that generates the attribute value
2891
+ is masked from Tidy by the quotemarks.
2892
+
2893
+ */
2894
+
2895
+ static Node *ParseAsp( TidyDocImpl* doc )
2896
+ {
2897
+ Lexer* lexer = doc->lexer;
2898
+ uint c;
2899
+ Node *asp = NULL;
2900
+
2901
+ lexer->txtstart = lexer->lexsize;
2902
+
2903
+ for (;;)
2904
+ {
2905
+ if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2906
+ break;
2907
+
2908
+ TY_(AddCharToLexer)(lexer, c);
2909
+
2910
+
2911
+ if (c != '%')
2912
+ continue;
2913
+
2914
+ if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2915
+ break;
2916
+
2917
+ TY_(AddCharToLexer)(lexer, c);
2918
+
2919
+ if (c == '>')
2920
+ {
2921
+ lexer->lexsize -= 2;
2922
+ break;
2923
+ }
2924
+ }
2925
+
2926
+ lexer->txtend = lexer->lexsize;
2927
+ if (lexer->txtend > lexer->txtstart)
2928
+ asp = AspToken(doc);
2929
+
2930
+ lexer->txtstart = lexer->txtend;
2931
+ return asp;
2932
+ }
2933
+
2934
+
2935
+ /*
2936
+ PHP is like ASP but is based upon XML
2937
+ processing instructions, e.g. <?php ... ?>
2938
+ */
2939
+ static Node *ParsePhp( TidyDocImpl* doc )
2940
+ {
2941
+ Lexer* lexer = doc->lexer;
2942
+ uint c;
2943
+ Node *php = NULL;
2944
+
2945
+ lexer->txtstart = lexer->lexsize;
2946
+
2947
+ for (;;)
2948
+ {
2949
+ if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2950
+ break;
2951
+
2952
+ TY_(AddCharToLexer)(lexer, c);
2953
+
2954
+
2955
+ if (c != '?')
2956
+ continue;
2957
+
2958
+ if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2959
+ break;
2960
+
2961
+ TY_(AddCharToLexer)(lexer, c);
2962
+
2963
+ if (c == '>')
2964
+ {
2965
+ lexer->lexsize -= 2;
2966
+ break;
2967
+ }
2968
+ }
2969
+
2970
+ lexer->txtend = lexer->lexsize;
2971
+ if (lexer->txtend > lexer->txtstart)
2972
+ php = PhpToken(doc);
2973
+
2974
+ lexer->txtstart = lexer->txtend;
2975
+ return php;
2976
+ }
2977
+
2978
+ /* consumes the '>' terminating start tags */
2979
+ static tmbstr ParseAttribute( TidyDocImpl* doc, Bool *isempty,
2980
+ Node **asp, Node **php)
2981
+ {
2982
+ Lexer* lexer = doc->lexer;
2983
+ int start, len = 0;
2984
+ tmbstr attr = NULL;
2985
+ uint c, lastc;
2986
+
2987
+ *asp = NULL; /* clear asp pointer */
2988
+ *php = NULL; /* clear php pointer */
2989
+
2990
+ /* skip white space before the attribute */
2991
+
2992
+ for (;;)
2993
+ {
2994
+ c = TY_(ReadChar)( doc->docIn );
2995
+
2996
+
2997
+ if (c == '/')
2998
+ {
2999
+ c = TY_(ReadChar)( doc->docIn );
3000
+
3001
+ if (c == '>')
3002
+ {
3003
+ *isempty = yes;
3004
+ return NULL;
3005
+ }
3006
+
3007
+ TY_(UngetChar)(c, doc->docIn);
3008
+ c = '/';
3009
+ break;
3010
+ }
3011
+
3012
+ if (c == '>')
3013
+ return NULL;
3014
+
3015
+ if (c =='<')
3016
+ {
3017
+ c = TY_(ReadChar)(doc->docIn);
3018
+
3019
+ if (c == '%')
3020
+ {
3021
+ *asp = ParseAsp( doc );
3022
+ return NULL;
3023
+ }
3024
+ else if (c == '?')
3025
+ {
3026
+ *php = ParsePhp( doc );
3027
+ return NULL;
3028
+ }
3029
+
3030
+ TY_(UngetChar)(c, doc->docIn);
3031
+ TY_(UngetChar)('<', doc->docIn);
3032
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3033
+ return NULL;
3034
+ }
3035
+
3036
+ if (c == '=')
3037
+ {
3038
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
3039
+ continue;
3040
+ }
3041
+
3042
+ if (c == '"' || c == '\'')
3043
+ {
3044
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3045
+ continue;
3046
+ }
3047
+
3048
+ if (c == EndOfStream)
3049
+ {
3050
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3051
+ TY_(UngetChar)(c, doc->docIn);
3052
+ return NULL;
3053
+ }
3054
+
3055
+
3056
+ if (!TY_(IsWhite)(c))
3057
+ break;
3058
+ }
3059
+
3060
+ start = lexer->lexsize;
3061
+ lastc = c;
3062
+
3063
+ for (;;)
3064
+ {
3065
+ /* but push back '=' for parseValue() */
3066
+ if (c == '=' || c == '>')
3067
+ {
3068
+ TY_(UngetChar)(c, doc->docIn);
3069
+ break;
3070
+ }
3071
+
3072
+ if (c == '<' || c == EndOfStream)
3073
+ {
3074
+ TY_(UngetChar)(c, doc->docIn);
3075
+ break;
3076
+ }
3077
+
3078
+ if (lastc == '-' && (c == '"' || c == '\''))
3079
+ {
3080
+ lexer->lexsize--;
3081
+ --len;
3082
+ TY_(UngetChar)(c, doc->docIn);
3083
+ break;
3084
+ }
3085
+
3086
+ if (TY_(IsWhite)(c))
3087
+ break;
3088
+
3089
+ /* what should be done about non-namechar characters? */
3090
+ /* currently these are incorporated into the attr name */
3091
+
3092
+ if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
3093
+ c = TY_(ToLower)(c);
3094
+
3095
+ TY_(AddCharToLexer)( lexer, c );
3096
+ lastc = c;
3097
+ c = TY_(ReadChar)(doc->docIn);
3098
+ }
3099
+
3100
+ /* handle attribute names with multibyte chars */
3101
+ len = lexer->lexsize - start;
3102
+ attr = (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3103
+ lexer->lexbuf+start, len) : NULL);
3104
+ lexer->lexsize = start;
3105
+ return attr;
3106
+ }
3107
+
3108
+ /*
3109
+ invoked when < is seen in place of attribute value
3110
+ but terminates on whitespace if not ASP, PHP or Tango
3111
+ this routine recognizes ' and " quoted strings
3112
+ */
3113
+ static int ParseServerInstruction( TidyDocImpl* doc )
3114
+ {
3115
+ Lexer* lexer = doc->lexer;
3116
+ uint c;
3117
+ int delim = '"';
3118
+ Bool isrule = no;
3119
+
3120
+ c = TY_(ReadChar)(doc->docIn);
3121
+ TY_(AddCharToLexer)(lexer, c);
3122
+
3123
+ /* check for ASP, PHP or Tango */
3124
+ if (c == '%' || c == '?' || c == '@')
3125
+ isrule = yes;
3126
+
3127
+ for (;;)
3128
+ {
3129
+ c = TY_(ReadChar)(doc->docIn);
3130
+
3131
+ if (c == EndOfStream)
3132
+ break;
3133
+
3134
+ if (c == '>')
3135
+ {
3136
+ if (isrule)
3137
+ TY_(AddCharToLexer)(lexer, c);
3138
+ else
3139
+ TY_(UngetChar)(c, doc->docIn);
3140
+
3141
+ break;
3142
+ }
3143
+
3144
+ /* if not recognized as ASP, PHP or Tango */
3145
+ /* then also finish value on whitespace */
3146
+ if (!isrule)
3147
+ {
3148
+ if (TY_(IsWhite)(c))
3149
+ break;
3150
+ }
3151
+
3152
+ TY_(AddCharToLexer)(lexer, c);
3153
+
3154
+ if (c == '"')
3155
+ {
3156
+ do
3157
+ {
3158
+ c = TY_(ReadChar)(doc->docIn);
3159
+ if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3160
+ {
3161
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3162
+ TY_(UngetChar)(c, doc->docIn);
3163
+ return 0;
3164
+ }
3165
+ if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3166
+ {
3167
+ TY_(UngetChar)(c, doc->docIn);
3168
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3169
+ return 0;
3170
+ }
3171
+ TY_(AddCharToLexer)(lexer, c);
3172
+ }
3173
+ while (c != '"');
3174
+ delim = '\'';
3175
+ continue;
3176
+ }
3177
+
3178
+ if (c == '\'')
3179
+ {
3180
+ do
3181
+ {
3182
+ c = TY_(ReadChar)(doc->docIn);
3183
+ if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3184
+ {
3185
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3186
+ TY_(UngetChar)(c, doc->docIn);
3187
+ return 0;
3188
+ }
3189
+ if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3190
+ {
3191
+ TY_(UngetChar)(c, doc->docIn);
3192
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3193
+ return 0;
3194
+ }
3195
+ TY_(AddCharToLexer)(lexer, c);
3196
+ }
3197
+ while (c != '\'');
3198
+ }
3199
+ }
3200
+
3201
+ return delim;
3202
+ }
3203
+
3204
+ /* values start with "=" or " = " etc. */
3205
+ /* doesn't consume the ">" at end of start tag */
3206
+
3207
+ static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3208
+ Bool foldCase, Bool *isempty, int *pdelim)
3209
+ {
3210
+ Lexer* lexer = doc->lexer;
3211
+ int len = 0, start;
3212
+ Bool seen_gt = no;
3213
+ Bool munge = yes;
3214
+ uint c, lastc, delim, quotewarning;
3215
+ tmbstr value;
3216
+
3217
+ delim = (tmbchar) 0;
3218
+ *pdelim = '"';
3219
+
3220
+ /*
3221
+ Henry Zrepa reports that some folk are using the
3222
+ embed element with script attributes where newlines
3223
+ are significant and must be preserved
3224
+ */
3225
+ if ( cfgBool(doc, TidyLiteralAttribs) )
3226
+ munge = no;
3227
+
3228
+ /* skip white space before the '=' */
3229
+
3230
+ for (;;)
3231
+ {
3232
+ c = TY_(ReadChar)(doc->docIn);
3233
+
3234
+ if (c == EndOfStream)
3235
+ {
3236
+ TY_(UngetChar)(c, doc->docIn);
3237
+ break;
3238
+ }
3239
+
3240
+ if (!TY_(IsWhite)(c))
3241
+ break;
3242
+ }
3243
+
3244
+ /*
3245
+ c should be '=' if there is a value
3246
+ other legal possibilities are white
3247
+ space, '/' and '>'
3248
+ */
3249
+
3250
+ if (c != '=' && c != '"' && c != '\'')
3251
+ {
3252
+ TY_(UngetChar)(c, doc->docIn);
3253
+ return NULL;
3254
+ }
3255
+
3256
+ /* skip white space after '=' */
3257
+
3258
+ for (;;)
3259
+ {
3260
+ c = TY_(ReadChar)(doc->docIn);
3261
+
3262
+ if (c == EndOfStream)
3263
+ {
3264
+ TY_(UngetChar)(c, doc->docIn);
3265
+ break;
3266
+ }
3267
+
3268
+ if (!TY_(IsWhite)(c))
3269
+ break;
3270
+ }
3271
+
3272
+ /* check for quote marks */
3273
+
3274
+ if (c == '"' || c == '\'')
3275
+ delim = c;
3276
+ else if (c == '<')
3277
+ {
3278
+ start = lexer->lexsize;
3279
+ TY_(AddCharToLexer)(lexer, c);
3280
+ *pdelim = ParseServerInstruction( doc );
3281
+ len = lexer->lexsize - start;
3282
+ lexer->lexsize = start;
3283
+ return (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3284
+ lexer->lexbuf+start, len) : NULL);
3285
+ }
3286
+ else
3287
+ TY_(UngetChar)(c, doc->docIn);
3288
+
3289
+ /*
3290
+ and read the value string
3291
+ check for quote mark if needed
3292
+ */
3293
+
3294
+ quotewarning = 0;
3295
+ start = lexer->lexsize;
3296
+ c = '\0';
3297
+
3298
+ for (;;)
3299
+ {
3300
+ lastc = c; /* track last character */
3301
+ c = TY_(ReadChar)(doc->docIn);
3302
+
3303
+ if (c == EndOfStream)
3304
+ {
3305
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3306
+ TY_(UngetChar)(c, doc->docIn);
3307
+ break;
3308
+ }
3309
+
3310
+ if (delim == (tmbchar)0)
3311
+ {
3312
+ if (c == '>')
3313
+ {
3314
+ TY_(UngetChar)(c, doc->docIn);
3315
+ break;
3316
+ }
3317
+
3318
+ if (c == '"' || c == '\'')
3319
+ {
3320
+ uint q = c;
3321
+
3322
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3323
+
3324
+ /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3325
+ /* this doesn't handle <a title=foo"/> which browsers treat as */
3326
+ /* 'foo"/' nor <a title=foo" /> which browser treat as 'foo"' */
3327
+
3328
+ c = TY_(ReadChar)(doc->docIn);
3329
+ if (c == '>')
3330
+ {
3331
+ TY_(AddCharToLexer)(lexer, q);
3332
+ TY_(UngetChar)(c, doc->docIn);
3333
+ break;
3334
+ }
3335
+ else
3336
+ {
3337
+ TY_(UngetChar)(c, doc->docIn);
3338
+ c = q;
3339
+ }
3340
+ }
3341
+
3342
+ if (c == '<')
3343
+ {
3344
+ TY_(UngetChar)(c, doc->docIn);
3345
+ c = '>';
3346
+ TY_(UngetChar)(c, doc->docIn);
3347
+ TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3348
+ break;
3349
+ }
3350
+
3351
+ /*
3352
+ For cases like <br clear=all/> need to avoid treating /> as
3353
+ part of the attribute value, however care is needed to avoid
3354
+ so treating <a href=http://www.acme.com/> in this way, which
3355
+ would map the <a> tag to <a href="http://www.acme.com"/>
3356
+ */
3357
+ if (c == '/')
3358
+ {
3359
+ /* peek ahead in case of /> */
3360
+ c = TY_(ReadChar)(doc->docIn);
3361
+
3362
+ if ( c == '>' && !TY_(IsUrl)(doc, name) )
3363
+ {
3364
+ *isempty = yes;
3365
+ TY_(UngetChar)(c, doc->docIn);
3366
+ break;
3367
+ }
3368
+
3369
+ /* unget peeked character */
3370
+ TY_(UngetChar)(c, doc->docIn);
3371
+ c = '/';
3372
+ }
3373
+ }
3374
+ else /* delim is '\'' or '"' */
3375
+ {
3376
+ if (c == delim)
3377
+ break;
3378
+
3379
+ if (c == '\n' || c == '<' || c == '>')
3380
+ ++quotewarning;
3381
+
3382
+ if (c == '>')
3383
+ seen_gt = yes;
3384
+ }
3385
+
3386
+ if (c == '&')
3387
+ {
3388
+ TY_(AddCharToLexer)(lexer, c);
3389
+ ParseEntity( doc, IgnoreWhitespace );
3390
+ if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
3391
+ ChangeChar(lexer, ' ');
3392
+ continue;
3393
+ }
3394
+
3395
+ /*
3396
+ kludge for JavaScript attribute values
3397
+ with line continuations in string literals
3398
+ */
3399
+ if (c == '\\')
3400
+ {
3401
+ c = TY_(ReadChar)(doc->docIn);
3402
+
3403
+ if (c != '\n')
3404
+ {
3405
+ TY_(UngetChar)(c, doc->docIn);
3406
+ c = '\\';
3407
+ }
3408
+ }
3409
+
3410
+ if (TY_(IsWhite)(c))
3411
+ {
3412
+ if ( delim == 0 )
3413
+ break;
3414
+
3415
+ if (munge)
3416
+ {
3417
+ /* discard line breaks in quoted URLs */
3418
+ /* #438650 - fix by Randy Waki */
3419
+ if ( c == '\n' && TY_(IsUrl)(doc, name) )
3420
+ {
3421
+ /* warn that we discard this newline */
3422
+ TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI);
3423
+ continue;
3424
+ }
3425
+
3426
+ c = ' ';
3427
+
3428
+ if (lastc == ' ')
3429
+ {
3430
+ if (TY_(IsUrl)(doc, name) )
3431
+ TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI);
3432
+ continue;
3433
+ }
3434
+ }
3435
+ }
3436
+ else if (foldCase && TY_(IsUpper)(c))
3437
+ c = TY_(ToLower)(c);
3438
+
3439
+ TY_(AddCharToLexer)(lexer, c);
3440
+ }
3441
+
3442
+ if (quotewarning > 10 && seen_gt && munge)
3443
+ {
3444
+ /*
3445
+ there is almost certainly a missing trailing quote mark
3446
+ as we have seen too many newlines, < or > characters.
3447
+
3448
+ an exception is made for Javascript attributes and the
3449
+ javascript URL scheme which may legitimately include < and >,
3450
+ and for attributes starting with "<xml " as generated by
3451
+ Microsoft Office.
3452
+ */
3453
+ if ( !TY_(IsScript)(doc, name) &&
3454
+ !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) &&
3455
+ !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0)
3456
+ )
3457
+ TY_(ReportFatal)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE );
3458
+ }
3459
+
3460
+ len = lexer->lexsize - start;
3461
+ lexer->lexsize = start;
3462
+
3463
+
3464
+ if (len > 0 || delim)
3465
+ {
3466
+ /* ignore leading and trailing white space for all but title, alt, value */
3467
+ /* and prompts attributes unless --literal-attributes is set to yes */
3468
+ /* #994841 - Whitespace is removed from value attributes */
3469
+
3470
+ if (munge &&
3471
+ TY_(tmbstrcasecmp)(name, "alt") &&
3472
+ TY_(tmbstrcasecmp)(name, "title") &&
3473
+ TY_(tmbstrcasecmp)(name, "value") &&
3474
+ TY_(tmbstrcasecmp)(name, "prompt"))
3475
+ {
3476
+ while (TY_(IsWhite)(lexer->lexbuf[start+len-1]))
3477
+ --len;
3478
+
3479
+ while (TY_(IsWhite)(lexer->lexbuf[start]) && start < len)
3480
+ {
3481
+ ++start;
3482
+ --len;
3483
+ }
3484
+ }
3485
+
3486
+ value = TY_(tmbstrndup)(doc->allocator, lexer->lexbuf + start, len);
3487
+ }
3488
+ else
3489
+ value = NULL;
3490
+
3491
+ /* note delimiter if given */
3492
+ *pdelim = (delim ? delim : '"');
3493
+
3494
+ return value;
3495
+ }
3496
+
3497
+ /* attr must be non-NULL */
3498
+ static Bool IsValidAttrName( ctmbstr attr )
3499
+ {
3500
+ uint i, c = attr[0];
3501
+
3502
+ /* first character should be a letter */
3503
+ if (!TY_(IsLetter)(c))
3504
+ return no;
3505
+
3506
+ /* remaining characters should be namechars */
3507
+ for( i = 1; i < TY_(tmbstrlen)(attr); i++)
3508
+ {
3509
+ c = attr[i];
3510
+
3511
+ if (TY_(IsNamechar)(c))
3512
+ continue;
3513
+
3514
+ return no;
3515
+ }
3516
+
3517
+ return yes;
3518
+ }
3519
+
3520
+ /* create a new attribute */
3521
+ AttVal *TY_(NewAttribute)( TidyDocImpl* doc )
3522
+ {
3523
+ AttVal *av = (AttVal*) TidyDocAlloc( doc, sizeof(AttVal) );
3524
+ TidyClearMemory( av, sizeof(AttVal) );
3525
+ return av;
3526
+ }
3527
+
3528
+ /* create a new attribute with given name and value */
3529
+ AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
3530
+ int delim )
3531
+ {
3532
+ AttVal *av = TY_(NewAttribute)(doc);
3533
+ av->attribute = TY_(tmbstrdup)(doc->allocator, name);
3534
+ av->value = TY_(tmbstrdup)(doc->allocator, value);
3535
+ av->delim = delim;
3536
+ av->dict = TY_(FindAttribute)( doc, av );
3537
+ return av;
3538
+ }
3539
+
3540
+ static void AddAttrToList( AttVal** list, AttVal* av )
3541
+ {
3542
+ if ( *list == NULL )
3543
+ *list = av;
3544
+ else
3545
+ {
3546
+ AttVal* here = *list;
3547
+ while ( here->next )
3548
+ here = here->next;
3549
+ here->next = av;
3550
+ }
3551
+ }
3552
+
3553
+ void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av )
3554
+ {
3555
+ AddAttrToList(&node->attributes, av);
3556
+ }
3557
+
3558
+ void TY_(InsertAttributeAtStart)( Node *node, AttVal *av )
3559
+ {
3560
+ av->next = node->attributes;
3561
+ node->attributes = av;
3562
+ }
3563
+
3564
+ /* swallows closing '>' */
3565
+
3566
+ static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
3567
+ {
3568
+ Lexer* lexer = doc->lexer;
3569
+ AttVal *av, *list;
3570
+ tmbstr value;
3571
+ int delim;
3572
+ Node *asp, *php;
3573
+
3574
+ list = NULL;
3575
+
3576
+ while ( !EndOfInput(doc) )
3577
+ {
3578
+ tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
3579
+
3580
+ if (attribute == NULL)
3581
+ {
3582
+ /* check if attributes are created by ASP markup */
3583
+ if (asp)
3584
+ {
3585
+ av = TY_(NewAttribute)(doc);
3586
+ av->asp = asp;
3587
+ AddAttrToList( &list, av );
3588
+ continue;
3589
+ }
3590
+
3591
+ /* check if attributes are created by PHP markup */
3592
+ if (php)
3593
+ {
3594
+ av = TY_(NewAttribute)(doc);
3595
+ av->php = php;
3596
+ AddAttrToList( &list, av );
3597
+ continue;
3598
+ }
3599
+
3600
+ break;
3601
+ }
3602
+
3603
+ value = ParseValue( doc, attribute, no, isempty, &delim );
3604
+
3605
+ if (attribute && (IsValidAttrName(attribute) ||
3606
+ (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
3607
+ {
3608
+ av = TY_(NewAttribute)(doc);
3609
+ av->delim = delim;
3610
+ av->attribute = attribute;
3611
+ av->value = value;
3612
+ av->dict = TY_(FindAttribute)( doc, av );
3613
+ AddAttrToList( &list, av );
3614
+ }
3615
+ else
3616
+ {
3617
+ av = TY_(NewAttribute)(doc);
3618
+ av->attribute = attribute;
3619
+ av->value = value;
3620
+
3621
+ if (LastChar(attribute) == '"')
3622
+ TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
3623
+ else if (value == NULL)
3624
+ TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
3625
+ else
3626
+ TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);
3627
+
3628
+ TY_(FreeAttribute)( doc, av );
3629
+ }
3630
+ }
3631
+
3632
+ return list;
3633
+ }
3634
+
3635
+ /*
3636
+ Returns document type declarations like
3637
+
3638
+ <!DOCTYPE foo PUBLIC "fpi" "sysid">
3639
+ <!DOCTYPE bar SYSTEM "sysid">
3640
+ <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>
3641
+
3642
+ as
3643
+
3644
+ <foo PUBLIC="fpi" SYSTEM="sysid" />
3645
+ <bar SYSTEM="sysid" />
3646
+ <baz> &lt;!ENTITY ouml &quot;&amp;#246&quot;&gt; </baz>
3647
+ */
3648
+ static Node *ParseDocTypeDecl(TidyDocImpl* doc)
3649
+ {
3650
+ Lexer *lexer = doc->lexer;
3651
+ int start = lexer->lexsize;
3652
+ ParseDocTypeDeclState state = DT_DOCTYPENAME;
3653
+ uint c;
3654
+ uint delim = 0;
3655
+ Bool hasfpi = yes;
3656
+
3657
+ Node* node = TY_(NewNode)(lexer->allocator, lexer);
3658
+ node->type = DocTypeTag;
3659
+ node->start = lexer->txtstart;
3660
+ node->end = lexer->txtend;
3661
+
3662
+ lexer->waswhite = no;
3663
+
3664
+ /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */
3665
+
3666
+ while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
3667
+ {
3668
+ /* convert newlines to spaces */
3669
+ if (state != DT_INTSUBSET)
3670
+ c = c == '\n' ? ' ' : c;
3671
+
3672
+ /* convert white-space sequences to single space character */
3673
+ if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
3674
+ {
3675
+ if (!lexer->waswhite)
3676
+ {
3677
+ TY_(AddCharToLexer)(lexer, c);
3678
+ lexer->waswhite = yes;
3679
+ }
3680
+ else
3681
+ {
3682
+ /* discard space */
3683
+ continue;
3684
+ }
3685
+ }
3686
+ else
3687
+ {
3688
+ TY_(AddCharToLexer)(lexer, c);
3689
+ lexer->waswhite = no;
3690
+ }
3691
+
3692
+ switch(state)
3693
+ {
3694
+ case DT_INTERMEDIATE:
3695
+ /* determine what's next */
3696
+ if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
3697
+ {
3698
+ start = lexer->lexsize - 1;
3699
+ state = DT_PUBLICSYSTEM;
3700
+ continue;
3701
+ }
3702
+ else if (c == '[')
3703
+ {
3704
+ start = lexer->lexsize;
3705
+ state = DT_INTSUBSET;
3706
+ continue;
3707
+ }
3708
+ else if (c == '\'' || c == '"')
3709
+ {
3710
+ start = lexer->lexsize;
3711
+ delim = c;
3712
+ state = DT_QUOTEDSTRING;
3713
+ continue;
3714
+ }
3715
+ else if (c == '>')
3716
+ {
3717
+ AttVal* si;
3718
+
3719
+ node->end = --(lexer->lexsize);
3720
+
3721
+ si = TY_(GetAttrByName)(node, "SYSTEM");
3722
+ if (si)
3723
+ TY_(CheckUrl)(doc, node, si);
3724
+
3725
+ if (!node->element || !IsValidXMLElemName(node->element))
3726
+ {
3727
+ TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
3728
+ TY_(FreeNode)(doc, node);
3729
+ return NULL;
3730
+ }
3731
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
3732
+ StoreOriginalTextInToken(doc, node, 0);
3733
+ #endif
3734
+ return node;
3735
+ }
3736
+ else
3737
+ {
3738
+ /* error */
3739
+ }
3740
+ break;
3741
+ case DT_DOCTYPENAME:
3742
+ /* read document type name */
3743
+ if (TY_(IsWhite)(c) || c == '>' || c == '[')
3744
+ {
3745
+ node->element = TY_(tmbstrndup)(doc->allocator,
3746
+ lexer->lexbuf + start,
3747
+ lexer->lexsize - start - 1);
3748
+ if (c == '>' || c == '[')
3749
+ {
3750
+ --(lexer->lexsize);
3751
+ TY_(UngetChar)(c, doc->docIn);
3752
+ }
3753
+
3754
+ state = DT_INTERMEDIATE;
3755
+ continue;
3756
+ }
3757
+ break;
3758
+ case DT_PUBLICSYSTEM:
3759
+ /* read PUBLIC/SYSTEM */
3760
+ if (TY_(IsWhite)(c) || c == '>')
3761
+ {
3762
+ char *attname = TY_(tmbstrndup)(doc->allocator,
3763
+ lexer->lexbuf + start,
3764
+ lexer->lexsize - start - 1);
3765
+ hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);
3766
+
3767
+ TidyDocFree(doc, attname);
3768
+
3769
+ /* todo: report an error if SYSTEM/PUBLIC not uppercase */
3770
+
3771
+ if (c == '>')
3772
+ {
3773
+ --(lexer->lexsize);
3774
+ TY_(UngetChar)(c, doc->docIn);
3775
+ }
3776
+
3777
+ state = DT_INTERMEDIATE;
3778
+ continue;
3779
+ }
3780
+ break;
3781
+ case DT_QUOTEDSTRING:
3782
+ /* read quoted string */
3783
+ if (c == delim)
3784
+ {
3785
+ char *value = TY_(tmbstrndup)(doc->allocator,
3786
+ lexer->lexbuf + start,
3787
+ lexer->lexsize - start - 1);
3788
+ AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
3789
+ TidyDocFree(doc, value);
3790
+ att->delim = delim;
3791
+ hasfpi = no;
3792
+ state = DT_INTERMEDIATE;
3793
+ delim = 0;
3794
+ continue;
3795
+ }
3796
+ break;
3797
+ case DT_INTSUBSET:
3798
+ /* read internal subset */
3799
+ if (c == ']')
3800
+ {
3801
+ Node* subset;
3802
+ lexer->txtstart = start;
3803
+ lexer->txtend = lexer->lexsize - 1;
3804
+ subset = TY_(TextToken)(lexer);
3805
+ TY_(InsertNodeAtEnd)(node, subset);
3806
+ state = DT_INTERMEDIATE;
3807
+ }
3808
+ break;
3809
+ }
3810
+ }
3811
+
3812
+ /* document type declaration not finished */
3813
+ TY_(ReportError)(doc, NULL, NULL, MALFORMED_DOCTYPE);
3814
+ TY_(FreeNode)(doc, node);
3815
+ return NULL;
3816
+ }
3817
+
3818
+ /*
3819
+ * local variables:
3820
+ * mode: c
3821
+ * indent-tabs-mode: nil
3822
+ * c-basic-offset: 4
3823
+ * eval: (c-set-offset 'substatement-open 0)
3824
+ * end:
3825
+ */