tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/parser.c ADDED
@@ -0,0 +1,4408 @@
1
+ /* parser.c -- HTML Parser
2
+
3
+ (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: krusch $
9
+ $Date: 2009/10/27 19:27:49 $
10
+ $Revision: 1.188 $
11
+
12
+ */
13
+
14
+ #include "tidy-int.h"
15
+ #include "lexer.h"
16
+ #include "parser.h"
17
+ #include "message.h"
18
+ #include "clean.h"
19
+ #include "tags.h"
20
+ #include "tmbstr.h"
21
+
22
+ #ifdef AUTO_INPUT_ENCODING
23
+ #include "charsets.h"
24
+ #endif
25
+
26
+ Bool TY_(CheckNodeIntegrity)(Node *node)
27
+ {
28
+ #ifndef NO_NODE_INTEGRITY_CHECK
29
+ Node *child;
30
+
31
+ if (node->prev)
32
+ {
33
+ if (node->prev->next != node)
34
+ return no;
35
+ }
36
+
37
+ if (node->next)
38
+ {
39
+ if (node->next == node || node->next->prev != node)
40
+ return no;
41
+ }
42
+
43
+ if (node->parent)
44
+ {
45
+ if (node->prev == NULL && node->parent->content != node)
46
+ return no;
47
+
48
+ if (node->next == NULL && node->parent->last != node)
49
+ return no;
50
+ }
51
+
52
+ for (child = node->content; child; child = child->next)
53
+ if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
54
+ return no;
55
+
56
+ #endif
57
+ return yes;
58
+ }
59
+
60
+ /*
61
+ used to determine how attributes
62
+ without values should be printed
63
+ this was introduced to deal with
64
+ user defined tags e.g. Cold Fusion
65
+ */
66
+ Bool TY_(IsNewNode)(Node *node)
67
+ {
68
+ if (node && node->tag)
69
+ {
70
+ return (node->tag->model & CM_NEW);
71
+ }
72
+ return yes;
73
+ }
74
+
75
+ void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
76
+ {
77
+ const Dict* tag = TY_(LookupTagDef)(tid);
78
+ Node* tmp = TY_(InferredTag)(doc, tag->id);
79
+
80
+ if (obsolete)
81
+ TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT);
82
+ else if (unexpected)
83
+ TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
84
+ else
85
+ TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT);
86
+
87
+ TidyDocFree(doc, tmp->element);
88
+ TidyDocFree(doc, tmp);
89
+
90
+ node->was = node->tag;
91
+ node->tag = tag;
92
+ node->type = StartTag;
93
+ node->implicit = yes;
94
+ TidyDocFree(doc, node->element);
95
+ node->element = TY_(tmbstrdup)(doc->allocator, tag->name);
96
+ }
97
+
98
+ /* extract a node and its children from a markup tree */
99
+ Node *TY_(RemoveNode)(Node *node)
100
+ {
101
+ if (node->prev)
102
+ node->prev->next = node->next;
103
+
104
+ if (node->next)
105
+ node->next->prev = node->prev;
106
+
107
+ if (node->parent)
108
+ {
109
+ if (node->parent->content == node)
110
+ node->parent->content = node->next;
111
+
112
+ if (node->parent->last == node)
113
+ node->parent->last = node->prev;
114
+ }
115
+
116
+ node->parent = node->prev = node->next = NULL;
117
+ return node;
118
+ }
119
+
120
+ /* remove node from markup tree and discard it */
121
+ Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
122
+ {
123
+ Node *next = NULL;
124
+
125
+ if (element)
126
+ {
127
+ next = element->next;
128
+ TY_(RemoveNode)(element);
129
+ TY_(FreeNode)( doc, element);
130
+ }
131
+
132
+ return next;
133
+ }
134
+
135
+ /*
136
+ insert "node" into markup tree as the first element
137
+ of content of "element"
138
+ */
139
+ void TY_(InsertNodeAtStart)(Node *element, Node *node)
140
+ {
141
+ node->parent = element;
142
+
143
+ if (element->content == NULL)
144
+ element->last = node;
145
+ else
146
+ element->content->prev = node;
147
+
148
+ node->next = element->content;
149
+ node->prev = NULL;
150
+ element->content = node;
151
+ }
152
+
153
+ /*
154
+ insert "node" into markup tree as the last element
155
+ of content of "element"
156
+ */
157
+ void TY_(InsertNodeAtEnd)(Node *element, Node *node)
158
+ {
159
+ node->parent = element;
160
+ node->prev = element->last;
161
+
162
+ if (element->last != NULL)
163
+ element->last->next = node;
164
+ else
165
+ element->content = node;
166
+
167
+ element->last = node;
168
+ }
169
+
170
+ /*
171
+ insert "node" into markup tree in place of "element"
172
+ which is moved to become the child of the node
173
+ */
174
+ static void InsertNodeAsParent(Node *element, Node *node)
175
+ {
176
+ node->content = element;
177
+ node->last = element;
178
+ node->parent = element->parent;
179
+ element->parent = node;
180
+
181
+ if (node->parent->content == element)
182
+ node->parent->content = node;
183
+
184
+ if (node->parent->last == element)
185
+ node->parent->last = node;
186
+
187
+ node->prev = element->prev;
188
+ element->prev = NULL;
189
+
190
+ if (node->prev)
191
+ node->prev->next = node;
192
+
193
+ node->next = element->next;
194
+ element->next = NULL;
195
+
196
+ if (node->next)
197
+ node->next->prev = node;
198
+ }
199
+
200
+ /* insert "node" into markup tree before "element" */
201
+ void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
202
+ {
203
+ Node *parent;
204
+
205
+ parent = element->parent;
206
+ node->parent = parent;
207
+ node->next = element;
208
+ node->prev = element->prev;
209
+ element->prev = node;
210
+
211
+ if (node->prev)
212
+ node->prev->next = node;
213
+
214
+ if (parent->content == element)
215
+ parent->content = node;
216
+ }
217
+
218
+ /* insert "node" into markup tree after "element" */
219
+ void TY_(InsertNodeAfterElement)(Node *element, Node *node)
220
+ {
221
+ Node *parent;
222
+
223
+ parent = element->parent;
224
+ node->parent = parent;
225
+
226
+ /* AQ - 13 Jan 2000 fix for parent == NULL */
227
+ if (parent != NULL && parent->last == element)
228
+ parent->last = node;
229
+ else
230
+ {
231
+ node->next = element->next;
232
+ /* AQ - 13 Jan 2000 fix for node->next == NULL */
233
+ if (node->next != NULL)
234
+ node->next->prev = node;
235
+ }
236
+
237
+ element->next = node;
238
+ node->prev = element;
239
+ }
240
+
241
+ static Bool CanPrune( TidyDocImpl* doc, Node *element )
242
+ {
243
+ if ( TY_(nodeIsText)(element) )
244
+ return yes;
245
+
246
+ if ( element->content )
247
+ return no;
248
+
249
+ if ( element->tag == NULL )
250
+ return no;
251
+
252
+ if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
253
+ return no;
254
+
255
+ if ( nodeIsA(element) && element->attributes != NULL )
256
+ return no;
257
+
258
+ if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
259
+ return no;
260
+
261
+ if ( element->tag->model & CM_ROW )
262
+ return no;
263
+
264
+ if ( element->tag->model & CM_EMPTY )
265
+ return no;
266
+
267
+ if ( nodeIsAPPLET(element) )
268
+ return no;
269
+
270
+ if ( nodeIsOBJECT(element) )
271
+ return no;
272
+
273
+ if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
274
+ return no;
275
+
276
+ if ( nodeIsTITLE(element) )
277
+ return no;
278
+
279
+ /* #433359 - fix by Randy Waki 12 Mar 01 */
280
+ if ( nodeIsIFRAME(element) )
281
+ return no;
282
+
283
+ /* fix for bug 770297 */
284
+ if (nodeIsTEXTAREA(element))
285
+ return no;
286
+
287
+ if ( attrGetID(element) || attrGetNAME(element) )
288
+ return no;
289
+
290
+ /* fix for bug 695408; a better fix would look for unknown and */
291
+ /* known proprietary attributes that make the element significant */
292
+ if (attrGetDATAFLD(element))
293
+ return no;
294
+
295
+ /* fix for bug 723772, don't trim new-...-tags */
296
+ if (element->tag->id == TidyTag_UNKNOWN)
297
+ return no;
298
+
299
+ if (nodeIsBODY(element))
300
+ return no;
301
+
302
+ if (nodeIsCOLGROUP(element))
303
+ return no;
304
+
305
+ return yes;
306
+ }
307
+
308
+ /* return next element */
309
+ Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
310
+ {
311
+ if ( CanPrune(doc, element) )
312
+ {
313
+ if (element->type != TextNode)
314
+ TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
315
+
316
+ return TY_(DiscardElement)(doc, element);
317
+ }
318
+ return element->next;
319
+ }
320
+
321
+ Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
322
+ {
323
+ Node* next;
324
+
325
+ while (node)
326
+ {
327
+ next = node->next;
328
+
329
+ if (node->content)
330
+ TY_(DropEmptyElements)(doc, node->content);
331
+
332
+ if (!TY_(nodeIsElement)(node) &&
333
+ !(TY_(nodeIsText)(node) && !(node->start < node->end)))
334
+ {
335
+ node = next;
336
+ continue;
337
+ }
338
+
339
+ next = TY_(TrimEmptyElement)(doc, node);
340
+ node = next;
341
+ }
342
+
343
+ return node;
344
+ }
345
+
346
+ /*
347
+ errors in positioning of form start or end tags
348
+ generally require human intervention to fix
349
+ */
350
+ static void BadForm( TidyDocImpl* doc )
351
+ {
352
+ doc->badForm = yes;
353
+ /* doc->errors++; */
354
+ }
355
+
356
+ /*
357
+ This maps
358
+ <em>hello </em><strong>world</strong>
359
+ to
360
+ <em>hello</em> <strong>world</strong>
361
+
362
+ If last child of element is a text node
363
+ then trim trailing white space character
364
+ moving it to after element's end tag.
365
+ */
366
+ static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
367
+ {
368
+ Lexer* lexer = doc->lexer;
369
+ byte c;
370
+
371
+ if (TY_(nodeIsText)(last))
372
+ {
373
+ if (last->end > last->start)
374
+ {
375
+ c = (byte) lexer->lexbuf[ last->end - 1 ];
376
+
377
+ if ( c == ' '
378
+ #ifdef COMMENT_NBSP_FIX
379
+ || c == 160
380
+ #endif
381
+ )
382
+ {
383
+ #ifdef COMMENT_NBSP_FIX
384
+ /* take care with <td>&nbsp;</td> */
385
+ if ( c == 160 &&
386
+ ( element->tag == doc->tags.tag_td ||
387
+ element->tag == doc->tags.tag_th )
388
+ )
389
+ {
390
+ if (last->end > last->start + 1)
391
+ last->end -= 1;
392
+ }
393
+ else
394
+ #endif
395
+ {
396
+ last->end -= 1;
397
+ if ( (element->tag->model & CM_INLINE) &&
398
+ !(element->tag->model & CM_FIELD) )
399
+ lexer->insertspace = yes;
400
+ }
401
+ }
402
+ }
403
+ }
404
+ }
405
+
406
#if 0
/*
  Disabled code: builds a node whose lexer text is the literal markup
  of "element" ('<', optional '/', the element name or the DOCTYPE
  text, optional trailing '/', '>').
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *escaped = NewNode(lexer->allocator, lexer);

    escaped->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        char *cursor = element->element;
        while (*cursor != '\0')
        {
            AddByte(lexer, *cursor);
            ++cursor;
        }
    }
    else if (element->type == DocTypeTag)
    {
        uint pos = element->start;
        AddStringLiteral( lexer, "!DOCTYPE " );
        while (pos < element->end)
        {
            AddByte(lexer, lexer->lexbuf[pos]);
            ++pos;
        }
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    escaped->end = lexer->lexsize;

    return escaped;
}
#endif /* 0 */
440
+
441
+ /* Only true for text nodes. */
442
+ Bool TY_(IsBlank)(Lexer *lexer, Node *node)
443
+ {
444
+ Bool isBlank = TY_(nodeIsText)(node);
445
+ if ( isBlank )
446
+ isBlank = ( node->end == node->start || /* Zero length */
447
+ ( node->end == node->start+1 /* or one blank. */
448
+ && lexer->lexbuf[node->start] == ' ' ) );
449
+ return isBlank;
450
+ }
451
+
452
+ /*
453
+ This maps
454
+ <p>hello<em> world</em>
455
+ to
456
+ <p>hello <em>world</em>
457
+
458
+ Trims initial space, by moving it before the
459
+ start tag, or if this element is the first in
460
+ parent's content, then by discarding the space
461
+ */
462
+ static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
463
+ {
464
+ Lexer* lexer = doc->lexer;
465
+ Node *prev, *node;
466
+
467
+ if ( TY_(nodeIsText)(text) &&
468
+ lexer->lexbuf[text->start] == ' ' &&
469
+ text->start < text->end )
470
+ {
471
+ if ( (element->tag->model & CM_INLINE) &&
472
+ !(element->tag->model & CM_FIELD) )
473
+ {
474
+ prev = element->prev;
475
+
476
+ if (TY_(nodeIsText)(prev))
477
+ {
478
+ if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
479
+ lexer->lexbuf[(prev->end)++] = ' ';
480
+
481
+ ++(element->start);
482
+ }
483
+ else /* create new node */
484
+ {
485
+ node = TY_(NewNode)(lexer->allocator, lexer);
486
+ node->start = (element->start)++;
487
+ node->end = element->start;
488
+ lexer->lexbuf[node->start] = ' ';
489
+ TY_(InsertNodeBeforeElement)(element ,node);
490
+ }
491
+ }
492
+
493
+ /* discard the space in current node */
494
+ ++(text->start);
495
+ }
496
+ }
497
+
498
+ static Bool IsPreDescendant(Node* node)
499
+ {
500
+ Node *parent = node->parent;
501
+
502
+ while (parent)
503
+ {
504
+ if (parent->tag && parent->tag->parser == TY_(ParsePre))
505
+ return yes;
506
+
507
+ parent = parent->parent;
508
+ }
509
+
510
+ return no;
511
+ }
512
+
513
+ static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
514
+ {
515
+ Node* next;
516
+
517
+ if (!TY_(nodeIsText)(node))
518
+ return no;
519
+
520
+ if (node->parent->type == DocTypeTag)
521
+ return no;
522
+
523
+ if (IsPreDescendant(node))
524
+ return no;
525
+
526
+ if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
527
+ return no;
528
+
529
+ next = node->next;
530
+
531
+ /* <p>... </p> */
532
+ if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
533
+ return yes;
534
+
535
+ /* <div><small>... </small><h3>...</h3></div> */
536
+ if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
537
+ return yes;
538
+
539
+ if (!next)
540
+ return no;
541
+
542
+ if (nodeIsBR(next))
543
+ return yes;
544
+
545
+ if (TY_(nodeHasCM)(next, CM_INLINE))
546
+ return no;
547
+
548
+ /* <a href='/'>...</a> <p>...</p> */
549
+ if (next->type == StartTag)
550
+ return yes;
551
+
552
+ /* <strong>...</strong> <hr /> */
553
+ if (next->type == StartEndTag)
554
+ return yes;
555
+
556
+ /* evil adjacent text nodes, Tidy should not generate these :-( */
557
+ if (TY_(nodeIsText)(next) && next->start < next->end
558
+ && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
559
+ return yes;
560
+
561
+ return no;
562
+ }
563
+
564
+ static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
565
+ {
566
+ if (!TY_(nodeIsText)(node))
567
+ return no;
568
+
569
+ if (node->parent->type == DocTypeTag)
570
+ return no;
571
+
572
+ if (IsPreDescendant(node))
573
+ return no;
574
+
575
+ if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
576
+ return no;
577
+
578
+ /* <p>...<br> <em>...</em>...</p> */
579
+ if (nodeIsBR(node->prev))
580
+ return yes;
581
+
582
+ /* <p> ...</p> */
583
+ if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
584
+ return yes;
585
+
586
+ /* <h4>...</h4> <em>...</em> */
587
+ if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
588
+ TY_(nodeIsElement)(node->prev))
589
+ return yes;
590
+
591
+ /* <p><span> ...</span></p> */
592
+ if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
593
+ return yes;
594
+
595
+ return no;
596
+ }
597
+
598
+ static void CleanSpaces(TidyDocImpl* doc, Node* node)
599
+ {
600
+ Node* next;
601
+
602
+ while (node)
603
+ {
604
+ next = node->next;
605
+
606
+ if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node))
607
+ while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
608
+ ++(node->start);
609
+
610
+ if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node))
611
+ while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
612
+ --(node->end);
613
+
614
+ if (TY_(nodeIsText)(node) && !(node->start < node->end))
615
+ {
616
+ TY_(RemoveNode)(node);
617
+ TY_(FreeNode)(doc, node);
618
+ node = next;
619
+
620
+ continue;
621
+ }
622
+
623
+ if (node->content)
624
+ CleanSpaces(doc, node->content);
625
+
626
+ node = next;
627
+ }
628
+ }
629
+
630
+ /*
631
+ Move initial and trailing space out.
632
+ This routine maps:
633
+
634
+ hello<em> world</em>
635
+ to
636
+ hello <em>world</em>
637
+ and
638
+ <em>hello </em><strong>world</strong>
639
+ to
640
+ <em>hello</em> <strong>world</strong>
641
+ */
642
+ static void TrimSpaces( TidyDocImpl* doc, Node *element)
643
+ {
644
+ Node* text = element->content;
645
+
646
+ if (nodeIsPRE(element) || IsPreDescendant(element))
647
+ return;
648
+
649
+ if (TY_(nodeIsText)(text))
650
+ TrimInitialSpace(doc, element, text);
651
+
652
+ text = element->last;
653
+
654
+ if (TY_(nodeIsText)(text))
655
+ TrimTrailingSpace(doc, element, text);
656
+ }
657
+
658
+ static Bool DescendantOf( Node *element, TidyTagId tid )
659
+ {
660
+ Node *parent;
661
+ for ( parent = element->parent;
662
+ parent != NULL;
663
+ parent = parent->parent )
664
+ {
665
+ if ( TagIsId(parent, tid) )
666
+ return yes;
667
+ }
668
+ return no;
669
+ }
670
+
671
+ static Bool InsertMisc(Node *element, Node *node)
672
+ {
673
+ if (node->type == CommentTag ||
674
+ node->type == ProcInsTag ||
675
+ node->type == CDATATag ||
676
+ node->type == SectionTag ||
677
+ node->type == AspTag ||
678
+ node->type == JsteTag ||
679
+ node->type == PhpTag )
680
+ {
681
+ TY_(InsertNodeAtEnd)(element, node);
682
+ return yes;
683
+ }
684
+
685
+ if ( node->type == XmlDecl )
686
+ {
687
+ Node* root = element;
688
+ while ( root && root->parent )
689
+ root = root->parent;
690
+ if ( root && !(root->content && root->content->type == XmlDecl))
691
+ {
692
+ TY_(InsertNodeAtStart)( root, node );
693
+ return yes;
694
+ }
695
+ }
696
+
697
+ /* Declared empty tags seem to be slipping through
698
+ ** the cracks. This is an experiment to figure out
699
+ ** a decent place to pick them up.
700
+ */
701
+ if ( node->tag &&
702
+ TY_(nodeIsElement)(node) &&
703
+ TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
704
+ (node->tag->versions & VERS_PROPRIETARY) != 0 )
705
+ {
706
+ TY_(InsertNodeAtEnd)(element, node);
707
+ return yes;
708
+ }
709
+
710
+ return no;
711
+ }
712
+
713
+
714
+ static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
715
+ {
716
+ Lexer* lexer = doc->lexer;
717
+ /*
718
+ Fix by GLP 2000-12-21. Need to reset insertspace if this
719
+ is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
720
+ */
721
+ if (node->tag->model & CM_EMPTY)
722
+ {
723
+ lexer->waswhite = no;
724
+ if (node->tag->parser == NULL)
725
+ return;
726
+ }
727
+ else if (!(node->tag->model & CM_INLINE))
728
+ lexer->insertspace = no;
729
+
730
+ if (node->tag->parser == NULL)
731
+ return;
732
+
733
+ if (node->type == StartEndTag)
734
+ return;
735
+
736
+ (*node->tag->parser)( doc, node, mode );
737
+ }
738
+
739
+ /*
740
+ the doctype has been found after other tags,
741
+ and needs moving to before the html element
742
+ */
743
+ static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
744
+ {
745
+ Node* existing = TY_(FindDocType)( doc );
746
+ if ( existing )
747
+ {
748
+ TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED );
749
+ TY_(FreeNode)( doc, doctype );
750
+ }
751
+ else
752
+ {
753
+ TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
754
+ while ( !nodeIsHTML(element) )
755
+ element = element->parent;
756
+ TY_(InsertNodeBeforeElement)( element, doctype );
757
+ }
758
+ }
759
+
760
+ /*
761
+ move node to the head, where element is used as starting
762
+ point in hunt for head. normally called during parsing
763
+ */
764
+ static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
765
+ {
766
+ Node *head;
767
+
768
+ TY_(RemoveNode)( node ); /* make sure that node is isolated */
769
+
770
+ if ( TY_(nodeIsElement)(node) )
771
+ {
772
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN );
773
+
774
+ head = TY_(FindHEAD)(doc);
775
+ assert(head != NULL);
776
+
777
+ TY_(InsertNodeAtEnd)(head, node);
778
+
779
+ if ( node->tag->parser )
780
+ ParseTag( doc, node, IgnoreWhitespace );
781
+ }
782
+ else
783
+ {
784
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
785
+ TY_(FreeNode)( doc, node );
786
+ }
787
+ }
788
+
789
+ /* moves given node to end of body element */
790
+ static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
791
+ {
792
+ Node* body = TY_(FindBody)( doc );
793
+ if ( body )
794
+ {
795
+ TY_(RemoveNode)( node );
796
+ TY_(InsertNodeAtEnd)( body, node );
797
+ }
798
+ }
799
+
800
+ static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
801
+ {
802
+ ctmbstr sprop =
803
+ "padding-left: 2ex; margin-left: 0ex"
804
+ "; margin-top: 0ex; margin-bottom: 0ex";
805
+ if ( !cfgBool(doc, TidyDecorateInferredUL) )
806
+ return;
807
+ if ( cfgBool(doc, TidyMakeClean) )
808
+ TY_(AddStyleAsClass)( doc, node, sprop );
809
+ else
810
+ TY_(AddStyleProperty)( doc, node, sprop );
811
+ }
812
+
813
+ /*
814
+ element is node created by the lexer
815
+ upon seeing the start tag, or by the
816
+ parser when the start tag is inferred
817
+ */
818
+ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
819
+ {
820
+ Lexer* lexer = doc->lexer;
821
+ Node *node;
822
+ Bool checkstack = yes;
823
+ uint istackbase = 0;
824
+
825
+ if ( element->tag->model & CM_EMPTY )
826
+ return;
827
+
828
+ if ( nodeIsFORM(element) &&
829
+ DescendantOf(element, TidyTag_FORM) )
830
+ TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING );
831
+
832
+ /*
833
+ InlineDup() asks the lexer to insert inline emphasis tags
834
+ currently pushed on the istack, but take care to avoid
835
+ propagating inline emphasis inside OBJECT or APPLET.
836
+ For these elements a fresh inline stack context is created
837
+ and disposed of upon reaching the end of the element.
838
+ They thus behave like table cells in this respect.
839
+ */
840
+ if (element->tag->model & CM_OBJECT)
841
+ {
842
+ istackbase = lexer->istackbase;
843
+ lexer->istackbase = lexer->istacksize;
844
+ }
845
+
846
+ if (!(element->tag->model & CM_MIXED))
847
+ TY_(InlineDup)( doc, NULL );
848
+
849
+ mode = IgnoreWhitespace;
850
+
851
+ while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
852
+ {
853
+ /* end tag for this element */
854
+ if (node->type == EndTag && node->tag &&
855
+ (node->tag == element->tag || element->was == node->tag))
856
+ {
857
+ TY_(FreeNode)( doc, node );
858
+
859
+ if (element->tag->model & CM_OBJECT)
860
+ {
861
+ /* pop inline stack */
862
+ while (lexer->istacksize > lexer->istackbase)
863
+ TY_(PopInline)( doc, NULL );
864
+ lexer->istackbase = istackbase;
865
+ }
866
+
867
+ element->closed = yes;
868
+ TrimSpaces( doc, element );
869
+ return;
870
+ }
871
+
872
+ if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
873
+ {
874
+ /* If we're in the HEAD, close it before proceeding.
875
+ This is an extremely rare occurrence, but has been observed.
876
+ */
877
+ TY_(UngetToken)( doc );
878
+ break;
879
+ }
880
+
881
+ if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
882
+ {
883
+ if ( TY_(nodeIsElement)(node) )
884
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
885
+ TY_(FreeNode)( doc, node );
886
+ continue;
887
+ }
888
+
889
+
890
+ if (node->type == EndTag)
891
+ {
892
+ if (node->tag == NULL)
893
+ {
894
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
895
+ TY_(FreeNode)( doc, node );
896
+ continue;
897
+ }
898
+ else if ( nodeIsBR(node) )
899
+ node->type = StartTag;
900
+ else if ( nodeIsP(node) )
901
+ {
902
+ /* Cannot have a block inside a paragraph, so no checking
903
+ for an ancestor is necessary -- but we _can_ have
904
+ paragraphs inside a block, so change it to an implicit
905
+ empty paragraph, to be dealt with according to the user's
906
+ options
907
+ */
908
+ node->type = StartEndTag;
909
+ node->implicit = yes;
910
+ #if OBSOLETE
911
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
912
+ TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
913
+ TY_(InsertNodeAtEnd)( element, node );
914
+ node = InferredTag(doc, TidyTag_BR);
915
+ #endif
916
+ }
917
+ else if (DescendantOf( element, node->tag->id ))
918
+ {
919
+ /*
920
+ if this is the end tag for an ancestor element
921
+ then infer end tag for this element
922
+ */
923
+ TY_(UngetToken)( doc );
924
+ break;
925
+ #if OBSOLETE
926
+ Node *parent;
927
+ for ( parent = element->parent;
928
+ parent != NULL;
929
+ parent = parent->parent )
930
+ {
931
+ if (node->tag == parent->tag)
932
+ {
933
+ if (!(element->tag->model & CM_OPT))
934
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
935
+
936
+ TY_(UngetToken)( doc );
937
+
938
+ if (element->tag->model & CM_OBJECT)
939
+ {
940
+ /* pop inline stack */
941
+ while (lexer->istacksize > lexer->istackbase)
942
+ TY_(PopInline)( doc, NULL );
943
+ lexer->istackbase = istackbase;
944
+ }
945
+
946
+ TrimSpaces( doc, element );
947
+ return;
948
+ }
949
+ }
950
+ #endif
951
+ }
952
+ else
953
+ {
954
+ /* special case </tr> etc. for stuff moved in front of table */
955
+ if ( lexer->exiled
956
+ && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
957
+ {
958
+ TY_(UngetToken)( doc );
959
+ TrimSpaces( doc, element );
960
+ return;
961
+ }
962
+ }
963
+ }
964
+
965
+ /* mixed content model permits text */
966
+ if (TY_(nodeIsText)(node))
967
+ {
968
+ if ( checkstack )
969
+ {
970
+ checkstack = no;
971
+ if (!(element->tag->model & CM_MIXED))
972
+ {
973
+ if ( TY_(InlineDup)(doc, node) > 0 )
974
+ continue;
975
+ }
976
+ }
977
+
978
+ TY_(InsertNodeAtEnd)(element, node);
979
+ mode = MixedContent;
980
+
981
+ /*
982
+ HTML4 strict doesn't allow mixed content for
983
+ elements with %block; as their content model
984
+ */
985
+ /*
986
+ But only body, map, blockquote, form and
987
+ noscript have content model %block;
988
+ */
989
+ if ( nodeIsBODY(element) ||
990
+ nodeIsMAP(element) ||
991
+ nodeIsBLOCKQUOTE(element) ||
992
+ nodeIsFORM(element) ||
993
+ nodeIsNOSCRIPT(element) )
994
+ TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
995
+ continue;
996
+ }
997
+
998
+ if ( InsertMisc(element, node) )
999
+ continue;
1000
+
1001
+ /* allow PARAM elements? */
1002
+ if ( nodeIsPARAM(node) )
1003
+ {
1004
+ if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
1005
+ {
1006
+ TY_(InsertNodeAtEnd)(element, node);
1007
+ continue;
1008
+ }
1009
+
1010
+ /* otherwise discard it */
1011
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1012
+ TY_(FreeNode)( doc, node );
1013
+ continue;
1014
+ }
1015
+
1016
+ /* allow AREA elements? */
1017
+ if ( nodeIsAREA(node) )
1018
+ {
1019
+ if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
1020
+ {
1021
+ TY_(InsertNodeAtEnd)(element, node);
1022
+ continue;
1023
+ }
1024
+
1025
+ /* otherwise discard it */
1026
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1027
+ TY_(FreeNode)( doc, node );
1028
+ continue;
1029
+ }
1030
+
1031
+ /* ignore unknown start/end tags */
1032
+ if ( node->tag == NULL )
1033
+ {
1034
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1035
+ TY_(FreeNode)( doc, node );
1036
+ continue;
1037
+ }
1038
+
1039
+ /*
1040
+ Allow CM_INLINE elements here.
1041
+
1042
+ Allow CM_BLOCK elements here unless
1043
+ lexer->excludeBlocks is yes.
1044
+
1045
+ LI and DD are special cased.
1046
+
1047
+ Otherwise infer end tag for this element.
1048
+ */
1049
+
1050
+ if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1051
+ {
1052
+ if ( !TY_(nodeIsElement)(node) )
1053
+ {
1054
+ if ( nodeIsFORM(node) )
1055
+ BadForm( doc );
1056
+
1057
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1058
+ TY_(FreeNode)( doc, node );
1059
+ continue;
1060
+ }
1061
+
1062
+ /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1063
+ /*
1064
+ If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1065
+ start tag, discard the start tag and let the subsequent content get
1066
+ parsed as content of the enclosing LI. This seems to mimic IE and
1067
+ Netscape, and avoids an infinite loop: without this check,
1068
+ ParseBlock (which is parsing the LI's content) and ParseList (which
1069
+ is parsing the LI's parent's content) repeatedly defer to each
1070
+ other to parse the illegal start tag, each time inferring a missing
1071
+ </li> or <li> respectively.
1072
+
1073
+ NOTE: This check is a bit fragile. It specifically checks for the
1074
+ four tags that happen to weave their way through the current series
1075
+ of tests performed by ParseBlock and ParseList to trigger the
1076
+ infinite loop.
1077
+ */
1078
+ if ( nodeIsLI(element) )
1079
+ {
1080
+ if ( nodeIsFRAME(node) ||
1081
+ nodeIsFRAMESET(node) ||
1082
+ nodeIsOPTGROUP(node) ||
1083
+ nodeIsOPTION(node) )
1084
+ {
1085
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1086
+ TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */
1087
+ continue;
1088
+ }
1089
+ }
1090
+
1091
+ if ( nodeIsTD(element) || nodeIsTH(element) )
1092
+ {
1093
+ /* if parent is a table cell, avoid inferring the end of the cell */
1094
+
1095
+ if ( TY_(nodeHasCM)(node, CM_HEAD) )
1096
+ {
1097
+ MoveToHead( doc, element, node );
1098
+ continue;
1099
+ }
1100
+
1101
+ if ( TY_(nodeHasCM)(node, CM_LIST) )
1102
+ {
1103
+ TY_(UngetToken)( doc );
1104
+ node = TY_(InferredTag)(doc, TidyTag_UL);
1105
+ AddClassNoIndent(doc, node);
1106
+ lexer->excludeBlocks = yes;
1107
+ }
1108
+ else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1109
+ {
1110
+ TY_(UngetToken)( doc );
1111
+ node = TY_(InferredTag)(doc, TidyTag_DL);
1112
+ lexer->excludeBlocks = yes;
1113
+ }
1114
+
1115
+ /* infer end of current table cell */
1116
+ if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1117
+ {
1118
+ TY_(UngetToken)( doc );
1119
+ TrimSpaces( doc, element );
1120
+ return;
1121
+ }
1122
+ }
1123
+ else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1124
+ {
1125
+ if ( lexer->excludeBlocks )
1126
+ {
1127
+ if ( !TY_(nodeHasCM)(element, CM_OPT) )
1128
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1129
+
1130
+ TY_(UngetToken)( doc );
1131
+
1132
+ if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1133
+ lexer->istackbase = istackbase;
1134
+
1135
+ TrimSpaces( doc, element );
1136
+ return;
1137
+ }
1138
+ }
1139
+ else /* things like list items */
1140
+ {
1141
+ if (node->tag->model & CM_HEAD)
1142
+ {
1143
+ MoveToHead( doc, element, node );
1144
+ continue;
1145
+ }
1146
+
1147
+ /*
1148
+ special case where a form start tag
1149
+ occurs in a tr and is followed by td or th
1150
+ */
1151
+
1152
+ if ( nodeIsFORM(element) &&
1153
+ nodeIsTD(element->parent) &&
1154
+ element->parent->implicit )
1155
+ {
1156
+ if ( nodeIsTD(node) )
1157
+ {
1158
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1159
+ TY_(FreeNode)( doc, node );
1160
+ continue;
1161
+ }
1162
+
1163
+ if ( nodeIsTH(node) )
1164
+ {
1165
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1166
+ TY_(FreeNode)( doc, node );
1167
+ node = element->parent;
1168
+ TidyDocFree(doc, node->element);
1169
+ node->element = TY_(tmbstrdup)(doc->allocator, "th");
1170
+ node->tag = TY_(LookupTagDef)( TidyTag_TH );
1171
+ continue;
1172
+ }
1173
+ }
1174
+
1175
+ if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1176
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1177
+
1178
+ TY_(UngetToken)( doc );
1179
+
1180
+ if ( TY_(nodeHasCM)(node, CM_LIST) )
1181
+ {
1182
+ if ( element->parent && element->parent->tag &&
1183
+ element->parent->tag->parser == TY_(ParseList) )
1184
+ {
1185
+ TrimSpaces( doc, element );
1186
+ return;
1187
+ }
1188
+
1189
+ node = TY_(InferredTag)(doc, TidyTag_UL);
1190
+ AddClassNoIndent(doc, node);
1191
+ }
1192
+ else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1193
+ {
1194
+ if ( nodeIsDL(element->parent) )
1195
+ {
1196
+ TrimSpaces( doc, element );
1197
+ return;
1198
+ }
1199
+
1200
+ node = TY_(InferredTag)(doc, TidyTag_DL);
1201
+ }
1202
+ else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1203
+ {
1204
+ /* http://tidy.sf.net/issue/1316307 */
1205
+ /* In exiled mode, return so table processing can
1206
+ continue. */
1207
+ if (lexer->exiled)
1208
+ return;
1209
+ node = TY_(InferredTag)(doc, TidyTag_TABLE);
1210
+ }
1211
+ else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1212
+ {
1213
+ /* pop inline stack */
1214
+ while ( lexer->istacksize > lexer->istackbase )
1215
+ TY_(PopInline)( doc, NULL );
1216
+ lexer->istackbase = istackbase;
1217
+ TrimSpaces( doc, element );
1218
+ return;
1219
+
1220
+ }
1221
+ else
1222
+ {
1223
+ TrimSpaces( doc, element );
1224
+ return;
1225
+ }
1226
+ }
1227
+ }
1228
+
1229
+ /* parse known element */
1230
+ if (TY_(nodeIsElement)(node))
1231
+ {
1232
+ if (node->tag->model & CM_INLINE)
1233
+ {
1234
+ if (checkstack && !node->implicit)
1235
+ {
1236
+ checkstack = no;
1237
+
1238
+ if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1239
+ {
1240
+ if ( TY_(InlineDup)(doc, node) > 0 )
1241
+ continue;
1242
+ }
1243
+ }
1244
+
1245
+ mode = MixedContent;
1246
+ }
1247
+ else
1248
+ {
1249
+ checkstack = yes;
1250
+ mode = IgnoreWhitespace;
1251
+ }
1252
+
1253
+ /* trim white space before <br> */
1254
+ if ( nodeIsBR(node) )
1255
+ TrimSpaces( doc, element );
1256
+
1257
+ TY_(InsertNodeAtEnd)(element, node);
1258
+
1259
+ if (node->implicit)
1260
+ TY_(ReportError)(doc, element, node, INSERTING_TAG );
1261
+
1262
+ ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1263
+ continue;
1264
+ }
1265
+
1266
+ /* discard unexpected tags */
1267
+ if (node->type == EndTag)
1268
+ TY_(PopInline)( doc, node ); /* if inline end tag */
1269
+
1270
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1271
+ TY_(FreeNode)( doc, node );
1272
+ continue;
1273
+ }
1274
+
1275
+ if (!(element->tag->model & CM_OPT))
1276
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1277
+
1278
+ if (element->tag->model & CM_OBJECT)
1279
+ {
1280
+ /* pop inline stack */
1281
+ while ( lexer->istacksize > lexer->istackbase )
1282
+ TY_(PopInline)( doc, NULL );
1283
+ lexer->istackbase = istackbase;
1284
+ }
1285
+
1286
+ TrimSpaces( doc, element );
1287
+ }
1288
+
1289
/*
  ParseInline: parse the content of "element" under the inline content
  model.  Tokens are read until the element's own end tag is seen, until
  some other token implies its end (that token is pushed back with
  UngetToken before returning), or until the input is exhausted.

  doc     - document being parsed; doc->lexer supplies the lexer state
  element - element whose content is parsed; the function returns at once
            when its content model is CM_EMPTY
  mode    - Preformatted is kept as is; any other value is replaced by
            MixedContent before the first token is read
*/
+ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
1290
+ {
1291
+ Lexer* lexer = doc->lexer;
1292
+ Node *node, *parent;
1293
+
1294
+ if (element->tag->model & CM_EMPTY)
1295
+ return;
1296
+
1297
+ /*
1298
+ ParseInline is used for some block level elements like H1 to H6
1299
+ For such elements we need to insert inline emphasis tags currently
1300
+ on the inline stack. For Inline elements, we normally push them
1301
+ onto the inline stack provided they aren't implicit or OBJECT/APPLET.
1302
+ This test is carried out in PushInline and PopInline, see istack.c
1303
+
1304
+ InlineDup(...) is not called for elements with a CM_MIXED (inline and
1305
+ block) content model, e.g. <del> or <ins>, otherwise constructs like
1306
+
1307
+ <p>111<a name='foo'>222<del>333</del>444</a>555</p>
1308
+ <p>111<span>222<del>333</del>444</span>555</p>
1309
+ <p>111<em>222<del>333</del>444</em>555</p>
1310
+
1311
+ will get corrupted.
1312
+ */
1313
+ if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
1314
+ !TY_(nodeHasCM)(element, CM_MIXED))
1315
+ TY_(InlineDup)(doc, NULL);
1316
+ else if (TY_(nodeHasCM)(element, CM_INLINE))
1317
+ TY_(PushInline)(doc, element);
1318
+
1319
+ if ( nodeIsNOBR(element) )
1320
+ doc->badLayout |= USING_NOBR;
1321
+ else if ( nodeIsFONT(element) )
1322
+ doc->badLayout |= USING_FONT;
1323
+
1324
+ /* Inline elements may or may not be within a preformatted element */
1325
+ if (mode != Preformatted)
1326
+ mode = MixedContent;
1327
+
1328
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
1329
+ {
1330
+ /* end tag for current element */
1331
+ if (node->tag == element->tag && node->type == EndTag)
1332
+ {
1333
+ if (element->tag->model & CM_INLINE)
1334
+ TY_(PopInline)( doc, node );
1335
+
1336
+ TY_(FreeNode)( doc, node );
1337
+
1338
+ if (!(mode & Preformatted))
1339
+ TrimSpaces(doc, element);
1340
+
1341
+ /*
1342
+ if a font element wraps an anchor and nothing else
1343
+ then move the font element inside the anchor since
1344
+ otherwise it won't alter the anchor text color
1345
+ */
1346
+ if ( nodeIsFONT(element) &&
1347
+ element->content && element->content == element->last )
1348
+ {
1349
+ Node *child = element->content;
1350
+
1351
+ if ( nodeIsA(child) )
1352
+ {
1353
+ child->parent = element->parent;
1354
+ child->next = element->next;
1355
+ child->prev = element->prev;
1356
+
1357
+ element->next = NULL;
1358
+ element->prev = NULL;
1359
+ element->parent = child;
1360
+
1361
+ element->content = child->content;
1362
+ element->last = child->last;
1363
+ child->content = element;
1364
+
1365
+ TY_(FixNodeLinks)(child);
1366
+ TY_(FixNodeLinks)(element);
1367
+ }
1368
+ }
1369
+
1370
+ element->closed = yes;
1371
+ TrimSpaces( doc, element );
1372
+ return;
1373
+ }
1374
+
1375
+ /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
1376
+ /* (see additional conditions below) */
1377
+ /* otherwise emphasis nesting is probably unintentional */
1378
+ /* big, small, sub, sup have cumulative effect so leave them alone */
1379
+ if ( node->type == StartTag
1380
+ && node->tag == element->tag
1381
+ && TY_(IsPushed)( doc, node )
1382
+ && !node->implicit
1383
+ && !element->implicit
1384
+ && node->tag && (node->tag->model & CM_INLINE)
1385
+ && !nodeIsA(node)
1386
+ && !nodeIsFONT(node)
1387
+ && !nodeIsBIG(node)
1388
+ && !nodeIsSMALL(node)
1389
+ && !nodeIsSUB(node)
1390
+ && !nodeIsSUP(node)
1391
+ && !nodeIsQ(node)
1392
+ && !nodeIsSPAN(node)
1393
+ )
1394
+ {
1395
+ /* proceeds only if "node" does not have any attribute and
1396
+ follows a text node not finishing with a space */
1397
+ if (element->content != NULL && node->attributes == NULL
1398
+ && TY_(nodeIsText)(element->last)
1399
+ && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
1400
+ {
1401
+ TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
1402
+ node->type = EndTag;
1403
+ TY_(UngetToken)(doc);
1404
+ continue;
1405
+ }
1406
+
1407
+ if (node->attributes == NULL || element->attributes == NULL)
1408
+ TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
1409
+ }
1410
+ else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
1411
+ nodeIsQ(node) )
1412
+ {
1413
+ TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
1414
+ }
1415
+
1416
+ if ( TY_(nodeIsText)(node) )
1417
+ {
1418
+ /* only called for 1st child */
1419
+ if ( element->content == NULL && !(mode & Preformatted) )
1420
+ TrimSpaces( doc, element );
1421
+
1422
+ if ( node->start >= node->end )
1423
+ {
1424
+ TY_(FreeNode)( doc, node );
1425
+ continue;
1426
+ }
1427
+
1428
+ TY_(InsertNodeAtEnd)(element, node);
1429
+ continue;
1430
+ }
1431
+
1432
+ /* mixed content model so allow text */
1433
+ if (InsertMisc(element, node))
1434
+ continue;
1435
+
1436
+ /* deal with HTML tags */
1437
+ if ( nodeIsHTML(node) )
1438
+ {
1439
+ if ( TY_(nodeIsElement)(node) )
1440
+ {
1441
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1442
+ TY_(FreeNode)( doc, node );
1443
+ continue;
1444
+ }
1445
+
1446
+ /* otherwise infer end of inline element */
1447
+ TY_(UngetToken)( doc );
1448
+
1449
+ if (!(mode & Preformatted))
1450
+ TrimSpaces(doc, element);
1451
+
1452
+ return;
1453
+ }
1454
+
1455
+ /* within <dt> or <pre> map <p> to <br> */
1456
+ if ( nodeIsP(node) &&
1457
+ node->type == StartTag &&
1458
+ ( (mode & Preformatted) ||
1459
+ nodeIsDT(element) ||
1460
+ DescendantOf(element, TidyTag_DT )
1461
+ )
1462
+ )
1463
+ {
1464
+ node->tag = TY_(LookupTagDef)( TidyTag_BR );
1465
+ TidyDocFree(doc, node->element);
1466
+ node->element = TY_(tmbstrdup)(doc->allocator, "br");
1467
+ TrimSpaces(doc, element);
1468
+ TY_(InsertNodeAtEnd)(element, node);
1469
+ continue;
1470
+ }
1471
+
1472
+ /* <p> allowed within <address> in HTML 4.01 Transitional */
1473
+ if ( nodeIsP(node) &&
1474
+ node->type == StartTag &&
1475
+ nodeIsADDRESS(element) )
1476
+ {
1477
+ TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
1478
+ TY_(InsertNodeAtEnd)(element, node);
1479
+ (*node->tag->parser)( doc, node, mode );
1480
+ continue;
1481
+ }
1482
+
1483
+ /* ignore unknown and PARAM tags */
1484
+ if ( node->tag == NULL || nodeIsPARAM(node) )
1485
+ {
1486
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1487
+ TY_(FreeNode)( doc, node );
1488
+ continue;
1489
+ }
1490
+
1491
+ if ( nodeIsBR(node) && node->type == EndTag )
1492
+ node->type = StartTag;
1493
+
1494
+ if ( node->type == EndTag )
1495
+ {
1496
+ /* coerce </br> to <br> (never taken: </br> was already coerced just above) */
1497
+ if ( nodeIsBR(node) )
1498
+ node->type = StartTag;
1499
+ else if ( nodeIsP(node) )
1500
+ {
1501
+ /* coerce unmatched </p> to <br><br> */
1502
+ if ( !DescendantOf(element, TidyTag_P) )
1503
+ {
1504
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
1505
+ TrimSpaces( doc, element );
1506
+ TY_(InsertNodeAtEnd)( element, node );
1507
+ node = TY_(InferredTag)(doc, TidyTag_BR);
1508
+ TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
1509
+ continue;
1510
+ }
1511
+ }
1512
+ else if ( TY_(nodeHasCM)(node, CM_INLINE)
1513
+ && !nodeIsA(node)
1514
+ && !TY_(nodeHasCM)(node, CM_OBJECT)
1515
+ && TY_(nodeHasCM)(element, CM_INLINE) )
1516
+ {
1517
+ /* allow any inline end tag to end current element */
1518
+
1519
+ /* http://tidy.sf.net/issue/1426419 */
1520
+ /* but, like the browser, retain an earlier inline element.
1521
+ This is implemented by setting the lexer into a mode
1522
+ where it gets tokens from the inline stack rather than
1523
+ from the input stream. Check if the scenario fits. */
1524
+ if ( !nodeIsA(element)
1525
+ && (node->tag != element->tag)
1526
+ && TY_(IsPushed)( doc, node )
1527
+ && TY_(IsPushed)( doc, element ) )
1528
+ {
1529
+ /* we have something like
1530
+ <b>bold <i>bold and italic</b> italics</i> */
1531
+ if ( TY_(SwitchInline)( doc, element, node ) )
1532
+ {
1533
+ TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
1534
+ TY_(UngetToken)( doc ); /* put this back */
1535
+ TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
1536
+ if (!(mode & Preformatted))
1537
+ TrimSpaces( doc, element );
1538
+ return; /* close <i>, but will re-open it, after </b> */
1539
+ }
1540
+ }
1541
+ TY_(PopInline)( doc, element );
1542
+
1543
+ if ( !nodeIsA(element) )
1544
+ {
1545
+ if ( nodeIsA(node) && node->tag != element->tag )
1546
+ {
1547
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1548
+ TY_(UngetToken)( doc );
1549
+ }
1550
+ else
1551
+ {
1552
+ TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
1553
+ TY_(FreeNode)( doc, node);
1554
+ }
1555
+
1556
+ if (!(mode & Preformatted))
1557
+ TrimSpaces(doc, element);
1558
+
1559
+ return;
1560
+ }
1561
+
1562
+ /* if parent is <a> then discard unexpected inline end tag */
1563
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1564
+ TY_(FreeNode)( doc, node);
1565
+ continue;
1566
+ } /* special case </tr> etc. for stuff moved in front of table */
1567
+ else if ( lexer->exiled
1568
+ && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
1569
+ {
1570
+ TY_(UngetToken)( doc );
1571
+ TrimSpaces(doc, element);
1572
+ return;
1573
+ }
1574
+ }
1575
+
1576
+ /* allow any header tag to end current header */
1577
+ if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
1578
+ {
1579
+
1580
+ if ( node->tag == element->tag )
1581
+ {
1582
+ TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
1583
+ TY_(FreeNode)( doc, node);
1584
+ }
1585
+ else
1586
+ {
1587
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1588
+ TY_(UngetToken)( doc );
1589
+ }
1590
+
1591
+ if (!(mode & Preformatted))
1592
+ TrimSpaces(doc, element);
1593
+
1594
+ return;
1595
+ }
1596
+
1597
+ /*
1598
+ an <A> tag ends any open <A> element
1599
+ but <A href=...> is mapped to </A><A href=...>
1600
+ */
1601
+ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1602
+ /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
1603
+ if ( nodeIsA(node) && !node->implicit &&
1604
+ (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
1605
+ {
1606
+ /* coerce <a> to </a> unless it has some attributes */
1607
+ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
1608
+ /* other fixes by Dave Raggett */
1609
+ /* if (node->attributes == NULL) */
1610
+ if (node->type != EndTag && node->attributes == NULL)
1611
+ {
1612
+ node->type = EndTag;
1613
+ TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
1614
+ /* TY_(PopInline)( doc, node ); */
1615
+ TY_(UngetToken)( doc );
1616
+ continue;
1617
+ }
1618
+
1619
+ TY_(UngetToken)( doc );
1620
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1621
+ /* TY_(PopInline)( doc, element ); */
1622
+
1623
+ if (!(mode & Preformatted))
1624
+ TrimSpaces(doc, element);
1625
+
1626
+ return;
1627
+ }
1628
+
1629
+ if (element->tag->model & CM_HEADING)
1630
+ {
1631
+ if ( nodeIsCENTER(node) || nodeIsDIV(node) )
1632
+ {
1633
+ if (!TY_(nodeIsElement)(node))
1634
+ {
1635
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1636
+ TY_(FreeNode)( doc, node);
1637
+ continue;
1638
+ }
1639
+
1640
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1641
+
1642
+ /* insert center as parent if heading is empty */
1643
+ if (element->content == NULL)
1644
+ {
1645
+ InsertNodeAsParent(element, node);
1646
+ continue;
1647
+ }
1648
+
1649
+ /* split heading and make center parent of 2nd part */
1650
+ TY_(InsertNodeAfterElement)(element, node);
1651
+
1652
+ if (!(mode & Preformatted))
1653
+ TrimSpaces(doc, element);
1654
+
1655
+ element = TY_(CloneNode)( doc, element ); /* later tokens are parsed into the clone */
1656
+ TY_(InsertNodeAtEnd)(node, element);
1657
+ continue;
1658
+ }
1659
+
1660
+ if ( nodeIsHR(node) )
1661
+ {
1662
+ if ( !TY_(nodeIsElement)(node) )
1663
+ {
1664
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1665
+ TY_(FreeNode)( doc, node);
1666
+ continue;
1667
+ }
1668
+
1669
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1670
+
1671
+ /* insert hr before heading if heading is empty */
1672
+ if (element->content == NULL)
1673
+ {
1674
+ TY_(InsertNodeBeforeElement)(element, node);
1675
+ continue;
1676
+ }
1677
+
1678
+ /* split heading and insert hr before 2nd part */
1679
+ TY_(InsertNodeAfterElement)(element, node);
1680
+
1681
+ if (!(mode & Preformatted))
1682
+ TrimSpaces(doc, element);
1683
+
1684
+ element = TY_(CloneNode)( doc, element ); /* later tokens are parsed into the clone */
1685
+ TY_(InsertNodeAfterElement)(node, element);
1686
+ continue;
1687
+ }
1688
+ }
1689
+
1690
+ if ( nodeIsDT(element) )
1691
+ {
1692
+ if ( nodeIsHR(node) )
1693
+ {
1694
+ Node *dd;
1695
+ if ( !TY_(nodeIsElement)(node) )
1696
+ {
1697
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1698
+ TY_(FreeNode)( doc, node);
1699
+ continue;
1700
+ }
1701
+
1702
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
1703
+ dd = TY_(InferredTag)(doc, TidyTag_DD);
1704
+
1705
+ /* insert hr within dd before dt if dt is empty */
1706
+ if (element->content == NULL)
1707
+ {
1708
+ TY_(InsertNodeBeforeElement)(element, dd);
1709
+ TY_(InsertNodeAtEnd)(dd, node);
1710
+ continue;
1711
+ }
1712
+
1713
+ /* split dt and insert hr within dd before 2nd part */
1714
+ TY_(InsertNodeAfterElement)(element, dd);
1715
+ TY_(InsertNodeAtEnd)(dd, node);
1716
+
1717
+ if (!(mode & Preformatted))
1718
+ TrimSpaces(doc, element);
1719
+
1720
+ element = TY_(CloneNode)( doc, element ); /* later tokens are parsed into the clone */
1721
+ TY_(InsertNodeAfterElement)(dd, element);
1722
+ continue;
1723
+ }
1724
+ }
1725
+
1726
+
1727
+ /*
1728
+ if this is the end tag for an ancestor element
1729
+ then infer end tag for this element
1730
+ */
1731
+ if (node->type == EndTag)
1732
+ {
1733
+ for (parent = element->parent;
1734
+ parent != NULL; parent = parent->parent)
1735
+ {
1736
+ if (node->tag == parent->tag)
1737
+ {
1738
+ if (!(element->tag->model & CM_OPT) && !element->implicit)
1739
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1740
+
1741
+ if( TY_(IsPushedLast)( doc, element, node ) )
1742
+ TY_(PopInline)( doc, element );
1743
+ TY_(UngetToken)( doc );
1744
+
1745
+ if (!(mode & Preformatted))
1746
+ TrimSpaces(doc, element);
1747
+
1748
+ return;
1749
+ }
1750
+ }
1751
+ }
1752
+
1753
+ /* block level tags end this element */
1754
+ if (!(node->tag->model & CM_INLINE) &&
1755
+ !(element->tag->model & CM_MIXED))
1756
+ {
1757
+ if ( !TY_(nodeIsElement)(node) )
1758
+ {
1759
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1760
+ TY_(FreeNode)( doc, node);
1761
+ continue;
1762
+ }
1763
+
1764
+ if (!(element->tag->model & CM_OPT))
1765
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
1766
+
1767
+ if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
1768
+ {
1769
+ MoveToHead(doc, element, node);
1770
+ continue;
1771
+ }
1772
+
1773
+ /*
1774
+ prevent anchors from propagating into block tags
1775
+ except for headings h1 to h6
1776
+ */
1777
+ if ( nodeIsA(element) )
1778
+ {
1779
+ if (node->tag && !(node->tag->model & CM_HEADING))
1780
+ TY_(PopInline)( doc, element );
1781
+ else if (!(element->content))
1782
+ {
1783
+ TY_(DiscardElement)( doc, element );
1784
+ TY_(UngetToken)( doc );
1785
+ return;
1786
+ }
1787
+ }
1788
+
1789
+ TY_(UngetToken)( doc );
1790
+
1791
+ if (!(mode & Preformatted))
1792
+ TrimSpaces(doc, element);
1793
+
1794
+ return;
1795
+ }
1796
+
1797
+ /* parse inline element */
1798
+ if (TY_(nodeIsElement)(node))
1799
+ {
1800
+ if (node->implicit)
1801
+ TY_(ReportError)(doc, element, node, INSERTING_TAG);
1802
+
1803
+ /* trim white space before <br> */
1804
+ if ( nodeIsBR(node) )
1805
+ TrimSpaces(doc, element);
1806
+
1807
+ TY_(InsertNodeAtEnd)(element, node);
1808
+ ParseTag(doc, node, mode);
1809
+ continue;
1810
+ }
1811
+
1812
+ /* discard unexpected tags */
1813
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
1814
+ TY_(FreeNode)( doc, node );
1815
+ continue;
1816
+ }
1817
+
1818
+ if (!(element->tag->model & CM_OPT))
1819
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR); /* node is NULL here: the loop only falls through when GetToken returns NULL */
1820
+
1821
+ }
1822
+
1823
+ void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
1824
+ {
1825
+ Lexer* lexer = doc->lexer;
1826
+ if ( lexer->isvoyager )
1827
+ {
1828
+ Node *node = TY_(GetToken)( doc, mode);
1829
+ if ( node )
1830
+ {
1831
+ if ( !(node->type == EndTag && node->tag == element->tag) )
1832
+ {
1833
+ TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
1834
+ TY_(UngetToken)( doc );
1835
+ }
1836
+ else
1837
+ {
1838
+ TY_(FreeNode)( doc, node );
1839
+ }
1840
+ }
1841
+ }
1842
+ }
1843
+
1844
/*
  ParseDefList: parse the content of a definition list ("list").

  - the list's own end tag closes it;
  - a text token is pushed back and an inferred <dt> is parsed instead;
  - other content that is not <dt>/<dd> is pushed back and either ends the
    list or is wrapped in an inferred <dd>;
  - <center> is moved out of the list (after it, or before it when the
    list has no content yet) and, if it survives parsing, an inferred
    <dl> is inserted after it and becomes the list being filled;
  - unknown tags and stray end tags are discarded.

  Tokens are always fetched with IgnoreWhitespace; "mode" is only passed
  on when parsing the content of a <center>.
*/
+ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
1845
+ {
1846
+ Lexer* lexer = doc->lexer;
1847
+ Node *node, *parent;
1848
+
1849
+ if (list->tag->model & CM_EMPTY)
1850
+ return;
1851
+
1852
+ lexer->insert = NULL; /* defer implicit inline start tags */
1853
+
1854
+ while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
1855
+ {
1856
+ if (node->tag == list->tag && node->type == EndTag)
1857
+ {
1858
+ TY_(FreeNode)( doc, node);
1859
+ list->closed = yes;
1860
+ return;
1861
+ }
1862
+
1863
+ /* deal with comments etc. */
1864
+ if (InsertMisc(list, node))
1865
+ continue;
1866
+
1867
+ if (TY_(nodeIsText)(node))
1868
+ {
1869
+ TY_(UngetToken)( doc );
1870
+ node = TY_(InferredTag)(doc, TidyTag_DT);
1871
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1872
+ }
1873
+
1874
+ if (node->tag == NULL)
1875
+ {
1876
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1877
+ TY_(FreeNode)( doc, node);
1878
+ continue;
1879
+ }
1880
+
1881
+ /*
1882
+ if this is the end tag for an ancestor element
1883
+ then infer end tag for this element
1884
+ */
1885
+ if (node->type == EndTag)
1886
+ {
1887
+ Bool discardIt = no;
1888
+ if ( nodeIsFORM(node) )
1889
+ {
1890
+ BadForm( doc );
1891
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1892
+ TY_(FreeNode)( doc, node );
1893
+ continue;
1894
+ }
1895
+
1896
+ for (parent = list->parent;
1897
+ parent != NULL; parent = parent->parent)
1898
+ {
1899
+ /* Do not match across BODY to avoid infinite loop
1900
+ between ParseBody and this parser,
1901
+ See http://tidy.sf.net/bug/1098012. */
1902
+ if (nodeIsBODY(parent))
1903
+ {
1904
+ discardIt = yes;
1905
+ break;
1906
+ }
1907
+ if (node->tag == parent->tag)
1908
+ {
1909
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
1910
+
1911
+ TY_(UngetToken)( doc );
1912
+ return;
1913
+ }
1914
+ }
1915
+ if (discardIt)
1916
+ {
1917
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1918
+ TY_(FreeNode)( doc, node);
1919
+ continue;
1920
+ }
1921
+ }
1922
+
1923
+ /* center in a dt or a dl breaks the dl list in two */
1924
+ if ( nodeIsCENTER(node) )
1925
+ {
1926
+ if (list->content)
1927
+ TY_(InsertNodeAfterElement)(list, node);
1928
+ else /* trim empty dl list */
1929
+ {
1930
+ TY_(InsertNodeBeforeElement)(list, node);
1931
+
1932
+ /* #540296 tidy dumps with empty definition list */
1933
+ #if 0
1934
+ TY_(DiscardElement)(list);
1935
+ #endif
1936
+ }
1937
+
1938
+ /* #426885 - fix by Glenn Carroll 19 Apr 00, and
1939
+ Gary Dechaines 11 Aug 00 */
1940
+ /* ParseTag can destroy node, if it finds that
1941
+ * this <center> is followed immediately by </center>.
1942
+ * It's awkward but necessary to determine if this
1943
+ * has happened.
1944
+ */
1945
+ parent = node->parent;
1946
+
1947
+ /* and parse contents of center */
1948
+ lexer->excludeBlocks = no;
1949
+ ParseTag( doc, node, mode);
1950
+ lexer->excludeBlocks = yes;
1951
+
1952
+ /* now create a new dl element,
1953
+ * unless node has been blown away because the
1954
+ * center was empty, as above.
1955
+ */
1956
+ if (parent->last == node)
1957
+ {
1958
+ list = TY_(InferredTag)(doc, TidyTag_DL); /* later <dt>/<dd> go into this new list */
1959
+ TY_(InsertNodeAfterElement)(node, list);
1960
+ }
1961
+ continue;
1962
+ }
1963
+
1964
+ if ( !(nodeIsDT(node) || nodeIsDD(node)) )
1965
+ {
1966
+ TY_(UngetToken)( doc );
1967
+
1968
+ if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
1969
+ {
1970
+ TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
1971
+ return;
1972
+ }
1973
+
1974
+ /* if DD appeared directly in BODY then exclude blocks */
1975
+ if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
1976
+ return;
1977
+
1978
+ node = TY_(InferredTag)(doc, TidyTag_DD);
1979
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1980
+ }
1981
+
1982
+ if (node->type == EndTag)
1983
+ {
1984
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1985
+ TY_(FreeNode)( doc, node);
1986
+ continue;
1987
+ }
1988
+
1989
+ /* node should be <DT> or <DD>*/
1990
+ TY_(InsertNodeAtEnd)(list, node);
1991
+ ParseTag( doc, node, IgnoreWhitespace);
1992
+ }
1993
+
1994
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR); /* node is NULL here: input ended before the list's end tag */
1995
+ }
1996
+
1997
+ static Bool FindLastLI( Node *list, Node **lastli )
1998
+ {
1999
+ Node *node;
2000
+
2001
+ *lastli = NULL;
2002
+ for ( node = list->content; node ; node = node->next )
2003
+ if ( nodeIsLI(node) && node->type == StartTag )
2004
+ *lastli=node;
2005
+ return *lastli ? yes:no;
2006
+ }
2007
+
2008
/*
  ParseList: parse the content of a list element ("list").

  - the list's own end tag closes it;
  - <li> children are appended to the list and parsed;
  - any other start tag or text is pushed back and then parsed inside an
    <li>: the last existing <li> when "list" is an <ol> that has one,
    otherwise a newly inferred <li> appended to the list;
  - unknown tags and stray end tags are discarded, except that an end tag
    matching an ancestor below BODY ends the list.

  "mode" is unused; tokens are always read with IgnoreWhitespace.
*/
+ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
2009
+ {
2010
+ Lexer* lexer = doc->lexer;
2011
+ Node *node, *parent, *lastli;
2012
+ Bool wasblock;
2013
+
2014
+ if (list->tag->model & CM_EMPTY)
2015
+ return;
2016
+
2017
+ lexer->insert = NULL; /* defer implicit inline start tags */
2018
+
2019
+ while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
2020
+ {
2021
+ if (node->tag == list->tag && node->type == EndTag)
2022
+ {
2023
+ TY_(FreeNode)( doc, node);
2024
+ list->closed = yes;
2025
+ return;
2026
+ }
2027
+
2028
+ /* deal with comments etc. */
2029
+ if (InsertMisc(list, node))
2030
+ continue;
2031
+
2032
+ if (node->type != TextNode && node->tag == NULL)
2033
+ {
2034
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2035
+ TY_(FreeNode)( doc, node);
2036
+ continue;
2037
+ }
2038
+
2039
+ /*
2040
+ if this is the end tag for an ancestor element
2041
+ then infer end tag for this element
2042
+ */
2043
+ if (node->type == EndTag)
2044
+ {
2045
+ if ( nodeIsFORM(node) )
2046
+ {
2047
+ BadForm( doc );
2048
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2049
+ TY_(FreeNode)( doc, node );
2050
+ continue;
2051
+ }
2052
+
2053
+ if (TY_(nodeHasCM)(node,CM_INLINE))
2054
+ {
2055
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2056
+ TY_(PopInline)( doc, node );
2057
+ TY_(FreeNode)( doc, node);
2058
+ continue;
2059
+ }
2060
+
2061
+ for ( parent = list->parent;
2062
+ parent != NULL; parent = parent->parent )
2063
+ {
2064
+ /* Do not match across BODY to avoid infinite loop
2065
+ between ParseBody and this parser,
2066
+ See http://tidy.sf.net/bug/1053626. */
2067
+ if (nodeIsBODY(parent))
2068
+ break;
2069
+ if (node->tag == parent->tag)
2070
+ {
2071
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2072
+ TY_(UngetToken)( doc );
2073
+ return;
2074
+ }
2075
+ }
2076
+
2077
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2078
+ TY_(FreeNode)( doc, node);
2079
+ continue;
2080
+ }
2081
+
2082
+ if ( !nodeIsLI(node) )
2083
+ {
2084
+ TY_(UngetToken)( doc );
2085
+
2086
+ if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
2087
+ {
2088
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2089
+ return;
2090
+ }
2091
+ /* http://tidy.sf.net/issue/1316307 */
2092
+ /* In exiled mode, return so table processing can continue. */
2093
+ else if ( lexer->exiled
2094
+ && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
2095
+ || nodeIsTABLE(node)) )
2096
+ return;
2097
+
2098
+ /* http://tidy.sf.net/issue/836462
2099
+ If "list" is an ordered list (<ol>), insert the next tag within
2100
+ the last <li> to preserve the numbering to match the visual
2101
+ rendering of most browsers. */
2102
+ if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
2103
+ {
2104
+ /* Create a node for error reporting */
2105
+ node = TY_(InferredTag)(doc, TidyTag_LI);
2106
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2107
+ TY_(FreeNode)( doc, node);
2108
+ node = lastli; /* the pushed-back token is parsed into the existing <li> below */
2109
+ }
2110
+ else
2111
+ {
2112
+ /* Add an inferred <li> */
2113
+ wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
2114
+ node = TY_(InferredTag)(doc, TidyTag_LI);
2115
+ /* Add "display: inline" to avoid a blank line after <li> with
2116
+ Internet Explorer. See http://tidy.sf.net/issue/836462 */
2117
+ TY_(AddStyleProperty)( doc, node,
2118
+ wasblock
2119
+ ? "list-style: none; display: inline"
2120
+ : "list-style: none"
2121
+ );
2122
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2123
+ TY_(InsertNodeAtEnd)(list,node);
2124
+ }
2125
+ }
2126
+ else
2127
+ /* node is <LI> */
2128
+ TY_(InsertNodeAtEnd)(list,node);
2129
+
2130
+ ParseTag( doc, node, IgnoreWhitespace);
2131
+ }
2132
+
2133
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR); /* node is NULL here: input ended before the list's end tag */
2134
+ }
2135
+
2136
+ /*
2137
+ unexpected content in table row is moved to just before
2138
+ the table in accordance with Netscape and IE. This code
2139
+ assumes that node hasn't been inserted into the row.
2140
+ */
2141
+ static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
2142
+ Node *node )
2143
+ {
2144
+ Node *table;
2145
+
2146
+ /* first find the table element */
2147
+ for (table = row->parent; table; table = table->parent)
2148
+ {
2149
+ if ( nodeIsTABLE(table) )
2150
+ {
2151
+ TY_(InsertNodeBeforeElement)( table, node );
2152
+ return;
2153
+ }
2154
+ }
2155
+ /* No table element */
2156
+ TY_(InsertNodeBeforeElement)( row->parent, node );
2157
+ }
2158
+
2159
+ /*
2160
+ if a table row is empty then insert an empty cell
2161
+ this practice is consistent with browser behavior
2162
+ and avoids potential problems with row spanning cells
2163
+ */
2164
+ static void FixEmptyRow(TidyDocImpl* doc, Node *row)
2165
+ {
2166
+ Node *cell;
2167
+
2168
+ if (row->content == NULL)
2169
+ {
2170
+ cell = TY_(InferredTag)(doc, TidyTag_TD);
2171
+ TY_(InsertNodeAtEnd)(row, cell);
2172
+ TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
2173
+ }
2174
+ }
2175
+
2176
+ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
2177
+ {
2178
+ Lexer* lexer = doc->lexer;
2179
+ Node *node;
2180
+ Bool exclude_state;
2181
+
2182
+ if (row->tag->model & CM_EMPTY)
2183
+ return;
2184
+
2185
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2186
+ {
2187
+ if (node->tag == row->tag)
2188
+ {
2189
+ if (node->type == EndTag)
2190
+ {
2191
+ TY_(FreeNode)( doc, node);
2192
+ row->closed = yes;
2193
+ FixEmptyRow( doc, row);
2194
+ return;
2195
+ }
2196
+
2197
+ /* New row start implies end of current row */
2198
+ TY_(UngetToken)( doc );
2199
+ FixEmptyRow( doc, row);
2200
+ return;
2201
+ }
2202
+
2203
+ /*
2204
+ if this is the end tag for an ancestor element
2205
+ then infer end tag for this element
2206
+ */
2207
+ if ( node->type == EndTag )
2208
+ {
2209
+ if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
2210
+ && DescendantOf(row, TagId(node)) )
2211
+ {
2212
+ TY_(UngetToken)( doc );
2213
+ return;
2214
+ }
2215
+
2216
+ if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2217
+ {
2218
+ if ( nodeIsFORM(node) )
2219
+ BadForm( doc );
2220
+
2221
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2222
+ TY_(FreeNode)( doc, node);
2223
+ continue;
2224
+ }
2225
+
2226
+ if ( nodeIsTD(node) || nodeIsTH(node) )
2227
+ {
2228
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2229
+ TY_(FreeNode)( doc, node);
2230
+ continue;
2231
+ }
2232
+ }
2233
+
2234
+ /* deal with comments etc. */
2235
+ if (InsertMisc(row, node))
2236
+ continue;
2237
+
2238
+ /* discard unknown tags */
2239
+ if (node->tag == NULL && node->type != TextNode)
2240
+ {
2241
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2242
+ TY_(FreeNode)( doc, node);
2243
+ continue;
2244
+ }
2245
+
2246
+ /* discard unexpected <table> element */
2247
+ if ( nodeIsTABLE(node) )
2248
+ {
2249
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2250
+ TY_(FreeNode)( doc, node);
2251
+ continue;
2252
+ }
2253
+
2254
+ /* THEAD, TFOOT or TBODY */
2255
+ if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
2256
+ {
2257
+ TY_(UngetToken)( doc );
2258
+ return;
2259
+ }
2260
+
2261
+ if (node->type == EndTag)
2262
+ {
2263
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2264
+ TY_(FreeNode)( doc, node);
2265
+ continue;
2266
+ }
2267
+
2268
+ /*
2269
+ if text or inline or block move before table
2270
+ if head content move to head
2271
+ */
2272
+
2273
+ if (node->type != EndTag)
2274
+ {
2275
+ if ( nodeIsFORM(node) )
2276
+ {
2277
+ TY_(UngetToken)( doc );
2278
+ node = TY_(InferredTag)(doc, TidyTag_TD);
2279
+ TY_(ReportError)(doc, row, node, MISSING_STARTTAG);
2280
+ }
2281
+ else if ( TY_(nodeIsText)(node)
2282
+ || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
2283
+ {
2284
+ MoveBeforeTable( doc, row, node );
2285
+ TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2286
+ lexer->exiled = yes;
2287
+ exclude_state = lexer->excludeBlocks;
2288
+ lexer->excludeBlocks = no;
2289
+
2290
+ if (node->type != TextNode)
2291
+ ParseTag( doc, node, IgnoreWhitespace);
2292
+
2293
+ lexer->exiled = no;
2294
+ lexer->excludeBlocks = exclude_state;
2295
+ continue;
2296
+ }
2297
+ else if (node->tag->model & CM_HEAD)
2298
+ {
2299
+ TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2300
+ MoveToHead( doc, row, node);
2301
+ continue;
2302
+ }
2303
+ }
2304
+
2305
+ if ( !(nodeIsTD(node) || nodeIsTH(node)) )
2306
+ {
2307
+ TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2308
+ TY_(FreeNode)( doc, node);
2309
+ continue;
2310
+ }
2311
+
2312
+ /* node should be <TD> or <TH> */
2313
+ TY_(InsertNodeAtEnd)(row, node);
2314
+ exclude_state = lexer->excludeBlocks;
2315
+ lexer->excludeBlocks = no;
2316
+ ParseTag( doc, node, IgnoreWhitespace);
2317
+ lexer->excludeBlocks = exclude_state;
2318
+
2319
+ /* pop inline stack */
2320
+
2321
+ while ( lexer->istacksize > lexer->istackbase )
2322
+ TY_(PopInline)( doc, NULL );
2323
+ }
2324
+
2325
+ }
2326
+
2327
+ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
2328
+ {
2329
+ Lexer* lexer = doc->lexer;
2330
+ Node *node, *parent;
2331
+
2332
+ if (rowgroup->tag->model & CM_EMPTY)
2333
+ return;
2334
+
2335
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2336
+ {
2337
+ if (node->tag == rowgroup->tag)
2338
+ {
2339
+ if (node->type == EndTag)
2340
+ {
2341
+ rowgroup->closed = yes;
2342
+ TY_(FreeNode)( doc, node);
2343
+ return;
2344
+ }
2345
+
2346
+ TY_(UngetToken)( doc );
2347
+ return;
2348
+ }
2349
+
2350
+ /* if </table> infer end tag */
2351
+ if ( nodeIsTABLE(node) && node->type == EndTag )
2352
+ {
2353
+ TY_(UngetToken)( doc );
2354
+ return;
2355
+ }
2356
+
2357
+ /* deal with comments etc. */
2358
+ if (InsertMisc(rowgroup, node))
2359
+ continue;
2360
+
2361
+ /* discard unknown tags */
2362
+ if (node->tag == NULL && node->type != TextNode)
2363
+ {
2364
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2365
+ TY_(FreeNode)( doc, node);
2366
+ continue;
2367
+ }
2368
+
2369
+ /*
2370
+ if TD or TH then infer <TR>
2371
+ if text or inline or block move before table
2372
+ if head content move to head
2373
+ */
2374
+
2375
+ if (node->type != EndTag)
2376
+ {
2377
+ if ( nodeIsTD(node) || nodeIsTH(node) )
2378
+ {
2379
+ TY_(UngetToken)( doc );
2380
+ node = TY_(InferredTag)(doc, TidyTag_TR);
2381
+ TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2382
+ }
2383
+ else if ( TY_(nodeIsText)(node)
2384
+ || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2385
+ {
2386
+ MoveBeforeTable( doc, rowgroup, node );
2387
+ TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2388
+ lexer->exiled = yes;
2389
+
2390
+ if (node->type != TextNode)
2391
+ ParseTag(doc, node, IgnoreWhitespace);
2392
+
2393
+ lexer->exiled = no;
2394
+ continue;
2395
+ }
2396
+ else if (node->tag->model & CM_HEAD)
2397
+ {
2398
+ TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2399
+ MoveToHead(doc, rowgroup, node);
2400
+ continue;
2401
+ }
2402
+ }
2403
+
2404
+ /*
2405
+ if this is the end tag for ancestor element
2406
+ then infer end tag for this element
2407
+ */
2408
+ if (node->type == EndTag)
2409
+ {
2410
+ if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2411
+ {
2412
+ if ( nodeIsFORM(node) )
2413
+ BadForm( doc );
2414
+
2415
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2416
+ TY_(FreeNode)( doc, node);
2417
+ continue;
2418
+ }
2419
+
2420
+ if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
2421
+ {
2422
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2423
+ TY_(FreeNode)( doc, node);
2424
+ continue;
2425
+ }
2426
+
2427
+ for ( parent = rowgroup->parent;
2428
+ parent != NULL;
2429
+ parent = parent->parent )
2430
+ {
2431
+ if (node->tag == parent->tag)
2432
+ {
2433
+ TY_(UngetToken)( doc );
2434
+ return;
2435
+ }
2436
+ }
2437
+ }
2438
+
2439
+ /*
2440
+ if THEAD, TFOOT or TBODY then implied end tag
2441
+
2442
+ */
2443
+ if (node->tag->model & CM_ROWGRP)
2444
+ {
2445
+ if (node->type != EndTag)
2446
+ {
2447
+ TY_(UngetToken)( doc );
2448
+ return;
2449
+ }
2450
+ }
2451
+
2452
+ if (node->type == EndTag)
2453
+ {
2454
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2455
+ TY_(FreeNode)( doc, node);
2456
+ continue;
2457
+ }
2458
+
2459
+ if ( !nodeIsTR(node) )
2460
+ {
2461
+ node = TY_(InferredTag)(doc, TidyTag_TR);
2462
+ TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2463
+ TY_(UngetToken)( doc );
2464
+ }
2465
+
2466
+ /* node should be <TR> */
2467
+ TY_(InsertNodeAtEnd)(rowgroup, node);
2468
+ ParseTag(doc, node, IgnoreWhitespace);
2469
+ }
2470
+
2471
+ }
2472
+
2473
+ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
2474
+ {
2475
+ Node *node, *parent;
2476
+
2477
+ if (colgroup->tag->model & CM_EMPTY)
2478
+ return;
2479
+
2480
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2481
+ {
2482
+ if (node->tag == colgroup->tag && node->type == EndTag)
2483
+ {
2484
+ TY_(FreeNode)( doc, node);
2485
+ colgroup->closed = yes;
2486
+ return;
2487
+ }
2488
+
2489
+ /*
2490
+ if this is the end tag for an ancestor element
2491
+ then infer end tag for this element
2492
+ */
2493
+ if (node->type == EndTag)
2494
+ {
2495
+ if ( nodeIsFORM(node) )
2496
+ {
2497
+ BadForm( doc );
2498
+ TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2499
+ TY_(FreeNode)( doc, node);
2500
+ continue;
2501
+ }
2502
+
2503
+ for ( parent = colgroup->parent;
2504
+ parent != NULL;
2505
+ parent = parent->parent )
2506
+ {
2507
+ if (node->tag == parent->tag)
2508
+ {
2509
+ TY_(UngetToken)( doc );
2510
+ return;
2511
+ }
2512
+ }
2513
+ }
2514
+
2515
+ if (TY_(nodeIsText)(node))
2516
+ {
2517
+ TY_(UngetToken)( doc );
2518
+ return;
2519
+ }
2520
+
2521
+ /* deal with comments etc. */
2522
+ if (InsertMisc(colgroup, node))
2523
+ continue;
2524
+
2525
+ /* discard unknown tags */
2526
+ if (node->tag == NULL)
2527
+ {
2528
+ TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2529
+ TY_(FreeNode)( doc, node);
2530
+ continue;
2531
+ }
2532
+
2533
+ if ( !nodeIsCOL(node) )
2534
+ {
2535
+ TY_(UngetToken)( doc );
2536
+ return;
2537
+ }
2538
+
2539
+ if (node->type == EndTag)
2540
+ {
2541
+ TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2542
+ TY_(FreeNode)( doc, node);
2543
+ continue;
2544
+ }
2545
+
2546
+ /* node should be <COL> */
2547
+ TY_(InsertNodeAtEnd)(colgroup, node);
2548
+ ParseTag(doc, node, IgnoreWhitespace);
2549
+ }
2550
+ }
2551
+
2552
+ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
2553
+ {
2554
+ Lexer* lexer = doc->lexer;
2555
+ Node *node, *parent;
2556
+ uint istackbase;
2557
+
2558
+ TY_(DeferDup)( doc );
2559
+ istackbase = lexer->istackbase;
2560
+ lexer->istackbase = lexer->istacksize;
2561
+
2562
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2563
+ {
2564
+ if (node->tag == table->tag && node->type == EndTag)
2565
+ {
2566
+ TY_(FreeNode)( doc, node);
2567
+ lexer->istackbase = istackbase;
2568
+ table->closed = yes;
2569
+ return;
2570
+ }
2571
+
2572
+ /* deal with comments etc. */
2573
+ if (InsertMisc(table, node))
2574
+ continue;
2575
+
2576
+ /* discard unknown tags */
2577
+ if (node->tag == NULL && node->type != TextNode)
2578
+ {
2579
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2580
+ TY_(FreeNode)( doc, node);
2581
+ continue;
2582
+ }
2583
+
2584
+ /* if TD or TH or text or inline or block then infer <TR> */
2585
+
2586
+ if (node->type != EndTag)
2587
+ {
2588
+ if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
2589
+ {
2590
+ TY_(UngetToken)( doc );
2591
+ node = TY_(InferredTag)(doc, TidyTag_TR);
2592
+ TY_(ReportError)(doc, table, node, MISSING_STARTTAG);
2593
+ }
2594
+ else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
2595
+ {
2596
+ TY_(InsertNodeBeforeElement)(table, node);
2597
+ TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2598
+ lexer->exiled = yes;
2599
+
2600
+ if (node->type != TextNode)
2601
+ ParseTag(doc, node, IgnoreWhitespace);
2602
+
2603
+ lexer->exiled = no;
2604
+ continue;
2605
+ }
2606
+ else if (node->tag->model & CM_HEAD)
2607
+ {
2608
+ MoveToHead(doc, table, node);
2609
+ continue;
2610
+ }
2611
+ }
2612
+
2613
+ /*
2614
+ if this is the end tag for an ancestor element
2615
+ then infer end tag for this element
2616
+ */
2617
+ if (node->type == EndTag)
2618
+ {
2619
+ if ( nodeIsFORM(node) )
2620
+ {
2621
+ BadForm( doc );
2622
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2623
+ TY_(FreeNode)( doc, node);
2624
+ continue;
2625
+ }
2626
+
2627
+ /* best to discard unexpected block/inline end tags */
2628
+ if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
2629
+ TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2630
+ {
2631
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2632
+ TY_(FreeNode)( doc, node);
2633
+ continue;
2634
+ }
2635
+
2636
+ for ( parent = table->parent;
2637
+ parent != NULL;
2638
+ parent = parent->parent )
2639
+ {
2640
+ if (node->tag == parent->tag)
2641
+ {
2642
+ TY_(ReportError)(doc, table, node, MISSING_ENDTAG_BEFORE );
2643
+ TY_(UngetToken)( doc );
2644
+ lexer->istackbase = istackbase;
2645
+ return;
2646
+ }
2647
+ }
2648
+ }
2649
+
2650
+ if (!(node->tag->model & CM_TABLE))
2651
+ {
2652
+ TY_(UngetToken)( doc );
2653
+ TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2654
+ lexer->istackbase = istackbase;
2655
+ return;
2656
+ }
2657
+
2658
+ if (TY_(nodeIsElement)(node))
2659
+ {
2660
+ TY_(InsertNodeAtEnd)(table, node);
2661
+ ParseTag(doc, node, IgnoreWhitespace);
2662
+ continue;
2663
+ }
2664
+
2665
+ /* discard unexpected text nodes and end tags */
2666
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2667
+ TY_(FreeNode)( doc, node);
2668
+ }
2669
+
2670
+ TY_(ReportError)(doc, table, node, MISSING_ENDTAG_FOR);
2671
+ lexer->istackbase = istackbase;
2672
+ }
2673
+
2674
+ /* acceptable content for pre elements */
2675
+ static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
2676
+ {
2677
+ /* p is coerced to br's, Text OK too */
2678
+ if ( nodeIsP(node) || TY_(nodeIsText)(node) )
2679
+ return yes;
2680
+
2681
+ if ( node->tag == NULL ||
2682
+ nodeIsPARAM(node) ||
2683
+ !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
2684
+ return no;
2685
+
2686
+ return yes;
2687
+ }
2688
+
2689
+ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
2690
+ {
2691
+ Node *node;
2692
+
2693
+ if (pre->tag->model & CM_EMPTY)
2694
+ return;
2695
+
2696
+ TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
2697
+
2698
+ while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
2699
+ {
2700
+ if ( node->type == EndTag &&
2701
+ (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
2702
+ {
2703
+ if (nodeIsBODY(node) || nodeIsHTML(node))
2704
+ {
2705
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2706
+ TY_(FreeNode)(doc, node);
2707
+ continue;
2708
+ }
2709
+ if (node->tag == pre->tag)
2710
+ {
2711
+ TY_(FreeNode)(doc, node);
2712
+ }
2713
+ else
2714
+ {
2715
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE );
2716
+ TY_(UngetToken)( doc );
2717
+ }
2718
+ pre->closed = yes;
2719
+ TrimSpaces(doc, pre);
2720
+ return;
2721
+ }
2722
+
2723
+ if (TY_(nodeIsText)(node))
2724
+ {
2725
+ TY_(InsertNodeAtEnd)(pre, node);
2726
+ continue;
2727
+ }
2728
+
2729
+ /* deal with comments etc. */
2730
+ if (InsertMisc(pre, node))
2731
+ continue;
2732
+
2733
+ if (node->tag == NULL)
2734
+ {
2735
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2736
+ TY_(FreeNode)(doc, node);
2737
+ continue;
2738
+ }
2739
+
2740
+ /* strip unexpected tags */
2741
+ if ( !PreContent(doc, node) )
2742
+ {
2743
+ Node *newnode;
2744
+
2745
+ /* fix for http://tidy.sf.net/bug/772205 */
2746
+ if (node->type == EndTag)
2747
+ {
2748
+ /* http://tidy.sf.net/issue/1590220 */
2749
+ if ( doc->lexer->exiled
2750
+ && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
2751
+ {
2752
+ TY_(UngetToken)(doc);
2753
+ TrimSpaces(doc, pre);
2754
+ return;
2755
+ }
2756
+
2757
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2758
+ TY_(FreeNode)(doc, node);
2759
+ continue;
2760
+ }
2761
+ /* http://tidy.sf.net/issue/1590220 */
2762
+ else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
2763
+ || nodeIsTABLE(node) )
2764
+ {
2765
+ if (!doc->lexer->exiled)
2766
+ /* No missing close warning if exiled. */
2767
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2768
+
2769
+ TY_(UngetToken)(doc);
2770
+ return;
2771
+ }
2772
+
2773
+ /*
2774
+ This is basically what Tidy 04 August 2000 did and far more accurate
2775
+ with respect to browser behaivour than the code commented out above.
2776
+ Tidy could try to propagate the <pre> into each disallowed child where
2777
+ <pre> is allowed in order to replicate some browsers behaivour, but
2778
+ there are a lot of exceptions, e.g. Internet Explorer does not propagate
2779
+ <pre> into table cells while Mozilla does. Opera 6 never propagates
2780
+ <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
2781
+
2782
+ Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
2783
+ which are handled like Mozilla takes them (Opera6 closes all <pre> after
2784
+ the first </pre>).
2785
+
2786
+ There are similar issues like replacing <p> in <pre> with <br>, for
2787
+ example
2788
+
2789
+ <pre>...<p>...</pre> (Input)
2790
+ <pre>...<br>...</pre> (Tidy)
2791
+ <pre>...<br>...</pre> (Opera 7 and Internet Explorer)
2792
+ <pre>...<br><br>...</pre> (Opera 6 and Mozilla)
2793
+
2794
+ <pre>...<p>...</p>...</pre> (Input)
2795
+ <pre>...<br>......</pre> (Tidy, BUG!)
2796
+ <pre>...<br>...<br>...</pre> (Internet Explorer)
2797
+ <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
2798
+ <pre>...<br>...<br><br>...</pre> (Opera 7)
2799
+
2800
+ or something similar, they could also be closing the <pre> and propagate
2801
+ the <pre> into the newly opened <p>.
2802
+
2803
+ Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
2804
+ dissallowed in <pre>, Tidy neither detects this nor does it perform any
2805
+ cleanup operation. Tidy should at least issue a warning if it encounters
2806
+ such constructs.
2807
+
2808
+ Todo: discarding </p> is abviously a bug, it should be replaced by <br>.
2809
+ */
2810
+ TY_(InsertNodeAfterElement)(pre, node);
2811
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2812
+ ParseTag(doc, node, IgnoreWhitespace);
2813
+
2814
+ newnode = TY_(InferredTag)(doc, TidyTag_PRE);
2815
+ TY_(ReportError)(doc, pre, newnode, INSERTING_TAG);
2816
+ pre = newnode;
2817
+ TY_(InsertNodeAfterElement)(node, pre);
2818
+
2819
+ continue;
2820
+ }
2821
+
2822
+ if ( nodeIsP(node) )
2823
+ {
2824
+ if (node->type == StartTag)
2825
+ {
2826
+ TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF);
2827
+
2828
+ /* trim white space before <p> in <pre>*/
2829
+ TrimSpaces(doc, pre);
2830
+
2831
+ /* coerce both <p> and </p> to <br> */
2832
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
2833
+ TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
2834
+ TY_(InsertNodeAtEnd)( pre, node );
2835
+ }
2836
+ else
2837
+ {
2838
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2839
+ TY_(FreeNode)( doc, node);
2840
+ }
2841
+ continue;
2842
+ }
2843
+
2844
+ if ( TY_(nodeIsElement)(node) )
2845
+ {
2846
+ /* trim white space before <br> */
2847
+ if ( nodeIsBR(node) )
2848
+ TrimSpaces(doc, pre);
2849
+
2850
+ TY_(InsertNodeAtEnd)(pre, node);
2851
+ ParseTag(doc, node, Preformatted);
2852
+ continue;
2853
+ }
2854
+
2855
+ /* discard unexpected tags */
2856
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2857
+ TY_(FreeNode)( doc, node);
2858
+ }
2859
+
2860
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR);
2861
+ }
2862
+
2863
+ void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2864
+ {
2865
+ Lexer* lexer = doc->lexer;
2866
+ Node *node;
2867
+
2868
+ lexer->insert = NULL; /* defer implicit inline start tags */
2869
+
2870
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2871
+ {
2872
+ if (node->tag == field->tag && node->type == EndTag)
2873
+ {
2874
+ TY_(FreeNode)( doc, node);
2875
+ field->closed = yes;
2876
+ TrimSpaces(doc, field);
2877
+ return;
2878
+ }
2879
+
2880
+ /* deal with comments etc. */
2881
+ if (InsertMisc(field, node))
2882
+ continue;
2883
+
2884
+ if ( node->type == StartTag &&
2885
+ (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
2886
+ {
2887
+ if ( nodeIsOPTGROUP(node) )
2888
+ TY_(ReportError)(doc, field, node, CANT_BE_NESTED);
2889
+
2890
+ TY_(InsertNodeAtEnd)(field, node);
2891
+ ParseTag(doc, node, MixedContent);
2892
+ continue;
2893
+ }
2894
+
2895
+ /* discard unexpected tags */
2896
+ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED );
2897
+ TY_(FreeNode)( doc, node);
2898
+ }
2899
+ }
2900
+
2901
+
2902
+ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2903
+ {
2904
+ Lexer* lexer = doc->lexer;
2905
+ Node *node;
2906
+
2907
+ lexer->insert = NULL; /* defer implicit inline start tags */
2908
+
2909
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2910
+ {
2911
+ if (node->tag == field->tag && node->type == EndTag)
2912
+ {
2913
+ TY_(FreeNode)( doc, node);
2914
+ field->closed = yes;
2915
+ TrimSpaces(doc, field);
2916
+ return;
2917
+ }
2918
+
2919
+ /* deal with comments etc. */
2920
+ if (InsertMisc(field, node))
2921
+ continue;
2922
+
2923
+ if ( node->type == StartTag &&
2924
+ ( nodeIsOPTION(node) ||
2925
+ nodeIsOPTGROUP(node) ||
2926
+ nodeIsSCRIPT(node))
2927
+ )
2928
+ {
2929
+ TY_(InsertNodeAtEnd)(field, node);
2930
+ ParseTag(doc, node, IgnoreWhitespace);
2931
+ continue;
2932
+ }
2933
+
2934
+ /* discard unexpected tags */
2935
+ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2936
+ TY_(FreeNode)( doc, node);
2937
+ }
2938
+
2939
+ TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2940
+ }
2941
+
2942
+ void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
2943
+ {
2944
+ Lexer* lexer = doc->lexer;
2945
+ Node *node;
2946
+
2947
+ lexer->insert = NULL; /* defer implicit inline start tags */
2948
+
2949
+ if ( nodeIsTEXTAREA(field) )
2950
+ mode = Preformatted;
2951
+ else
2952
+ mode = MixedContent; /* kludge for font tags */
2953
+
2954
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
2955
+ {
2956
+ if (node->tag == field->tag && node->type == EndTag)
2957
+ {
2958
+ TY_(FreeNode)( doc, node);
2959
+ field->closed = yes;
2960
+ TrimSpaces(doc, field);
2961
+ return;
2962
+ }
2963
+
2964
+ /* deal with comments etc. */
2965
+ if (InsertMisc(field, node))
2966
+ continue;
2967
+
2968
+ if (TY_(nodeIsText)(node))
2969
+ {
2970
+ /* only called for 1st child */
2971
+ if (field->content == NULL && !(mode & Preformatted))
2972
+ TrimSpaces(doc, field);
2973
+
2974
+ if (node->start >= node->end)
2975
+ {
2976
+ TY_(FreeNode)( doc, node);
2977
+ continue;
2978
+ }
2979
+
2980
+ TY_(InsertNodeAtEnd)(field, node);
2981
+ continue;
2982
+ }
2983
+
2984
+ /* for textarea should all cases of < and & be escaped? */
2985
+
2986
+ /* discard inline tags e.g. font */
2987
+ if ( node->tag
2988
+ && node->tag->model & CM_INLINE
2989
+ && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
2990
+ {
2991
+ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2992
+ TY_(FreeNode)( doc, node);
2993
+ continue;
2994
+ }
2995
+
2996
+ /* terminate element on other tags */
2997
+ if (!(field->tag->model & CM_OPT))
2998
+ TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE);
2999
+
3000
+ TY_(UngetToken)( doc );
3001
+ TrimSpaces(doc, field);
3002
+ return;
3003
+ }
3004
+
3005
+ if (!(field->tag->model & CM_OPT))
3006
+ TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
3007
+ }
3008
+
3009
+
3010
+ void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode))
3011
+ {
3012
+ Node *node;
3013
+ while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
3014
+ {
3015
+ if (node->tag == title->tag && node->type == StartTag)
3016
+ {
3017
+ TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG);
3018
+ node->type = EndTag;
3019
+ TY_(UngetToken)( doc );
3020
+ continue;
3021
+ }
3022
+ else if (node->tag == title->tag && node->type == EndTag)
3023
+ {
3024
+ TY_(FreeNode)( doc, node);
3025
+ title->closed = yes;
3026
+ TrimSpaces(doc, title);
3027
+ return;
3028
+ }
3029
+
3030
+ if (TY_(nodeIsText)(node))
3031
+ {
3032
+ /* only called for 1st child */
3033
+ if (title->content == NULL)
3034
+ TrimInitialSpace(doc, title, node);
3035
+
3036
+ if (node->start >= node->end)
3037
+ {
3038
+ TY_(FreeNode)( doc, node);
3039
+ continue;
3040
+ }
3041
+
3042
+ TY_(InsertNodeAtEnd)(title, node);
3043
+ continue;
3044
+ }
3045
+
3046
+ /* deal with comments etc. */
3047
+ if (InsertMisc(title, node))
3048
+ continue;
3049
+
3050
+ /* discard unknown tags */
3051
+ if (node->tag == NULL)
3052
+ {
3053
+ TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED);
3054
+ TY_(FreeNode)( doc, node);
3055
+ continue;
3056
+ }
3057
+
3058
+ /* pushback unexpected tokens */
3059
+ TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE);
3060
+ TY_(UngetToken)( doc );
3061
+ TrimSpaces(doc, title);
3062
+ return;
3063
+ }
3064
+
3065
+ TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR);
3066
+ }
3067
+
3068
+ /*
3069
+ This isn't quite right for CDATA content as it recognises
3070
+ tags within the content and parses them accordingly.
3071
+ This will unfortunately screw up scripts which include
3072
+ < + letter, < + !, < + ? or < + / + letter
3073
+ */
3074
+
3075
+ void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode))
3076
+ {
3077
+ Node *node;
3078
+
3079
+ doc->lexer->parent = script;
3080
+ node = TY_(GetToken)(doc, CdataContent);
3081
+ doc->lexer->parent = NULL;
3082
+
3083
+ if (node)
3084
+ {
3085
+ TY_(InsertNodeAtEnd)(script, node);
3086
+ }
3087
+ else
3088
+ {
3089
+ /* handle e.g. a document like "<script>" */
3090
+ TY_(ReportError)(doc, script, NULL, MISSING_ENDTAG_FOR);
3091
+ return;
3092
+ }
3093
+
3094
+ node = TY_(GetToken)(doc, IgnoreWhitespace);
3095
+
3096
+ if (!(node && node->type == EndTag && node->tag &&
3097
+ node->tag->id == script->tag->id))
3098
+ {
3099
+ TY_(ReportError)(doc, script, node, MISSING_ENDTAG_FOR);
3100
+
3101
+ if (node)
3102
+ TY_(UngetToken)(doc);
3103
+ }
3104
+ else
3105
+ {
3106
+ TY_(FreeNode)(doc, node);
3107
+ }
3108
+ }
3109
+
3110
+ Bool TY_(IsJavaScript)(Node *node)
3111
+ {
3112
+ Bool result = no;
3113
+ AttVal *attr;
3114
+
3115
+ if (node->attributes == NULL)
3116
+ return yes;
3117
+
3118
+ for (attr = node->attributes; attr; attr = attr->next)
3119
+ {
3120
+ if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
3121
+ && AttrContains(attr, "javascript") )
3122
+ {
3123
+ result = yes;
3124
+ break;
3125
+ }
3126
+ }
3127
+
3128
+ return result;
3129
+ }
3130
+
3131
+ void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
3132
+ {
3133
+ Lexer* lexer = doc->lexer;
3134
+ Node *node;
3135
+ int HasTitle = 0;
3136
+ int HasBase = 0;
3137
+
3138
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3139
+ {
3140
+ if (node->tag == head->tag && node->type == EndTag)
3141
+ {
3142
+ TY_(FreeNode)( doc, node);
3143
+ head->closed = yes;
3144
+ break;
3145
+ }
3146
+
3147
+ /* find and discard multiple <head> elements */
3148
+ /* find and discard <html> in <head> elements */
3149
+ if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
3150
+ {
3151
+ TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3152
+ TY_(FreeNode)(doc, node);
3153
+ continue;
3154
+ }
3155
+
3156
+ if (TY_(nodeIsText)(node))
3157
+ {
3158
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3159
+ TY_(UngetToken)( doc );
3160
+ break;
3161
+ }
3162
+
3163
+ if (node->type == ProcInsTag && node->element &&
3164
+ TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
3165
+ {
3166
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3167
+ TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
3168
+ continue;
3169
+ }
3170
+
3171
+ /* deal with comments etc. */
3172
+ if (InsertMisc(head, node))
3173
+ continue;
3174
+
3175
+ if (node->type == DocTypeTag)
3176
+ {
3177
+ InsertDocType(doc, head, node);
3178
+ continue;
3179
+ }
3180
+
3181
+ /* discard unknown tags */
3182
+ if (node->tag == NULL)
3183
+ {
3184
+ TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3185
+ TY_(FreeNode)( doc, node);
3186
+ continue;
3187
+ }
3188
+
3189
+ /*
3190
+ if it doesn't belong in the head then
3191
+ treat as implicit end of head and deal
3192
+ with as part of the body
3193
+ */
3194
+ if (!(node->tag->model & CM_HEAD))
3195
+ {
3196
+ /* #545067 Implicit closing of head broken - warn only for XHTML input */
3197
+ if ( lexer->isvoyager )
3198
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN );
3199
+ TY_(UngetToken)( doc );
3200
+ break;
3201
+ }
3202
+
3203
+ if (TY_(nodeIsElement)(node))
3204
+ {
3205
+ if ( nodeIsTITLE(node) )
3206
+ {
3207
+ ++HasTitle;
3208
+
3209
+ if (HasTitle > 1)
3210
+ TY_(ReportError)(doc, head, node,
3211
+ head ?
3212
+ TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3213
+ }
3214
+ else if ( nodeIsBASE(node) )
3215
+ {
3216
+ ++HasBase;
3217
+
3218
+ if (HasBase > 1)
3219
+ TY_(ReportError)(doc, head, node,
3220
+ head ?
3221
+ TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3222
+ }
3223
+ else if ( nodeIsNOSCRIPT(node) )
3224
+ {
3225
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3226
+ }
3227
+
3228
+ #ifdef AUTO_INPUT_ENCODING
3229
+ else if (nodeIsMETA(node))
3230
+ {
3231
+ AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV);
3232
+ AttVal * content = AttrGetById(node, TidyAttr_CONTENT);
3233
+ if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content))
3234
+ {
3235
+ tmbstr val, charset;
3236
+ uint end = 0;
3237
+ val = charset = TY_(tmbstrdup)(doc->allocator, content->value);
3238
+ val = TY_(tmbstrtolower)(val);
3239
+ val = strstr(content->value, "charset");
3240
+
3241
+ if (val)
3242
+ val += 7;
3243
+
3244
+ while(val && *val && (TY_(IsWhite)((tchar)*val) ||
3245
+ *val == '=' || *val == '"' || *val == '\''))
3246
+ ++val;
3247
+
3248
+ while(val && val[end] && !(TY_(IsWhite)((tchar)val[end]) ||
3249
+ val[end] == '"' || val[end] == '\'' || val[end] == ';'))
3250
+ ++end;
3251
+
3252
+ if (val && end)
3253
+ {
3254
+ tmbstr encoding = TY_(tmbstrndup)(doc->allocator,val, end);
3255
+ uint id = TY_(GetEncodingIdFromName)(encoding);
3256
+
3257
+ /* todo: detect mismatch with BOM/XMLDecl/declared */
3258
+ /* todo: error for unsupported encodings */
3259
+ /* todo: try to re-init transcoder */
3260
+ /* todo: change input/output encoding settings */
3261
+ /* todo: store id in StreamIn */
3262
+
3263
+ TidyDocFree(doc, encoding);
3264
+ }
3265
+
3266
+ TidyDocFree(doc, charset);
3267
+ }
3268
+ }
3269
+ #endif /* AUTO_INPUT_ENCODING */
3270
+
3271
+ TY_(InsertNodeAtEnd)(head, node);
3272
+ ParseTag(doc, node, IgnoreWhitespace);
3273
+ continue;
3274
+ }
3275
+
3276
+ /* discard unexpected text nodes and end tags */
3277
+ TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3278
+ TY_(FreeNode)( doc, node);
3279
+ }
3280
+ }
3281
+
3282
+ void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
3283
+ {
3284
+ Lexer* lexer = doc->lexer;
3285
+ Node *node;
3286
+ Bool checkstack, iswhitenode;
3287
+
3288
+ mode = IgnoreWhitespace;
3289
+ checkstack = yes;
3290
+
3291
+ TY_(BumpObject)( doc, body->parent );
3292
+
3293
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
3294
+ {
3295
+ /* find and discard multiple <body> elements */
3296
+ if (node->tag == body->tag && node->type == StartTag)
3297
+ {
3298
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3299
+ TY_(FreeNode)(doc, node);
3300
+ continue;
3301
+ }
3302
+
3303
+ /* #538536 Extra endtags not detected */
3304
+ if ( nodeIsHTML(node) )
3305
+ {
3306
+ if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
3307
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3308
+ else
3309
+ lexer->seenEndHtml = 1;
3310
+
3311
+ TY_(FreeNode)( doc, node);
3312
+ continue;
3313
+ }
3314
+
3315
+ if ( lexer->seenEndBody &&
3316
+ ( node->type == StartTag ||
3317
+ node->type == EndTag ||
3318
+ node->type == StartEndTag ) )
3319
+ {
3320
+ TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY );
3321
+ }
3322
+
3323
+ if ( node->tag == body->tag && node->type == EndTag )
3324
+ {
3325
+ body->closed = yes;
3326
+ TrimSpaces(doc, body);
3327
+ TY_(FreeNode)( doc, node);
3328
+ lexer->seenEndBody = 1;
3329
+ mode = IgnoreWhitespace;
3330
+
3331
+ if ( nodeIsNOFRAMES(body->parent) )
3332
+ break;
3333
+
3334
+ continue;
3335
+ }
3336
+
3337
+ if ( nodeIsNOFRAMES(node) )
3338
+ {
3339
+ if (node->type == StartTag)
3340
+ {
3341
+ TY_(InsertNodeAtEnd)(body, node);
3342
+ TY_(ParseBlock)(doc, node, mode);
3343
+ continue;
3344
+ }
3345
+
3346
+ if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
3347
+ {
3348
+ TrimSpaces(doc, body);
3349
+ TY_(UngetToken)( doc );
3350
+ break;
3351
+ }
3352
+ }
3353
+
3354
+ if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
3355
+ && nodeIsNOFRAMES(body->parent) )
3356
+ {
3357
+ TrimSpaces(doc, body);
3358
+ TY_(UngetToken)( doc );
3359
+ break;
3360
+ }
3361
+
3362
+ iswhitenode = no;
3363
+
3364
+ if ( TY_(nodeIsText)(node) &&
3365
+ node->end <= node->start + 1 &&
3366
+ lexer->lexbuf[node->start] == ' ' )
3367
+ iswhitenode = yes;
3368
+
3369
+ /* deal with comments etc. */
3370
+ if (InsertMisc(body, node))
3371
+ continue;
3372
+
3373
+ /* #538536 Extra endtags not detected */
3374
+ #if 0
3375
+ if ( lexer->seenEndBody == 1 && !iswhitenode )
3376
+ {
3377
+ ++lexer->seenEndBody;
3378
+ TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY);
3379
+ }
3380
+ #endif
3381
+
3382
+ /* mixed content model permits text */
3383
+ if (TY_(nodeIsText)(node))
3384
+ {
3385
+ if (iswhitenode && mode == IgnoreWhitespace)
3386
+ {
3387
+ TY_(FreeNode)( doc, node);
3388
+ continue;
3389
+ }
3390
+
3391
+ /* HTML 2 and HTML4 strict don't allow text here */
3392
+ TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));
3393
+
3394
+ if (checkstack)
3395
+ {
3396
+ checkstack = no;
3397
+
3398
+ if ( TY_(InlineDup)(doc, node) > 0 )
3399
+ continue;
3400
+ }
3401
+
3402
+ TY_(InsertNodeAtEnd)(body, node);
3403
+ mode = MixedContent;
3404
+ continue;
3405
+ }
3406
+
3407
+ if (node->type == DocTypeTag)
3408
+ {
3409
+ InsertDocType(doc, body, node);
3410
+ continue;
3411
+ }
3412
+ /* discard unknown and PARAM tags */
3413
+ if ( node->tag == NULL || nodeIsPARAM(node) )
3414
+ {
3415
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3416
+ TY_(FreeNode)( doc, node);
3417
+ continue;
3418
+ }
3419
+
3420
+ /*
3421
+ Netscape allows LI and DD directly in BODY
3422
+ We infer UL or DL respectively and use this
3423
+ Bool to exclude block-level elements so as
3424
+ to match Netscape's observed behaviour.
3425
+ */
3426
+ lexer->excludeBlocks = no;
3427
+
3428
+ if ( nodeIsINPUT(node) ||
3429
+ (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
3430
+ )
3431
+ {
3432
+ /* avoid this error message being issued twice */
3433
+ if (!(node->tag->model & CM_HEAD))
3434
+ TY_(ReportError)(doc, body, node, TAG_NOT_ALLOWED_IN);
3435
+
3436
+ if (node->tag->model & CM_HTML)
3437
+ {
3438
+ /* copy body attributes if current body was inferred */
3439
+ if ( nodeIsBODY(node) && body->implicit
3440
+ && body->attributes == NULL )
3441
+ {
3442
+ body->attributes = node->attributes;
3443
+ node->attributes = NULL;
3444
+ }
3445
+
3446
+ TY_(FreeNode)( doc, node);
3447
+ continue;
3448
+ }
3449
+
3450
+ if (node->tag->model & CM_HEAD)
3451
+ {
3452
+ MoveToHead(doc, body, node);
3453
+ continue;
3454
+ }
3455
+
3456
+ if (node->tag->model & CM_LIST)
3457
+ {
3458
+ TY_(UngetToken)( doc );
3459
+ node = TY_(InferredTag)(doc, TidyTag_UL);
3460
+ AddClassNoIndent(doc, node);
3461
+ lexer->excludeBlocks = yes;
3462
+ }
3463
+ else if (node->tag->model & CM_DEFLIST)
3464
+ {
3465
+ TY_(UngetToken)( doc );
3466
+ node = TY_(InferredTag)(doc, TidyTag_DL);
3467
+ lexer->excludeBlocks = yes;
3468
+ }
3469
+ else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
3470
+ {
3471
+ /* http://tidy.sf.net/issue/2855621 */
3472
+ if (node->type != EndTag) {
3473
+ TY_(UngetToken)( doc );
3474
+ node = TY_(InferredTag)(doc, TidyTag_TABLE);
3475
+ }
3476
+ lexer->excludeBlocks = yes;
3477
+ }
3478
+ else if ( nodeIsINPUT(node) )
3479
+ {
3480
+ TY_(UngetToken)( doc );
3481
+ node = TY_(InferredTag)(doc, TidyTag_FORM);
3482
+ lexer->excludeBlocks = yes;
3483
+ }
3484
+ else
3485
+ {
3486
+ if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
3487
+ {
3488
+ TY_(UngetToken)( doc );
3489
+ return;
3490
+ }
3491
+
3492
+ /* ignore </td> </th> <option> etc. */
3493
+ TY_(FreeNode)( doc, node );
3494
+ continue;
3495
+ }
3496
+ }
3497
+
3498
+ if (node->type == EndTag)
3499
+ {
3500
+ if ( nodeIsBR(node) )
3501
+ node->type = StartTag;
3502
+ else if ( nodeIsP(node) )
3503
+ {
3504
+ node->type = StartEndTag;
3505
+ node->implicit = yes;
3506
+ #if OBSOLETE
3507
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
3508
+ FreeAttrs( doc, node ); /* discard align attribute etc. */
3509
+ TY_(InsertNodeAtEnd)(body, node);
3510
+ node = TY_(InferredTag)(doc, TidyTag_BR);
3511
+ #endif
3512
+ }
3513
+ else if ( TY_(nodeHasCM)(node, CM_INLINE) )
3514
+ TY_(PopInline)( doc, node );
3515
+ }
3516
+
3517
+ if (TY_(nodeIsElement)(node))
3518
+ {
3519
+ if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) )
3520
+ {
3521
+ /* HTML4 strict doesn't allow inline content here */
3522
+ /* but HTML2 does allow img elements as children of body */
3523
+ if ( nodeIsIMG(node) )
3524
+ TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
3525
+ else
3526
+ TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));
3527
+
3528
+ if (checkstack && !node->implicit)
3529
+ {
3530
+ checkstack = no;
3531
+
3532
+ if ( TY_(InlineDup)(doc, node) > 0 )
3533
+ continue;
3534
+ }
3535
+
3536
+ mode = MixedContent;
3537
+ }
3538
+ else
3539
+ {
3540
+ checkstack = yes;
3541
+ mode = IgnoreWhitespace;
3542
+ }
3543
+
3544
+ if (node->implicit)
3545
+ TY_(ReportError)(doc, body, node, INSERTING_TAG);
3546
+
3547
+ TY_(InsertNodeAtEnd)(body, node);
3548
+ ParseTag(doc, node, mode);
3549
+ continue;
3550
+ }
3551
+
3552
+ /* discard unexpected tags */
3553
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3554
+ TY_(FreeNode)( doc, node);
3555
+ }
3556
+ }
3557
+
3558
+ void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode)
3559
+ {
3560
+ Lexer* lexer = doc->lexer;
3561
+ Node *node;
3562
+
3563
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3564
+ {
3565
+ doc->badAccess |= BA_USING_NOFRAMES;
3566
+ }
3567
+ mode = IgnoreWhitespace;
3568
+
3569
+ while ( (node = TY_(GetToken)(doc, mode)) != NULL )
3570
+ {
3571
+ if ( node->tag == noframes->tag && node->type == EndTag )
3572
+ {
3573
+ TY_(FreeNode)( doc, node);
3574
+ noframes->closed = yes;
3575
+ TrimSpaces(doc, noframes);
3576
+ return;
3577
+ }
3578
+
3579
+ if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
3580
+ {
3581
+ TrimSpaces(doc, noframes);
3582
+ if (node->type == EndTag)
3583
+ {
3584
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3585
+ TY_(FreeNode)( doc, node); /* Throw it away */
3586
+ }
3587
+ else
3588
+ {
3589
+ TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
3590
+ TY_(UngetToken)( doc );
3591
+ }
3592
+ return;
3593
+ }
3594
+
3595
+ if ( nodeIsHTML(node) )
3596
+ {
3597
+ if (TY_(nodeIsElement)(node))
3598
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3599
+
3600
+ TY_(FreeNode)( doc, node);
3601
+ continue;
3602
+ }
3603
+
3604
+ /* deal with comments etc. */
3605
+ if (InsertMisc(noframes, node))
3606
+ continue;
3607
+
3608
+ if ( nodeIsBODY(node) && node->type == StartTag )
3609
+ {
3610
+ Bool seen_body = lexer->seenEndBody;
3611
+ TY_(InsertNodeAtEnd)(noframes, node);
3612
+ ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/);
3613
+
3614
+ /* fix for bug http://tidy.sf.net/bug/887259 */
3615
+ if (seen_body && TY_(FindBody)(doc) != node)
3616
+ {
3617
+ TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
3618
+ MoveNodeToBody(doc, node);
3619
+ }
3620
+ continue;
3621
+ }
3622
+
3623
+ /* implicit body element inferred */
3624
+ if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
3625
+ {
3626
+ Node *body = TY_(FindBody)( doc );
3627
+ if ( body || lexer->seenEndBody )
3628
+ {
3629
+ if ( body == NULL )
3630
+ {
3631
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3632
+ TY_(FreeNode)( doc, node);
3633
+ continue;
3634
+ }
3635
+ if ( TY_(nodeIsText)(node) )
3636
+ {
3637
+ TY_(UngetToken)( doc );
3638
+ node = TY_(InferredTag)(doc, TidyTag_P);
3639
+ TY_(ReportError)(doc, noframes, node, CONTENT_AFTER_BODY );
3640
+ }
3641
+ TY_(InsertNodeAtEnd)( body, node );
3642
+ }
3643
+ else
3644
+ {
3645
+ TY_(UngetToken)( doc );
3646
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3647
+ if ( cfgBool(doc, TidyXmlOut) )
3648
+ TY_(ReportError)(doc, noframes, node, INSERTING_TAG);
3649
+ TY_(InsertNodeAtEnd)( noframes, node );
3650
+ }
3651
+
3652
+ ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
3653
+ continue;
3654
+ }
3655
+
3656
+ /* discard unexpected end tags */
3657
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3658
+ TY_(FreeNode)( doc, node);
3659
+ }
3660
+
3661
+ TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_FOR);
3662
+ }
3663
+
3664
+ void TY_(ParseFrameSet)(TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode))
3665
+ {
3666
+ Lexer* lexer = doc->lexer;
3667
+ Node *node;
3668
+
3669
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3670
+ {
3671
+ doc->badAccess |= BA_USING_FRAMES;
3672
+ }
3673
+
3674
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3675
+ {
3676
+ if (node->tag == frameset->tag && node->type == EndTag)
3677
+ {
3678
+ TY_(FreeNode)( doc, node);
3679
+ frameset->closed = yes;
3680
+ TrimSpaces(doc, frameset);
3681
+ return;
3682
+ }
3683
+
3684
+ /* deal with comments etc. */
3685
+ if (InsertMisc(frameset, node))
3686
+ continue;
3687
+
3688
+ if (node->tag == NULL)
3689
+ {
3690
+ TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3691
+ TY_(FreeNode)( doc, node);
3692
+ continue;
3693
+ }
3694
+
3695
+ if (TY_(nodeIsElement)(node))
3696
+ {
3697
+ if (node->tag && node->tag->model & CM_HEAD)
3698
+ {
3699
+ MoveToHead(doc, frameset, node);
3700
+ continue;
3701
+ }
3702
+ }
3703
+
3704
+ if ( nodeIsBODY(node) )
3705
+ {
3706
+ TY_(UngetToken)( doc );
3707
+ node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3708
+ TY_(ReportError)(doc, frameset, node, INSERTING_TAG);
3709
+ }
3710
+
3711
+ if (node->type == StartTag && (node->tag->model & CM_FRAMES))
3712
+ {
3713
+ TY_(InsertNodeAtEnd)(frameset, node);
3714
+ lexer->excludeBlocks = no;
3715
+ ParseTag(doc, node, MixedContent);
3716
+ continue;
3717
+ }
3718
+ else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES))
3719
+ {
3720
+ TY_(InsertNodeAtEnd)(frameset, node);
3721
+ continue;
3722
+ }
3723
+
3724
+ /* discard unexpected tags */
3725
+ #if SUPPORT_ACCESSIBILITY_CHECKS
3726
+ /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
3727
+ if ( nodeIsA(node) )
3728
+ doc->badAccess |= BA_INVALID_LINK_NOFRAMES;
3729
+ #endif
3730
+
3731
+ TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3732
+ TY_(FreeNode)( doc, node);
3733
+ }
3734
+
3735
+ TY_(ReportError)(doc, frameset, node, MISSING_ENDTAG_FOR);
3736
+ }
3737
+
3738
+ void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
3739
+ {
3740
+ Node *node, *head;
3741
+ Node *frameset = NULL;
3742
+ Node *noframes = NULL;
3743
+
3744
+ TY_(SetOptionBool)( doc, TidyXmlTags, no );
3745
+
3746
+ for (;;)
3747
+ {
3748
+ node = TY_(GetToken)(doc, IgnoreWhitespace);
3749
+
3750
+ if (node == NULL)
3751
+ {
3752
+ node = TY_(InferredTag)(doc, TidyTag_HEAD);
3753
+ break;
3754
+ }
3755
+
3756
+ if ( nodeIsHEAD(node) )
3757
+ break;
3758
+
3759
+ if (node->tag == html->tag && node->type == EndTag)
3760
+ {
3761
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3762
+ TY_(FreeNode)( doc, node);
3763
+ continue;
3764
+ }
3765
+
3766
+ /* find and discard multiple <html> elements */
3767
+ if (node->tag == html->tag && node->type == StartTag)
3768
+ {
3769
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3770
+ TY_(FreeNode)(doc, node);
3771
+ continue;
3772
+ }
3773
+
3774
+ /* deal with comments etc. */
3775
+ if (InsertMisc(html, node))
3776
+ continue;
3777
+
3778
+ TY_(UngetToken)( doc );
3779
+ node = TY_(InferredTag)(doc, TidyTag_HEAD);
3780
+ break;
3781
+ }
3782
+
3783
+ head = node;
3784
+ TY_(InsertNodeAtEnd)(html, head);
3785
+ TY_(ParseHead)(doc, head, mode);
3786
+
3787
+ for (;;)
3788
+ {
3789
+ node = TY_(GetToken)(doc, IgnoreWhitespace);
3790
+
3791
+ if (node == NULL)
3792
+ {
3793
+ if (frameset == NULL) /* implied body */
3794
+ {
3795
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3796
+ TY_(InsertNodeAtEnd)(html, node);
3797
+ TY_(ParseBody)(doc, node, mode);
3798
+ }
3799
+
3800
+ return;
3801
+ }
3802
+
3803
+ /* robustly handle html tags */
3804
+ if (node->tag == html->tag)
3805
+ {
3806
+ if (node->type != StartTag && frameset == NULL)
3807
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3808
+
3809
+ TY_(FreeNode)( doc, node);
3810
+ continue;
3811
+ }
3812
+
3813
+ /* deal with comments etc. */
3814
+ if (InsertMisc(html, node))
3815
+ continue;
3816
+
3817
+ /* if frameset document coerce <body> to <noframes> */
3818
+ if ( nodeIsBODY(node) )
3819
+ {
3820
+ if (node->type != StartTag)
3821
+ {
3822
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3823
+ TY_(FreeNode)( doc, node);
3824
+ continue;
3825
+ }
3826
+
3827
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3828
+ {
3829
+ if (frameset != NULL)
3830
+ {
3831
+ TY_(UngetToken)( doc );
3832
+
3833
+ if (noframes == NULL)
3834
+ {
3835
+ noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3836
+ TY_(InsertNodeAtEnd)(frameset, noframes);
3837
+ TY_(ReportError)(doc, html, noframes, INSERTING_TAG);
3838
+ }
3839
+ else
3840
+ {
3841
+ if (noframes->type == StartEndTag)
3842
+ noframes->type = StartTag;
3843
+ }
3844
+
3845
+ ParseTag(doc, noframes, mode);
3846
+ continue;
3847
+ }
3848
+ }
3849
+
3850
+ TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3851
+ break; /* to parse body */
3852
+ }
3853
+
3854
+ /* flag an error if we see more than one frameset */
3855
+ if ( nodeIsFRAMESET(node) )
3856
+ {
3857
+ if (node->type != StartTag)
3858
+ {
3859
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3860
+ TY_(FreeNode)( doc, node);
3861
+ continue;
3862
+ }
3863
+
3864
+ if (frameset != NULL)
3865
+ TY_(ReportFatal)(doc, html, node, DUPLICATE_FRAMESET);
3866
+ else
3867
+ frameset = node;
3868
+
3869
+ TY_(InsertNodeAtEnd)(html, node);
3870
+ ParseTag(doc, node, mode);
3871
+
3872
+ /*
3873
+ see if it includes a noframes element so
3874
+ that we can merge subsequent noframes elements
3875
+ */
3876
+
3877
+ for (node = frameset->content; node; node = node->next)
3878
+ {
3879
+ if ( nodeIsNOFRAMES(node) )
3880
+ noframes = node;
3881
+ }
3882
+ continue;
3883
+ }
3884
+
3885
+ /* if not a frameset document coerce <noframes> to <body> */
3886
+ if ( nodeIsNOFRAMES(node) )
3887
+ {
3888
+ if (node->type != StartTag)
3889
+ {
3890
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3891
+ TY_(FreeNode)( doc, node);
3892
+ continue;
3893
+ }
3894
+
3895
+ if (frameset == NULL)
3896
+ {
3897
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3898
+ TY_(FreeNode)( doc, node);
3899
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3900
+ break;
3901
+ }
3902
+
3903
+ if (noframes == NULL)
3904
+ {
3905
+ noframes = node;
3906
+ TY_(InsertNodeAtEnd)(frameset, noframes);
3907
+ }
3908
+ else
3909
+ TY_(FreeNode)( doc, node);
3910
+
3911
+ ParseTag(doc, noframes, mode);
3912
+ continue;
3913
+ }
3914
+
3915
+ if (TY_(nodeIsElement)(node))
3916
+ {
3917
+ if (node->tag && node->tag->model & CM_HEAD)
3918
+ {
3919
+ MoveToHead(doc, html, node);
3920
+ continue;
3921
+ }
3922
+
3923
+ /* discard illegal frame element following a frameset */
3924
+ if ( frameset != NULL && nodeIsFRAME(node) )
3925
+ {
3926
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3927
+ TY_(FreeNode)(doc, node);
3928
+ continue;
3929
+ }
3930
+ }
3931
+
3932
+ TY_(UngetToken)( doc );
3933
+
3934
+ /* insert other content into noframes element */
3935
+
3936
+ if (frameset)
3937
+ {
3938
+ if (noframes == NULL)
3939
+ {
3940
+ noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3941
+ TY_(InsertNodeAtEnd)(frameset, noframes);
3942
+ }
3943
+ else
3944
+ {
3945
+ TY_(ReportError)(doc, html, node, NOFRAMES_CONTENT);
3946
+ if (noframes->type == StartEndTag)
3947
+ noframes->type = StartTag;
3948
+ }
3949
+
3950
+ TY_(ConstrainVersion)(doc, VERS_FRAMESET);
3951
+ ParseTag(doc, noframes, mode);
3952
+ continue;
3953
+ }
3954
+
3955
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3956
+ TY_(ReportError)(doc, html, node, INSERTING_TAG );
3957
+ TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3958
+ break;
3959
+ }
3960
+
3961
+ /* node must be body */
3962
+
3963
+ TY_(InsertNodeAtEnd)(html, node);
3964
+ ParseTag(doc, node, mode);
3965
+ }
3966
+
3967
+ static Bool nodeCMIsOnlyInline( Node* node )
3968
+ {
3969
+ return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
3970
+ }
3971
+
3972
+ static void EncloseBodyText(TidyDocImpl* doc)
3973
+ {
3974
+ Node* node;
3975
+ Node* body = TY_(FindBody)(doc);
3976
+
3977
+ if (!body)
3978
+ return;
3979
+
3980
+ node = body->content;
3981
+
3982
+ while (node)
3983
+ {
3984
+ if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) ||
3985
+ (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node)))
3986
+ {
3987
+ Node* p = TY_(InferredTag)(doc, TidyTag_P);
3988
+ TY_(InsertNodeBeforeElement)(node, p);
3989
+ while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node)))
3990
+ {
3991
+ Node* next = node->next;
3992
+ TY_(RemoveNode)(node);
3993
+ TY_(InsertNodeAtEnd)(p, node);
3994
+ node = next;
3995
+ }
3996
+ TrimSpaces(doc, p);
3997
+ continue;
3998
+ }
3999
+ node = node->next;
4000
+ }
4001
+ }
4002
+
4003
+ /* <form>, <blockquote> and <noscript> do not allow #PCDATA in
4004
+ HTML 4.01 Strict (%block; model instead of %flow;).
4005
+ When requested, text nodes in these elements are wrapped in <p>. */
4006
+ static void EncloseBlockText(TidyDocImpl* doc, Node* node)
4007
+ {
4008
+ Node *next;
4009
+ Node *block;
4010
+
4011
+ while (node)
4012
+ {
4013
+ next = node->next;
4014
+
4015
+ if (node->content)
4016
+ EncloseBlockText(doc, node->content);
4017
+
4018
+ if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
4019
+ nodeIsBLOCKQUOTE(node))
4020
+ || !node->content)
4021
+ {
4022
+ node = next;
4023
+ continue;
4024
+ }
4025
+
4026
+ block = node->content;
4027
+
4028
+ if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
4029
+ (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
4030
+ {
4031
+ Node* p = TY_(InferredTag)(doc, TidyTag_P);
4032
+ TY_(InsertNodeBeforeElement)(block, p);
4033
+ while (block &&
4034
+ (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
4035
+ {
4036
+ Node* tempNext = block->next;
4037
+ TY_(RemoveNode)(block);
4038
+ TY_(InsertNodeAtEnd)(p, block);
4039
+ block = tempNext;
4040
+ }
4041
+ TrimSpaces(doc, p);
4042
+ continue;
4043
+ }
4044
+
4045
+ node = next;
4046
+ }
4047
+ }
4048
+
4049
+ static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
4050
+ {
4051
+ Node *next;
4052
+
4053
+ while (node)
4054
+ {
4055
+ next = node->next;
4056
+
4057
+ if (nodeIsDIR(node) || nodeIsMENU(node))
4058
+ TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes);
4059
+
4060
+ if (nodeIsXMP(node) || nodeIsLISTING(node) ||
4061
+ (node->tag && node->tag->id == TidyTag_PLAINTEXT))
4062
+ TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes);
4063
+
4064
+ if (node->content)
4065
+ ReplaceObsoleteElements(doc, node->content);
4066
+
4067
+ node = next;
4068
+ }
4069
+ }
4070
+
4071
+ static void AttributeChecks(TidyDocImpl* doc, Node* node)
4072
+ {
4073
+ Node *next;
4074
+
4075
+ while (node)
4076
+ {
4077
+ next = node->next;
4078
+
4079
+ if (TY_(nodeIsElement)(node))
4080
+ {
4081
+ if (node->tag->chkattrs)
4082
+ node->tag->chkattrs(doc, node);
4083
+ else
4084
+ TY_(CheckAttributes)(doc, node);
4085
+ }
4086
+
4087
+ if (node->content)
4088
+ AttributeChecks(doc, node->content);
4089
+
4090
+ assert( next != node ); /* http://tidy.sf.net/issue/1603538 */
4091
+ node = next;
4092
+ }
4093
+ }
4094
+
4095
+ /*
4096
+ HTML is the top level element
4097
+ */
4098
+ void TY_(ParseDocument)(TidyDocImpl* doc)
4099
+ {
4100
+ Node *node, *html, *doctype = NULL;
4101
+
4102
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4103
+ {
4104
+ if (node->type == XmlDecl)
4105
+ {
4106
+ if (TY_(FindXmlDecl)(doc) && doc->root.content)
4107
+ {
4108
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4109
+ TY_(FreeNode)(doc, node);
4110
+ continue;
4111
+ }
4112
+ if (node->line != 1 || (node->line == 1 && node->column != 1))
4113
+ {
4114
+ TY_(ReportError)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
4115
+ }
4116
+ }
4117
+ #ifdef AUTO_INPUT_ENCODING
4118
+ if (node->type == XmlDecl)
4119
+ {
4120
+ AttVal* encoding = GetAttrByName(node, "encoding");
4121
+ if (AttrHasValue(encoding))
4122
+ {
4123
+ uint id = TY_(GetEncodingIdFromName)(encoding->value);
4124
+
4125
+ /* todo: detect mismatch with BOM/XMLDecl/declared */
4126
+ /* todo: error for unsupported encodings */
4127
+ /* todo: try to re-init transcoder */
4128
+ /* todo: change input/output encoding settings */
4129
+ /* todo: store id in StreamIn */
4130
+ }
4131
+ }
4132
+ #endif /* AUTO_INPUT_ENCODING */
4133
+
4134
+ /* deal with comments etc. */
4135
+ if (InsertMisc( &doc->root, node ))
4136
+ continue;
4137
+
4138
+ if (node->type == DocTypeTag)
4139
+ {
4140
+ if (doctype == NULL)
4141
+ {
4142
+ TY_(InsertNodeAtEnd)( &doc->root, node);
4143
+ doctype = node;
4144
+ }
4145
+ else
4146
+ {
4147
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4148
+ TY_(FreeNode)( doc, node);
4149
+ }
4150
+ continue;
4151
+ }
4152
+
4153
+ if (node->type == EndTag)
4154
+ {
4155
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4156
+ TY_(FreeNode)( doc, node);
4157
+ continue;
4158
+ }
4159
+
4160
+ if (node->type == StartTag && nodeIsHTML(node))
4161
+ {
4162
+ AttVal *xmlns;
4163
+
4164
+ xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);
4165
+
4166
+ if (AttrValueIs(xmlns, XHTML_NAMESPACE))
4167
+ {
4168
+ Bool htmlOut = cfgBool( doc, TidyHtmlOut );
4169
+ doc->lexer->isvoyager = yes; /* Unless plain HTML */
4170
+ TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
4171
+ TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */
4172
+
4173
+ /* adjust other config options, just as in config.c */
4174
+ if ( !htmlOut )
4175
+ {
4176
+ TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
4177
+ TY_(SetOptionBool)( doc, TidyUpperCaseAttrs, no );
4178
+ }
4179
+ }
4180
+ }
4181
+
4182
+ if ( node->type != StartTag || !nodeIsHTML(node) )
4183
+ {
4184
+ TY_(UngetToken)( doc );
4185
+ html = TY_(InferredTag)(doc, TidyTag_HTML);
4186
+ }
4187
+ else
4188
+ html = node;
4189
+
4190
+ if (!TY_(FindDocType)(doc))
4191
+ TY_(ReportError)(doc, NULL, NULL, MISSING_DOCTYPE);
4192
+
4193
+ TY_(InsertNodeAtEnd)( &doc->root, html);
4194
+ TY_(ParseHTML)( doc, html, IgnoreWhitespace );
4195
+ break;
4196
+ }
4197
+
4198
+ #if SUPPORT_ACCESSIBILITY_CHECKS
4199
+ /* do this before any more document fixes */
4200
+ if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
4201
+ TY_(AccessibilityChecks)( doc );
4202
+ #endif /* #if SUPPORT_ACCESSIBILITY_CHECKS */
4203
+
4204
+ if (!TY_(FindHTML)(doc))
4205
+ {
4206
+ /* a later check should complain if <body> is empty */
4207
+ html = TY_(InferredTag)(doc, TidyTag_HTML);
4208
+ TY_(InsertNodeAtEnd)( &doc->root, html);
4209
+ TY_(ParseHTML)(doc, html, IgnoreWhitespace);
4210
+ }
4211
+
4212
+ if (!TY_(FindTITLE)(doc))
4213
+ {
4214
+ Node* head = TY_(FindHEAD)(doc);
4215
+ TY_(ReportError)(doc, head, NULL, MISSING_TITLE_ELEMENT);
4216
+ TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
4217
+ }
4218
+
4219
+ AttributeChecks(doc, &doc->root);
4220
+ ReplaceObsoleteElements(doc, &doc->root);
4221
+ TY_(DropEmptyElements)(doc, &doc->root);
4222
+ CleanSpaces(doc, &doc->root);
4223
+
4224
+ if (cfgBool(doc, TidyEncloseBodyText))
4225
+ EncloseBodyText(doc);
4226
+ if (cfgBool(doc, TidyEncloseBlockText))
4227
+ EncloseBlockText(doc, &doc->root);
4228
+ }
4229
+
4230
+ Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
4231
+ {
4232
+ AttVal *attribute;
4233
+
4234
+ /* search attributes for xml:space */
4235
+ for (attribute = element->attributes; attribute; attribute = attribute->next)
4236
+ {
4237
+ if (attrIsXML_SPACE(attribute))
4238
+ {
4239
+ if (AttrValueIs(attribute, "preserve"))
4240
+ return yes;
4241
+
4242
+ return no;
4243
+ }
4244
+ }
4245
+
4246
+ if (element->element == NULL)
4247
+ return no;
4248
+
4249
+ /* kludge for html docs without explicit xml:space attribute */
4250
+ if (nodeIsPRE(element) ||
4251
+ nodeIsSCRIPT(element) ||
4252
+ nodeIsSTYLE(element) ||
4253
+ TY_(FindParser)(doc, element) == TY_(ParsePre))
4254
+ return yes;
4255
+
4256
+ /* kludge for XSL docs */
4257
+ if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
4258
+ return yes;
4259
+
4260
+ return no;
4261
+ }
4262
+
4263
+ /*
4264
+ XML documents
4265
+ */
4266
+ static void ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
4267
+ {
4268
+ Lexer* lexer = doc->lexer;
4269
+ Node *node;
4270
+
4271
+ /* if node is pre or has xml:space="preserve" then do so */
4272
+
4273
+ if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
4274
+ mode = Preformatted;
4275
+
4276
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
4277
+ {
4278
+ if (node->type == EndTag &&
4279
+ node->element && element->element &&
4280
+ TY_(tmbstrcmp)(node->element, element->element) == 0)
4281
+ {
4282
+ TY_(FreeNode)( doc, node);
4283
+ element->closed = yes;
4284
+ break;
4285
+ }
4286
+
4287
+ /* discard unexpected end tags */
4288
+ if (node->type == EndTag)
4289
+ {
4290
+ if (element)
4291
+ TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG_IN);
4292
+ else
4293
+ TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG);
4294
+
4295
+ TY_(FreeNode)( doc, node);
4296
+ continue;
4297
+ }
4298
+
4299
+ /* parse content on seeing start tag */
4300
+ if (node->type == StartTag)
4301
+ ParseXMLElement( doc, node, mode );
4302
+
4303
+ TY_(InsertNodeAtEnd)(element, node);
4304
+ }
4305
+
4306
+ /*
4307
+ if first child is text then trim initial space and
4308
+ delete text node if it is empty.
4309
+ */
4310
+
4311
+ node = element->content;
4312
+
4313
+ if (TY_(nodeIsText)(node) && mode != Preformatted)
4314
+ {
4315
+ if ( lexer->lexbuf[node->start] == ' ' )
4316
+ {
4317
+ node->start++;
4318
+
4319
+ if (node->start >= node->end)
4320
+ TY_(DiscardElement)( doc, node );
4321
+ }
4322
+ }
4323
+
4324
+ /*
4325
+ if last child is text then trim final space and
4326
+ delete the text node if it is empty
4327
+ */
4328
+
4329
+ node = element->last;
4330
+
4331
+ if (TY_(nodeIsText)(node) && mode != Preformatted)
4332
+ {
4333
+ if ( lexer->lexbuf[node->end - 1] == ' ' )
4334
+ {
4335
+ node->end--;
4336
+
4337
+ if (node->start >= node->end)
4338
+ TY_(DiscardElement)( doc, node );
4339
+ }
4340
+ }
4341
+ }
4342
+
4343
+ void TY_(ParseXMLDocument)(TidyDocImpl* doc)
4344
+ {
4345
+ Node *node, *doctype = NULL;
4346
+
4347
+ TY_(SetOptionBool)( doc, TidyXmlTags, yes );
4348
+
4349
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4350
+ {
4351
+ /* discard unexpected end tags */
4352
+ if (node->type == EndTag)
4353
+ {
4354
+ TY_(ReportError)(doc, NULL, node, UNEXPECTED_ENDTAG);
4355
+ TY_(FreeNode)( doc, node);
4356
+ continue;
4357
+ }
4358
+
4359
+ /* deal with comments etc. */
4360
+ if (InsertMisc( &doc->root, node))
4361
+ continue;
4362
+
4363
+ if (node->type == DocTypeTag)
4364
+ {
4365
+ if (doctype == NULL)
4366
+ {
4367
+ TY_(InsertNodeAtEnd)( &doc->root, node);
4368
+ doctype = node;
4369
+ }
4370
+ else
4371
+ {
4372
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4373
+ TY_(FreeNode)( doc, node);
4374
+ }
4375
+ continue;
4376
+ }
4377
+
4378
+ if (node->type == StartEndTag)
4379
+ {
4380
+ TY_(InsertNodeAtEnd)( &doc->root, node);
4381
+ continue;
4382
+ }
4383
+
4384
+ /* if start tag then parse element's content */
4385
+ if (node->type == StartTag)
4386
+ {
4387
+ TY_(InsertNodeAtEnd)( &doc->root, node );
4388
+ ParseXMLElement( doc, node, IgnoreWhitespace );
4389
+ continue;
4390
+ }
4391
+
4392
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4393
+ TY_(FreeNode)( doc, node);
4394
+ }
4395
+
4396
+ /* ensure presence of initial <?xml version="1.0"?> */
4397
+ if ( cfgBool(doc, TidyXmlDecl) )
4398
+ TY_(FixXmlDecl)( doc );
4399
+ }
4400
+
4401
+ /*
4402
+ * local variables:
4403
+ * mode: c
4404
+ * indent-tabs-mode: nil
4405
+ * c-basic-offset: 4
4406
+ * eval: (c-set-offset 'substatement-open 0)
4407
+ * end:
4408
+ */