tidy-ext 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/parser.c ADDED
@@ -0,0 +1,4408 @@
1
+ /* parser.c -- HTML Parser
2
+
3
+ (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: krusch $
9
+ $Date: 2009/10/27 19:27:49 $
10
+ $Revision: 1.188 $
11
+
12
+ */
13
+
14
+ #include "tidy-int.h"
15
+ #include "lexer.h"
16
+ #include "parser.h"
17
+ #include "message.h"
18
+ #include "clean.h"
19
+ #include "tags.h"
20
+ #include "tmbstr.h"
21
+
22
+ #ifdef AUTO_INPUT_ENCODING
23
+ #include "charsets.h"
24
+ #endif
25
+
26
+ Bool TY_(CheckNodeIntegrity)(Node *node)
27
+ {
28
+ #ifndef NO_NODE_INTEGRITY_CHECK
29
+ Node *child;
30
+
31
+ if (node->prev)
32
+ {
33
+ if (node->prev->next != node)
34
+ return no;
35
+ }
36
+
37
+ if (node->next)
38
+ {
39
+ if (node->next == node || node->next->prev != node)
40
+ return no;
41
+ }
42
+
43
+ if (node->parent)
44
+ {
45
+ if (node->prev == NULL && node->parent->content != node)
46
+ return no;
47
+
48
+ if (node->next == NULL && node->parent->last != node)
49
+ return no;
50
+ }
51
+
52
+ for (child = node->content; child; child = child->next)
53
+ if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
54
+ return no;
55
+
56
+ #endif
57
+ return yes;
58
+ }
59
+
60
+ /*
61
+ used to determine how attributes
62
+ without values should be printed
63
+ this was introduced to deal with
64
+ user defined tags e.g. Cold Fusion
65
+ */
66
+ Bool TY_(IsNewNode)(Node *node)
67
+ {
68
+ if (node && node->tag)
69
+ {
70
+ return (node->tag->model & CM_NEW);
71
+ }
72
+ return yes;
73
+ }
74
+
75
+ void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
76
+ {
77
+ const Dict* tag = TY_(LookupTagDef)(tid);
78
+ Node* tmp = TY_(InferredTag)(doc, tag->id);
79
+
80
+ if (obsolete)
81
+ TY_(ReportWarning)(doc, node, tmp, OBSOLETE_ELEMENT);
82
+ else if (unexpected)
83
+ TY_(ReportError)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
84
+ else
85
+ TY_(ReportNotice)(doc, node, tmp, REPLACING_ELEMENT);
86
+
87
+ TidyDocFree(doc, tmp->element);
88
+ TidyDocFree(doc, tmp);
89
+
90
+ node->was = node->tag;
91
+ node->tag = tag;
92
+ node->type = StartTag;
93
+ node->implicit = yes;
94
+ TidyDocFree(doc, node->element);
95
+ node->element = TY_(tmbstrdup)(doc->allocator, tag->name);
96
+ }
97
+
98
+ /* extract a node and its children from a markup tree */
99
+ Node *TY_(RemoveNode)(Node *node)
100
+ {
101
+ if (node->prev)
102
+ node->prev->next = node->next;
103
+
104
+ if (node->next)
105
+ node->next->prev = node->prev;
106
+
107
+ if (node->parent)
108
+ {
109
+ if (node->parent->content == node)
110
+ node->parent->content = node->next;
111
+
112
+ if (node->parent->last == node)
113
+ node->parent->last = node->prev;
114
+ }
115
+
116
+ node->parent = node->prev = node->next = NULL;
117
+ return node;
118
+ }
119
+
120
+ /* remove node from markup tree and discard it */
121
+ Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
122
+ {
123
+ Node *next = NULL;
124
+
125
+ if (element)
126
+ {
127
+ next = element->next;
128
+ TY_(RemoveNode)(element);
129
+ TY_(FreeNode)( doc, element);
130
+ }
131
+
132
+ return next;
133
+ }
134
+
135
+ /*
136
+ insert "node" into markup tree as the firt element
137
+ of content of "element"
138
+ */
139
+ void TY_(InsertNodeAtStart)(Node *element, Node *node)
140
+ {
141
+ node->parent = element;
142
+
143
+ if (element->content == NULL)
144
+ element->last = node;
145
+ else
146
+ element->content->prev = node;
147
+
148
+ node->next = element->content;
149
+ node->prev = NULL;
150
+ element->content = node;
151
+ }
152
+
153
+ /*
154
+ insert "node" into markup tree as the last element
155
+ of content of "element"
156
+ */
157
+ void TY_(InsertNodeAtEnd)(Node *element, Node *node)
158
+ {
159
+ node->parent = element;
160
+ node->prev = element->last;
161
+
162
+ if (element->last != NULL)
163
+ element->last->next = node;
164
+ else
165
+ element->content = node;
166
+
167
+ element->last = node;
168
+ }
169
+
170
+ /*
171
+ insert "node" into markup tree in place of "element"
172
+ which is moved to become the child of the node
173
+ */
174
+ static void InsertNodeAsParent(Node *element, Node *node)
175
+ {
176
+ node->content = element;
177
+ node->last = element;
178
+ node->parent = element->parent;
179
+ element->parent = node;
180
+
181
+ if (node->parent->content == element)
182
+ node->parent->content = node;
183
+
184
+ if (node->parent->last == element)
185
+ node->parent->last = node;
186
+
187
+ node->prev = element->prev;
188
+ element->prev = NULL;
189
+
190
+ if (node->prev)
191
+ node->prev->next = node;
192
+
193
+ node->next = element->next;
194
+ element->next = NULL;
195
+
196
+ if (node->next)
197
+ node->next->prev = node;
198
+ }
199
+
200
+ /* insert "node" into markup tree before "element" */
201
+ void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
202
+ {
203
+ Node *parent;
204
+
205
+ parent = element->parent;
206
+ node->parent = parent;
207
+ node->next = element;
208
+ node->prev = element->prev;
209
+ element->prev = node;
210
+
211
+ if (node->prev)
212
+ node->prev->next = node;
213
+
214
+ if (parent->content == element)
215
+ parent->content = node;
216
+ }
217
+
218
+ /* insert "node" into markup tree after "element" */
219
+ void TY_(InsertNodeAfterElement)(Node *element, Node *node)
220
+ {
221
+ Node *parent;
222
+
223
+ parent = element->parent;
224
+ node->parent = parent;
225
+
226
+ /* AQ - 13 Jan 2000 fix for parent == NULL */
227
+ if (parent != NULL && parent->last == element)
228
+ parent->last = node;
229
+ else
230
+ {
231
+ node->next = element->next;
232
+ /* AQ - 13 Jan 2000 fix for node->next == NULL */
233
+ if (node->next != NULL)
234
+ node->next->prev = node;
235
+ }
236
+
237
+ element->next = node;
238
+ node->prev = element;
239
+ }
240
+
241
+ static Bool CanPrune( TidyDocImpl* doc, Node *element )
242
+ {
243
+ if ( TY_(nodeIsText)(element) )
244
+ return yes;
245
+
246
+ if ( element->content )
247
+ return no;
248
+
249
+ if ( element->tag == NULL )
250
+ return no;
251
+
252
+ if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
253
+ return no;
254
+
255
+ if ( nodeIsA(element) && element->attributes != NULL )
256
+ return no;
257
+
258
+ if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
259
+ return no;
260
+
261
+ if ( element->tag->model & CM_ROW )
262
+ return no;
263
+
264
+ if ( element->tag->model & CM_EMPTY )
265
+ return no;
266
+
267
+ if ( nodeIsAPPLET(element) )
268
+ return no;
269
+
270
+ if ( nodeIsOBJECT(element) )
271
+ return no;
272
+
273
+ if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
274
+ return no;
275
+
276
+ if ( nodeIsTITLE(element) )
277
+ return no;
278
+
279
+ /* #433359 - fix by Randy Waki 12 Mar 01 */
280
+ if ( nodeIsIFRAME(element) )
281
+ return no;
282
+
283
+ /* fix for bug 770297 */
284
+ if (nodeIsTEXTAREA(element))
285
+ return no;
286
+
287
+ if ( attrGetID(element) || attrGetNAME(element) )
288
+ return no;
289
+
290
+ /* fix for bug 695408; a better fix would look for unknown and */
291
+ /* known proprietary attributes that make the element significant */
292
+ if (attrGetDATAFLD(element))
293
+ return no;
294
+
295
+ /* fix for bug 723772, don't trim new-...-tags */
296
+ if (element->tag->id == TidyTag_UNKNOWN)
297
+ return no;
298
+
299
+ if (nodeIsBODY(element))
300
+ return no;
301
+
302
+ if (nodeIsCOLGROUP(element))
303
+ return no;
304
+
305
+ return yes;
306
+ }
307
+
308
+ /* return next element */
309
+ Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
310
+ {
311
+ if ( CanPrune(doc, element) )
312
+ {
313
+ if (element->type != TextNode)
314
+ TY_(ReportNotice)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
315
+
316
+ return TY_(DiscardElement)(doc, element);
317
+ }
318
+ return element->next;
319
+ }
320
+
321
+ Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
322
+ {
323
+ Node* next;
324
+
325
+ while (node)
326
+ {
327
+ next = node->next;
328
+
329
+ if (node->content)
330
+ TY_(DropEmptyElements)(doc, node->content);
331
+
332
+ if (!TY_(nodeIsElement)(node) &&
333
+ !(TY_(nodeIsText)(node) && !(node->start < node->end)))
334
+ {
335
+ node = next;
336
+ continue;
337
+ }
338
+
339
+ next = TY_(TrimEmptyElement)(doc, node);
340
+ node = next;
341
+ }
342
+
343
+ return node;
344
+ }
345
+
346
+ /*
347
+ errors in positioning of form start or end tags
348
+ generally require human intervention to fix
349
+ */
350
+ static void BadForm( TidyDocImpl* doc )
351
+ {
352
+ doc->badForm = yes;
353
+ /* doc->errors++; */
354
+ }
355
+
356
+ /*
357
+ This maps
358
+ <em>hello </em><strong>world</strong>
359
+ to
360
+ <em>hello</em> <strong>world</strong>
361
+
362
+ If last child of element is a text node
363
+ then trim trailing white space character
364
+ moving it to after element's end tag.
365
+ */
366
+ static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
367
+ {
368
+ Lexer* lexer = doc->lexer;
369
+ byte c;
370
+
371
+ if (TY_(nodeIsText)(last))
372
+ {
373
+ if (last->end > last->start)
374
+ {
375
+ c = (byte) lexer->lexbuf[ last->end - 1 ];
376
+
377
+ if ( c == ' '
378
+ #ifdef COMMENT_NBSP_FIX
379
+ || c == 160
380
+ #endif
381
+ )
382
+ {
383
+ #ifdef COMMENT_NBSP_FIX
384
+ /* take care with <td>&nbsp;</td> */
385
+ if ( c == 160 &&
386
+ ( element->tag == doc->tags.tag_td ||
387
+ element->tag == doc->tags.tag_th )
388
+ )
389
+ {
390
+ if (last->end > last->start + 1)
391
+ last->end -= 1;
392
+ }
393
+ else
394
+ #endif
395
+ {
396
+ last->end -= 1;
397
+ if ( (element->tag->model & CM_INLINE) &&
398
+ !(element->tag->model & CM_FIELD) )
399
+ lexer->insertspace = yes;
400
+ }
401
+ }
402
+ }
403
+ }
404
+ }
405
+
406
#if 0
/*
  Disabled (compiled out by "#if 0"). Creates a new node whose
  start/end range covers bytes appended to the lexer buffer that
  spell out "element" as literal markup.
*/
static Node *EscapeTag(Lexer *lexer, Node *element)
{
    Node *escaped = NewNode(lexer->allocator, lexer);

    escaped->start = lexer->lexsize;
    AddByte(lexer, '<');

    if (element->type == EndTag)
        AddByte(lexer, '/');

    if (element->element)
    {
        char *cp = element->element;
        while (*cp != '\0')
        {
            AddByte(lexer, *cp);
            ++cp;
        }
    }
    else if (element->type == DocTypeTag)
    {
        uint pos;
        AddStringLiteral( lexer, "!DOCTYPE " );
        for (pos = element->start; pos < element->end; ++pos)
            AddByte(lexer, lexer->lexbuf[pos]);
    }

    if (element->type == StartEndTag)
        AddByte(lexer, '/');

    AddByte(lexer, '>');
    escaped->end = lexer->lexsize;

    return escaped;
}
#endif /* 0 */
440
+
441
+ /* Only true for text nodes. */
442
+ Bool TY_(IsBlank)(Lexer *lexer, Node *node)
443
+ {
444
+ Bool isBlank = TY_(nodeIsText)(node);
445
+ if ( isBlank )
446
+ isBlank = ( node->end == node->start || /* Zero length */
447
+ ( node->end == node->start+1 /* or one blank. */
448
+ && lexer->lexbuf[node->start] == ' ' ) );
449
+ return isBlank;
450
+ }
451
+
452
+ /*
453
+ This maps
454
+ <p>hello<em> world</em>
455
+ to
456
+ <p>hello <em>world</em>
457
+
458
+ Trims initial space, by moving it before the
459
+ start tag, or if this element is the first in
460
+ parent's content, then by discarding the space
461
+ */
462
+ static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
463
+ {
464
+ Lexer* lexer = doc->lexer;
465
+ Node *prev, *node;
466
+
467
+ if ( TY_(nodeIsText)(text) &&
468
+ lexer->lexbuf[text->start] == ' ' &&
469
+ text->start < text->end )
470
+ {
471
+ if ( (element->tag->model & CM_INLINE) &&
472
+ !(element->tag->model & CM_FIELD) )
473
+ {
474
+ prev = element->prev;
475
+
476
+ if (TY_(nodeIsText)(prev))
477
+ {
478
+ if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
479
+ lexer->lexbuf[(prev->end)++] = ' ';
480
+
481
+ ++(element->start);
482
+ }
483
+ else /* create new node */
484
+ {
485
+ node = TY_(NewNode)(lexer->allocator, lexer);
486
+ node->start = (element->start)++;
487
+ node->end = element->start;
488
+ lexer->lexbuf[node->start] = ' ';
489
+ TY_(InsertNodeBeforeElement)(element ,node);
490
+ }
491
+ }
492
+
493
+ /* discard the space in current node */
494
+ ++(text->start);
495
+ }
496
+ }
497
+
498
+ static Bool IsPreDescendant(Node* node)
499
+ {
500
+ Node *parent = node->parent;
501
+
502
+ while (parent)
503
+ {
504
+ if (parent->tag && parent->tag->parser == TY_(ParsePre))
505
+ return yes;
506
+
507
+ parent = parent->parent;
508
+ }
509
+
510
+ return no;
511
+ }
512
+
513
+ static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
514
+ {
515
+ Node* next;
516
+
517
+ if (!TY_(nodeIsText)(node))
518
+ return no;
519
+
520
+ if (node->parent->type == DocTypeTag)
521
+ return no;
522
+
523
+ if (IsPreDescendant(node))
524
+ return no;
525
+
526
+ if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
527
+ return no;
528
+
529
+ next = node->next;
530
+
531
+ /* <p>... </p> */
532
+ if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
533
+ return yes;
534
+
535
+ /* <div><small>... </small><h3>...</h3></div> */
536
+ if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
537
+ return yes;
538
+
539
+ if (!next)
540
+ return no;
541
+
542
+ if (nodeIsBR(next))
543
+ return yes;
544
+
545
+ if (TY_(nodeHasCM)(next, CM_INLINE))
546
+ return no;
547
+
548
+ /* <a href='/'>...</a> <p>...</p> */
549
+ if (next->type == StartTag)
550
+ return yes;
551
+
552
+ /* <strong>...</strong> <hr /> */
553
+ if (next->type == StartEndTag)
554
+ return yes;
555
+
556
+ /* evil adjacent text nodes, Tidy should not generate these :-( */
557
+ if (TY_(nodeIsText)(next) && next->start < next->end
558
+ && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
559
+ return yes;
560
+
561
+ return no;
562
+ }
563
+
564
+ static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
565
+ {
566
+ if (!TY_(nodeIsText)(node))
567
+ return no;
568
+
569
+ if (node->parent->type == DocTypeTag)
570
+ return no;
571
+
572
+ if (IsPreDescendant(node))
573
+ return no;
574
+
575
+ if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
576
+ return no;
577
+
578
+ /* <p>...<br> <em>...</em>...</p> */
579
+ if (nodeIsBR(node->prev))
580
+ return yes;
581
+
582
+ /* <p> ...</p> */
583
+ if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
584
+ return yes;
585
+
586
+ /* <h4>...</h4> <em>...</em> */
587
+ if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
588
+ TY_(nodeIsElement)(node->prev))
589
+ return yes;
590
+
591
+ /* <p><span> ...</span></p> */
592
+ if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
593
+ return yes;
594
+
595
+ return no;
596
+ }
597
+
598
+ static void CleanSpaces(TidyDocImpl* doc, Node* node)
599
+ {
600
+ Node* next;
601
+
602
+ while (node)
603
+ {
604
+ next = node->next;
605
+
606
+ if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node))
607
+ while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
608
+ ++(node->start);
609
+
610
+ if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node))
611
+ while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
612
+ --(node->end);
613
+
614
+ if (TY_(nodeIsText)(node) && !(node->start < node->end))
615
+ {
616
+ TY_(RemoveNode)(node);
617
+ TY_(FreeNode)(doc, node);
618
+ node = next;
619
+
620
+ continue;
621
+ }
622
+
623
+ if (node->content)
624
+ CleanSpaces(doc, node->content);
625
+
626
+ node = next;
627
+ }
628
+ }
629
+
630
+ /*
631
+ Move initial and trailing space out.
632
+ This routine maps:
633
+
634
+ hello<em> world</em>
635
+ to
636
+ hello <em>world</em>
637
+ and
638
+ <em>hello </em><strong>world</strong>
639
+ to
640
+ <em>hello</em> <strong>world</strong>
641
+ */
642
+ static void TrimSpaces( TidyDocImpl* doc, Node *element)
643
+ {
644
+ Node* text = element->content;
645
+
646
+ if (nodeIsPRE(element) || IsPreDescendant(element))
647
+ return;
648
+
649
+ if (TY_(nodeIsText)(text))
650
+ TrimInitialSpace(doc, element, text);
651
+
652
+ text = element->last;
653
+
654
+ if (TY_(nodeIsText)(text))
655
+ TrimTrailingSpace(doc, element, text);
656
+ }
657
+
658
+ static Bool DescendantOf( Node *element, TidyTagId tid )
659
+ {
660
+ Node *parent;
661
+ for ( parent = element->parent;
662
+ parent != NULL;
663
+ parent = parent->parent )
664
+ {
665
+ if ( TagIsId(parent, tid) )
666
+ return yes;
667
+ }
668
+ return no;
669
+ }
670
+
671
+ static Bool InsertMisc(Node *element, Node *node)
672
+ {
673
+ if (node->type == CommentTag ||
674
+ node->type == ProcInsTag ||
675
+ node->type == CDATATag ||
676
+ node->type == SectionTag ||
677
+ node->type == AspTag ||
678
+ node->type == JsteTag ||
679
+ node->type == PhpTag )
680
+ {
681
+ TY_(InsertNodeAtEnd)(element, node);
682
+ return yes;
683
+ }
684
+
685
+ if ( node->type == XmlDecl )
686
+ {
687
+ Node* root = element;
688
+ while ( root && root->parent )
689
+ root = root->parent;
690
+ if ( root && !(root->content && root->content->type == XmlDecl))
691
+ {
692
+ TY_(InsertNodeAtStart)( root, node );
693
+ return yes;
694
+ }
695
+ }
696
+
697
+ /* Declared empty tags seem to be slipping through
698
+ ** the cracks. This is an experiment to figure out
699
+ ** a decent place to pick them up.
700
+ */
701
+ if ( node->tag &&
702
+ TY_(nodeIsElement)(node) &&
703
+ TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
704
+ (node->tag->versions & VERS_PROPRIETARY) != 0 )
705
+ {
706
+ TY_(InsertNodeAtEnd)(element, node);
707
+ return yes;
708
+ }
709
+
710
+ return no;
711
+ }
712
+
713
+
714
+ static void ParseTag( TidyDocImpl* doc, Node *node, GetTokenMode mode )
715
+ {
716
+ Lexer* lexer = doc->lexer;
717
+ /*
718
+ Fix by GLP 2000-12-21. Need to reset insertspace if this
719
+ is both a non-inline and empty tag (base, link, meta, isindex, hr, area).
720
+ */
721
+ if (node->tag->model & CM_EMPTY)
722
+ {
723
+ lexer->waswhite = no;
724
+ if (node->tag->parser == NULL)
725
+ return;
726
+ }
727
+ else if (!(node->tag->model & CM_INLINE))
728
+ lexer->insertspace = no;
729
+
730
+ if (node->tag->parser == NULL)
731
+ return;
732
+
733
+ if (node->type == StartEndTag)
734
+ return;
735
+
736
+ (*node->tag->parser)( doc, node, mode );
737
+ }
738
+
739
+ /*
740
+ the doctype has been found after other tags,
741
+ and needs moving to before the html element
742
+ */
743
+ static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
744
+ {
745
+ Node* existing = TY_(FindDocType)( doc );
746
+ if ( existing )
747
+ {
748
+ TY_(ReportError)(doc, element, doctype, DISCARDING_UNEXPECTED );
749
+ TY_(FreeNode)( doc, doctype );
750
+ }
751
+ else
752
+ {
753
+ TY_(ReportError)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
754
+ while ( !nodeIsHTML(element) )
755
+ element = element->parent;
756
+ TY_(InsertNodeBeforeElement)( element, doctype );
757
+ }
758
+ }
759
+
760
+ /*
761
+ move node to the head, where element is used as starting
762
+ point in hunt for head. normally called during parsing
763
+ */
764
+ static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
765
+ {
766
+ Node *head;
767
+
768
+ TY_(RemoveNode)( node ); /* make sure that node is isolated */
769
+
770
+ if ( TY_(nodeIsElement)(node) )
771
+ {
772
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN );
773
+
774
+ head = TY_(FindHEAD)(doc);
775
+ assert(head != NULL);
776
+
777
+ TY_(InsertNodeAtEnd)(head, node);
778
+
779
+ if ( node->tag->parser )
780
+ ParseTag( doc, node, IgnoreWhitespace );
781
+ }
782
+ else
783
+ {
784
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
785
+ TY_(FreeNode)( doc, node );
786
+ }
787
+ }
788
+
789
+ /* moves given node to end of body element */
790
+ static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
791
+ {
792
+ Node* body = TY_(FindBody)( doc );
793
+ if ( body )
794
+ {
795
+ TY_(RemoveNode)( node );
796
+ TY_(InsertNodeAtEnd)( body, node );
797
+ }
798
+ }
799
+
800
+ static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
801
+ {
802
+ ctmbstr sprop =
803
+ "padding-left: 2ex; margin-left: 0ex"
804
+ "; margin-top: 0ex; margin-bottom: 0ex";
805
+ if ( !cfgBool(doc, TidyDecorateInferredUL) )
806
+ return;
807
+ if ( cfgBool(doc, TidyMakeClean) )
808
+ TY_(AddStyleAsClass)( doc, node, sprop );
809
+ else
810
+ TY_(AddStyleProperty)( doc, node, sprop );
811
+ }
812
+
813
+ /*
814
+ element is node created by the lexer
815
+ upon seeing the start tag, or by the
816
+ parser when the start tag is inferred
817
+ */
818
+ void TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode)
819
+ {
820
+ Lexer* lexer = doc->lexer;
821
+ Node *node;
822
+ Bool checkstack = yes;
823
+ uint istackbase = 0;
824
+
825
+ if ( element->tag->model & CM_EMPTY )
826
+ return;
827
+
828
+ if ( nodeIsFORM(element) &&
829
+ DescendantOf(element, TidyTag_FORM) )
830
+ TY_(ReportError)(doc, element, NULL, ILLEGAL_NESTING );
831
+
832
+ /*
833
+ InlineDup() asks the lexer to insert inline emphasis tags
834
+ currently pushed on the istack, but take care to avoid
835
+ propagating inline emphasis inside OBJECT or APPLET.
836
+ For these elements a fresh inline stack context is created
837
+ and disposed of upon reaching the end of the element.
838
+ They thus behave like table cells in this respect.
839
+ */
840
+ if (element->tag->model & CM_OBJECT)
841
+ {
842
+ istackbase = lexer->istackbase;
843
+ lexer->istackbase = lexer->istacksize;
844
+ }
845
+
846
+ if (!(element->tag->model & CM_MIXED))
847
+ TY_(InlineDup)( doc, NULL );
848
+
849
+ mode = IgnoreWhitespace;
850
+
851
+ while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
852
+ {
853
+ /* end tag for this element */
854
+ if (node->type == EndTag && node->tag &&
855
+ (node->tag == element->tag || element->was == node->tag))
856
+ {
857
+ TY_(FreeNode)( doc, node );
858
+
859
+ if (element->tag->model & CM_OBJECT)
860
+ {
861
+ /* pop inline stack */
862
+ while (lexer->istacksize > lexer->istackbase)
863
+ TY_(PopInline)( doc, NULL );
864
+ lexer->istackbase = istackbase;
865
+ }
866
+
867
+ element->closed = yes;
868
+ TrimSpaces( doc, element );
869
+ return;
870
+ }
871
+
872
+ if ( nodeIsBODY( node ) && DescendantOf( element, TidyTag_HEAD ))
873
+ {
874
+ /* If we're in the HEAD, close it before proceeding.
875
+ This is an extremely rare occurrence, but has been observed.
876
+ */
877
+ TY_(UngetToken)( doc );
878
+ break;
879
+ }
880
+
881
+ if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
882
+ {
883
+ if ( TY_(nodeIsElement)(node) )
884
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
885
+ TY_(FreeNode)( doc, node );
886
+ continue;
887
+ }
888
+
889
+
890
+ if (node->type == EndTag)
891
+ {
892
+ if (node->tag == NULL)
893
+ {
894
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
895
+ TY_(FreeNode)( doc, node );
896
+ continue;
897
+ }
898
+ else if ( nodeIsBR(node) )
899
+ node->type = StartTag;
900
+ else if ( nodeIsP(node) )
901
+ {
902
+ /* Cannot have a block inside a paragraph, so no checking
903
+ for an ancestor is necessary -- but we _can_ have
904
+ paragraphs inside a block, so change it to an implicit
905
+ empty paragraph, to be dealt with according to the user's
906
+ options
907
+ */
908
+ node->type = StartEndTag;
909
+ node->implicit = yes;
910
+ #if OBSOLETE
911
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
912
+ TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
913
+ TY_(InsertNodeAtEnd)( element, node );
914
+ node = InferredTag(doc, TidyTag_BR);
915
+ #endif
916
+ }
917
+ else if (DescendantOf( element, node->tag->id ))
918
+ {
919
+ /*
920
+ if this is the end tag for an ancestor element
921
+ then infer end tag for this element
922
+ */
923
+ TY_(UngetToken)( doc );
924
+ break;
925
+ #if OBSOLETE
926
+ Node *parent;
927
+ for ( parent = element->parent;
928
+ parent != NULL;
929
+ parent = parent->parent )
930
+ {
931
+ if (node->tag == parent->tag)
932
+ {
933
+ if (!(element->tag->model & CM_OPT))
934
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
935
+
936
+ TY_(UngetToken)( doc );
937
+
938
+ if (element->tag->model & CM_OBJECT)
939
+ {
940
+ /* pop inline stack */
941
+ while (lexer->istacksize > lexer->istackbase)
942
+ TY_(PopInline)( doc, NULL );
943
+ lexer->istackbase = istackbase;
944
+ }
945
+
946
+ TrimSpaces( doc, element );
947
+ return;
948
+ }
949
+ }
950
+ #endif
951
+ }
952
+ else
953
+ {
954
+ /* special case </tr> etc. for stuff moved in front of table */
955
+ if ( lexer->exiled
956
+ && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
957
+ {
958
+ TY_(UngetToken)( doc );
959
+ TrimSpaces( doc, element );
960
+ return;
961
+ }
962
+ }
963
+ }
964
+
965
+ /* mixed content model permits text */
966
+ if (TY_(nodeIsText)(node))
967
+ {
968
+ if ( checkstack )
969
+ {
970
+ checkstack = no;
971
+ if (!(element->tag->model & CM_MIXED))
972
+ {
973
+ if ( TY_(InlineDup)(doc, node) > 0 )
974
+ continue;
975
+ }
976
+ }
977
+
978
+ TY_(InsertNodeAtEnd)(element, node);
979
+ mode = MixedContent;
980
+
981
+ /*
982
+ HTML4 strict doesn't allow mixed content for
983
+ elements with %block; as their content model
984
+ */
985
+ /*
986
+ But only body, map, blockquote, form and
987
+ noscript have content model %block;
988
+ */
989
+ if ( nodeIsBODY(element) ||
990
+ nodeIsMAP(element) ||
991
+ nodeIsBLOCKQUOTE(element) ||
992
+ nodeIsFORM(element) ||
993
+ nodeIsNOSCRIPT(element) )
994
+ TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
995
+ continue;
996
+ }
997
+
998
+ if ( InsertMisc(element, node) )
999
+ continue;
1000
+
1001
+ /* allow PARAM elements? */
1002
+ if ( nodeIsPARAM(node) )
1003
+ {
1004
+ if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
1005
+ {
1006
+ TY_(InsertNodeAtEnd)(element, node);
1007
+ continue;
1008
+ }
1009
+
1010
+ /* otherwise discard it */
1011
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1012
+ TY_(FreeNode)( doc, node );
1013
+ continue;
1014
+ }
1015
+
1016
+ /* allow AREA elements? */
1017
+ if ( nodeIsAREA(node) )
1018
+ {
1019
+ if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
1020
+ {
1021
+ TY_(InsertNodeAtEnd)(element, node);
1022
+ continue;
1023
+ }
1024
+
1025
+ /* otherwise discard it */
1026
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1027
+ TY_(FreeNode)( doc, node );
1028
+ continue;
1029
+ }
1030
+
1031
+ /* ignore unknown start/end tags */
1032
+ if ( node->tag == NULL )
1033
+ {
1034
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1035
+ TY_(FreeNode)( doc, node );
1036
+ continue;
1037
+ }
1038
+
1039
+ /*
1040
+ Allow CM_INLINE elements here.
1041
+
1042
+ Allow CM_BLOCK elements here unless
1043
+ lexer->excludeBlocks is yes.
1044
+
1045
+ LI and DD are special cased.
1046
+
1047
+ Otherwise infer end tag for this element.
1048
+ */
1049
+
1050
+ if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1051
+ {
1052
+ if ( !TY_(nodeIsElement)(node) )
1053
+ {
1054
+ if ( nodeIsFORM(node) )
1055
+ BadForm( doc );
1056
+
1057
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1058
+ TY_(FreeNode)( doc, node );
1059
+ continue;
1060
+ }
1061
+
1062
+ /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1063
+ /*
1064
+ If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1065
+ start tag, discard the start tag and let the subsequent content get
1066
+ parsed as content of the enclosing LI. This seems to mimic IE and
1067
+ Netscape, and avoids an infinite loop: without this check,
1068
+ ParseBlock (which is parsing the LI's content) and ParseList (which
1069
+ is parsing the LI's parent's content) repeatedly defer to each
1070
+ other to parse the illegal start tag, each time inferring a missing
1071
+ </li> or <li> respectively.
1072
+
1073
+ NOTE: This check is a bit fragile. It specifically checks for the
1074
+ four tags that happen to weave their way through the current series
1075
+ of tests performed by ParseBlock and ParseList to trigger the
1076
+ infinite loop.
1077
+ */
1078
+ if ( nodeIsLI(element) )
1079
+ {
1080
+ if ( nodeIsFRAME(node) ||
1081
+ nodeIsFRAMESET(node) ||
1082
+ nodeIsOPTGROUP(node) ||
1083
+ nodeIsOPTION(node) )
1084
+ {
1085
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1086
+ TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */
1087
+ continue;
1088
+ }
1089
+ }
1090
+
1091
+ if ( nodeIsTD(element) || nodeIsTH(element) )
1092
+ {
1093
+ /* if parent is a table cell, avoid inferring the end of the cell */
1094
+
1095
+ if ( TY_(nodeHasCM)(node, CM_HEAD) )
1096
+ {
1097
+ MoveToHead( doc, element, node );
1098
+ continue;
1099
+ }
1100
+
1101
+ if ( TY_(nodeHasCM)(node, CM_LIST) )
1102
+ {
1103
+ TY_(UngetToken)( doc );
1104
+ node = TY_(InferredTag)(doc, TidyTag_UL);
1105
+ AddClassNoIndent(doc, node);
1106
+ lexer->excludeBlocks = yes;
1107
+ }
1108
+ else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1109
+ {
1110
+ TY_(UngetToken)( doc );
1111
+ node = TY_(InferredTag)(doc, TidyTag_DL);
1112
+ lexer->excludeBlocks = yes;
1113
+ }
1114
+
1115
+ /* infer end of current table cell */
1116
+ if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1117
+ {
1118
+ TY_(UngetToken)( doc );
1119
+ TrimSpaces( doc, element );
1120
+ return;
1121
+ }
1122
+ }
1123
+ else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1124
+ {
1125
+ if ( lexer->excludeBlocks )
1126
+ {
1127
+ if ( !TY_(nodeHasCM)(element, CM_OPT) )
1128
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1129
+
1130
+ TY_(UngetToken)( doc );
1131
+
1132
+ if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1133
+ lexer->istackbase = istackbase;
1134
+
1135
+ TrimSpaces( doc, element );
1136
+ return;
1137
+ }
1138
+ }
1139
+ else /* things like list items */
1140
+ {
1141
+ if (node->tag->model & CM_HEAD)
1142
+ {
1143
+ MoveToHead( doc, element, node );
1144
+ continue;
1145
+ }
1146
+
1147
+ /*
1148
+ special case where a form start tag
1149
+ occurs in a tr and is followed by td or th
1150
+ */
1151
+
1152
+ if ( nodeIsFORM(element) &&
1153
+ nodeIsTD(element->parent) &&
1154
+ element->parent->implicit )
1155
+ {
1156
+ if ( nodeIsTD(node) )
1157
+ {
1158
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1159
+ TY_(FreeNode)( doc, node );
1160
+ continue;
1161
+ }
1162
+
1163
+ if ( nodeIsTH(node) )
1164
+ {
1165
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1166
+ TY_(FreeNode)( doc, node );
1167
+ node = element->parent;
1168
+ TidyDocFree(doc, node->element);
1169
+ node->element = TY_(tmbstrdup)(doc->allocator, "th");
1170
+ node->tag = TY_(LookupTagDef)( TidyTag_TH );
1171
+ continue;
1172
+ }
1173
+ }
1174
+
1175
+ if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1176
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
1177
+
1178
+ TY_(UngetToken)( doc );
1179
+
1180
+ if ( TY_(nodeHasCM)(node, CM_LIST) )
1181
+ {
1182
+ if ( element->parent && element->parent->tag &&
1183
+ element->parent->tag->parser == TY_(ParseList) )
1184
+ {
1185
+ TrimSpaces( doc, element );
1186
+ return;
1187
+ }
1188
+
1189
+ node = TY_(InferredTag)(doc, TidyTag_UL);
1190
+ AddClassNoIndent(doc, node);
1191
+ }
1192
+ else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1193
+ {
1194
+ if ( nodeIsDL(element->parent) )
1195
+ {
1196
+ TrimSpaces( doc, element );
1197
+ return;
1198
+ }
1199
+
1200
+ node = TY_(InferredTag)(doc, TidyTag_DL);
1201
+ }
1202
+ else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1203
+ {
1204
+ /* http://tidy.sf.net/issue/1316307 */
1205
+ /* In exiled mode, return so table processing can
1206
+ continue. */
1207
+ if (lexer->exiled)
1208
+ return;
1209
+ node = TY_(InferredTag)(doc, TidyTag_TABLE);
1210
+ }
1211
+ else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1212
+ {
1213
+ /* pop inline stack */
1214
+ while ( lexer->istacksize > lexer->istackbase )
1215
+ TY_(PopInline)( doc, NULL );
1216
+ lexer->istackbase = istackbase;
1217
+ TrimSpaces( doc, element );
1218
+ return;
1219
+
1220
+ }
1221
+ else
1222
+ {
1223
+ TrimSpaces( doc, element );
1224
+ return;
1225
+ }
1226
+ }
1227
+ }
1228
+
1229
+ /* parse known element */
1230
+ if (TY_(nodeIsElement)(node))
1231
+ {
1232
+ if (node->tag->model & CM_INLINE)
1233
+ {
1234
+ if (checkstack && !node->implicit)
1235
+ {
1236
+ checkstack = no;
1237
+
1238
+ if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1239
+ {
1240
+ if ( TY_(InlineDup)(doc, node) > 0 )
1241
+ continue;
1242
+ }
1243
+ }
1244
+
1245
+ mode = MixedContent;
1246
+ }
1247
+ else
1248
+ {
1249
+ checkstack = yes;
1250
+ mode = IgnoreWhitespace;
1251
+ }
1252
+
1253
+ /* trim white space before <br> */
1254
+ if ( nodeIsBR(node) )
1255
+ TrimSpaces( doc, element );
1256
+
1257
+ TY_(InsertNodeAtEnd)(element, node);
1258
+
1259
+ if (node->implicit)
1260
+ TY_(ReportError)(doc, element, node, INSERTING_TAG );
1261
+
1262
+ ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
1263
+ continue;
1264
+ }
1265
+
1266
+ /* discard unexpected tags */
1267
+ if (node->type == EndTag)
1268
+ TY_(PopInline)( doc, node ); /* if inline end tag */
1269
+
1270
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
1271
+ TY_(FreeNode)( doc, node );
1272
+ continue;
1273
+ }
1274
+
1275
+ if (!(element->tag->model & CM_OPT))
1276
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
1277
+
1278
+ if (element->tag->model & CM_OBJECT)
1279
+ {
1280
+ /* pop inline stack */
1281
+ while ( lexer->istacksize > lexer->istackbase )
1282
+ TY_(PopInline)( doc, NULL );
1283
+ lexer->istackbase = istackbase;
1284
+ }
1285
+
1286
+ TrimSpaces( doc, element );
1287
+ }
1288
+ /*
+  ParseInline: parse the content of "element" as inline (mixed) content.
+
+  Tokens are read with GetToken until one of the following happens:
+    - the matching end tag is seen (element->closed is set and we return),
+    - a token is seen that implies the end of this element; it is normally
+      pushed back with UngetToken so that the caller can handle it,
+    - the token stream is exhausted (MISSING_ENDTAG_FOR is reported unless
+      the element's end tag is optional, CM_OPT).
+
+  doc     - document being parsed; doc->lexer supplies the tokens
+  element - the node whose content is parsed; returns at once if CM_EMPTY
+  mode    - Preformatted is kept as is, any other mode becomes MixedContent
+ */
2648
+ void TY_(ParseInline)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
2649
+ {
2650
+ Lexer* lexer = doc->lexer;
2651
+ Node *node, *parent;
2652
+
2653
+ if (element->tag->model & CM_EMPTY)
2654
+ return;
2655
+
2656
+ /*
2657
+ ParseInline is used for some block level elements like H1 to H6
2658
+ For such elements we need to insert inline emphasis tags currently
2659
+ on the inline stack. For Inline elements, we normally push them
2660
+ onto the inline stack provided they aren't implicit or OBJECT/APPLET.
2661
+ This test is carried out in PushInline and PopInline, see istack.c
2662
+
2663
+ InlineDup(...) is not called for elements with a CM_MIXED (inline and
2664
+ block) content model, e.g. <del> or <ins>, otherwise constructs like
2665
+
2666
+ <p>111<a name='foo'>222<del>333</del>444</a>555</p>
2667
+ <p>111<span>222<del>333</del>444</span>555</p>
2668
+ <p>111<em>222<del>333</del>444</em>555</p>
2669
+
2670
+ will get corrupted.
2671
+ */
2672
+ if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
2673
+ !TY_(nodeHasCM)(element, CM_MIXED))
2674
+ TY_(InlineDup)(doc, NULL);
2675
+ else if (TY_(nodeHasCM)(element, CM_INLINE))
2676
+ TY_(PushInline)(doc, element);
2677
+
2678
+ if ( nodeIsNOBR(element) )
2679
+ doc->badLayout |= USING_NOBR;
2680
+ else if ( nodeIsFONT(element) )
2681
+ doc->badLayout |= USING_FONT;
2682
+
2683
+ /* Inline elements may or may not be within a preformatted element */
2684
+ if (mode != Preformatted)
2685
+ mode = MixedContent;
2686
+
2687
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
2688
+ {
2689
+ /* end tag for current element */
2690
+ if (node->tag == element->tag && node->type == EndTag)
2691
+ {
2692
+ if (element->tag->model & CM_INLINE)
2693
+ TY_(PopInline)( doc, node );
2694
+
2695
+ TY_(FreeNode)( doc, node );
2696
+
2697
+ if (!(mode & Preformatted))
2698
+ TrimSpaces(doc, element);
2699
+
2700
+ /*
2701
+ if a font element wraps an anchor and nothing else
2702
+ then move the font element inside the anchor since
2703
+ otherwise it won't alter the anchor text color
2704
+ */
2705
+ if ( nodeIsFONT(element) &&
2706
+ element->content && element->content == element->last )
2707
+ {
2708
+ Node *child = element->content;
2709
+
2710
+ if ( nodeIsA(child) )
2711
+ {
2712
+ child->parent = element->parent;
2713
+ child->next = element->next;
2714
+ child->prev = element->prev;
2715
+
2716
+ element->next = NULL;
2717
+ element->prev = NULL;
2718
+ element->parent = child;
2719
+
2720
+ element->content = child->content;
2721
+ element->last = child->last;
2722
+ child->content = element;
2723
+
2724
+ TY_(FixNodeLinks)(child);
2725
+ TY_(FixNodeLinks)(element);
2726
+ }
2727
+ }
2728
+
2729
+ element->closed = yes;
2730
+ TrimSpaces( doc, element );
2731
+ return;
2732
+ }
2733
+
2734
+ /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
2735
+ /* (see additional conditions below) */
2736
+ /* otherwise emphasis nesting is probably unintentional */
2737
+ /* big, small, sub, sup have a cumulative effect, so leave them alone */
2738
+ if ( node->type == StartTag
2739
+ && node->tag == element->tag
2740
+ && TY_(IsPushed)( doc, node )
2741
+ && !node->implicit
2742
+ && !element->implicit
2743
+ && node->tag && (node->tag->model & CM_INLINE)
2744
+ && !nodeIsA(node)
2745
+ && !nodeIsFONT(node)
2746
+ && !nodeIsBIG(node)
2747
+ && !nodeIsSMALL(node)
2748
+ && !nodeIsSUB(node)
2749
+ && !nodeIsSUP(node)
2750
+ && !nodeIsQ(node)
2751
+ && !nodeIsSPAN(node)
2752
+ )
2753
+ {
2754
+ /* proceeds only if "node" does not have any attribute and
2755
+ follows a text node not finishing with a space */
2756
+ if (element->content != NULL && node->attributes == NULL
2757
+ && TY_(nodeIsText)(element->last)
2758
+ && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
2759
+ {
2760
+ TY_(ReportWarning)(doc, element, node, COERCE_TO_ENDTAG_WARN);
2761
+ node->type = EndTag;
2762
+ TY_(UngetToken)(doc);
2763
+ continue;
2764
+ }
2765
+
2766
+ if (node->attributes == NULL || element->attributes == NULL)
2767
+ TY_(ReportWarning)(doc, element, node, NESTED_EMPHASIS);
2768
+ }
2769
+ else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
2770
+ nodeIsQ(node) )
2771
+ {
2772
+ TY_(ReportWarning)(doc, element, node, NESTED_QUOTATION);
2773
+ }
2774
+
2775
+ if ( TY_(nodeIsText)(node) )
2776
+ {
2777
+ /* only called for 1st child */
2778
+ if ( element->content == NULL && !(mode & Preformatted) )
2779
+ TrimSpaces( doc, element );
2780
+
2781
+ if ( node->start >= node->end )
2782
+ {
2783
+ TY_(FreeNode)( doc, node );
2784
+ continue;
2785
+ }
2786
+
2787
+ TY_(InsertNodeAtEnd)(element, node);
2788
+ continue;
2789
+ }
2790
+
2791
+ /* text was handled above; let InsertMisc try the token next */
2792
+ if (InsertMisc(element, node))
2793
+ continue;
2794
+
2795
+ /* deal with HTML tags */
2796
+ if ( nodeIsHTML(node) )
2797
+ {
2798
+ if ( TY_(nodeIsElement)(node) )
2799
+ {
2800
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED );
2801
+ TY_(FreeNode)( doc, node );
2802
+ continue;
2803
+ }
2804
+
2805
+ /* otherwise infer end of inline element */
2806
+ TY_(UngetToken)( doc );
2807
+
2808
+ if (!(mode & Preformatted))
2809
+ TrimSpaces(doc, element);
2810
+
2811
+ return;
2812
+ }
2813
+
2814
+ /* within <dt> or <pre> map <p> to <br> */
2815
+ if ( nodeIsP(node) &&
2816
+ node->type == StartTag &&
2817
+ ( (mode & Preformatted) ||
2818
+ nodeIsDT(element) ||
2819
+ DescendantOf(element, TidyTag_DT )
2820
+ )
2821
+ )
2822
+ {
2823
+ node->tag = TY_(LookupTagDef)( TidyTag_BR );
2824
+ TidyDocFree(doc, node->element);
2825
+ node->element = TY_(tmbstrdup)(doc->allocator, "br");
2826
+ TrimSpaces(doc, element);
2827
+ TY_(InsertNodeAtEnd)(element, node);
2828
+ continue;
2829
+ }
2830
+
2831
+ /* <p> allowed within <address> in HTML 4.01 Transitional */
2832
+ if ( nodeIsP(node) &&
2833
+ node->type == StartTag &&
2834
+ nodeIsADDRESS(element) )
2835
+ {
2836
+ TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
2837
+ TY_(InsertNodeAtEnd)(element, node);
2838
+ (*node->tag->parser)( doc, node, mode );
2839
+ continue;
2840
+ }
2841
+
2842
+ /* ignore unknown and PARAM tags */
2843
+ if ( node->tag == NULL || nodeIsPARAM(node) )
2844
+ {
2845
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
2846
+ TY_(FreeNode)( doc, node );
2847
+ continue;
2848
+ }
2849
+ /* coerce </br> to <br> up front; NOTE(review): this makes the identical check inside the EndTag branch below never true */
2850
+ if ( nodeIsBR(node) && node->type == EndTag )
2851
+ node->type = StartTag;
2852
+
2853
+ if ( node->type == EndTag )
2854
+ {
2855
+ /* coerce </br> to <br> */
2856
+ if ( nodeIsBR(node) )
2857
+ node->type = StartTag;
2858
+ else if ( nodeIsP(node) )
2859
+ {
2860
+ /* coerce unmatched </p> to <br><br> */
2861
+ if ( !DescendantOf(element, TidyTag_P) )
2862
+ {
2863
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
2864
+ TrimSpaces( doc, element );
2865
+ TY_(InsertNodeAtEnd)( element, node );
2866
+ node = TY_(InferredTag)(doc, TidyTag_BR);
2867
+ TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
2868
+ continue;
2869
+ }
2870
+ }
2871
+ else if ( TY_(nodeHasCM)(node, CM_INLINE)
2872
+ && !nodeIsA(node)
2873
+ && !TY_(nodeHasCM)(node, CM_OBJECT)
2874
+ && TY_(nodeHasCM)(element, CM_INLINE) )
2875
+ {
2876
+ /* allow any inline end tag to end current element */
2877
+
2878
+ /* http://tidy.sf.net/issue/1426419 */
2879
+ /* but, like the browser, retain an earlier inline element.
2880
+ This is implemented by setting the lexer into a mode
2881
+ where it gets tokens from the inline stack rather than
2882
+ from the input stream. Check if the scenario fits. */
2883
+ if ( !nodeIsA(element)
2884
+ && (node->tag != element->tag)
2885
+ && TY_(IsPushed)( doc, node )
2886
+ && TY_(IsPushed)( doc, element ) )
2887
+ {
2888
+ /* we have something like
2889
+ <b>bold <i>bold and italic</b> italics</i> */
2890
+ if ( TY_(SwitchInline)( doc, element, node ) )
2891
+ {
2892
+ TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
2893
+ TY_(UngetToken)( doc ); /* put this back */
2894
+ TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
2895
+ if (!(mode & Preformatted))
2896
+ TrimSpaces( doc, element );
2897
+ return; /* close <i>, but will re-open it, after </b> */
2898
+ }
2899
+ }
2900
+ TY_(PopInline)( doc, element );
2901
+
2902
+ if ( !nodeIsA(element) )
2903
+ {
2904
+ if ( nodeIsA(node) && node->tag != element->tag ) /* NOTE(review): the enclosing branch requires !nodeIsA(node), so this test looks unreachable */
2905
+ {
2906
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
2907
+ TY_(UngetToken)( doc );
2908
+ }
2909
+ else
2910
+ {
2911
+ TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG);
2912
+ TY_(FreeNode)( doc, node);
2913
+ }
2914
+
2915
+ if (!(mode & Preformatted))
2916
+ TrimSpaces(doc, element);
2917
+
2918
+ return;
2919
+ }
2920
+
2921
+ /* if parent is <a> then discard unexpected inline end tag */
2922
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
2923
+ TY_(FreeNode)( doc, node);
2924
+ continue;
2925
+ } /* special case </tr> etc. for stuff moved in front of table */
2926
+ else if ( lexer->exiled
2927
+ && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
2928
+ {
2929
+ TY_(UngetToken)( doc );
2930
+ TrimSpaces(doc, element);
2931
+ return;
2932
+ }
2933
+ }
2934
+
2935
+ /* allow any header tag to end current header */
2936
+ if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
2937
+ {
2938
+
2939
+ if ( node->tag == element->tag )
2940
+ {
2941
+ TY_(ReportError)(doc, element, node, NON_MATCHING_ENDTAG );
2942
+ TY_(FreeNode)( doc, node);
2943
+ }
2944
+ else
2945
+ {
2946
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE );
2947
+ TY_(UngetToken)( doc );
2948
+ }
2949
+
2950
+ if (!(mode & Preformatted))
2951
+ TrimSpaces(doc, element);
2952
+
2953
+ return;
2954
+ }
2955
+
2956
+ /*
2957
+ an <A> tag ends any open <A> element
2958
+ but <A href=...> is mapped to </A><A href=...>
2959
+ */
2960
+ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
2961
+ /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
2962
+ if ( nodeIsA(node) && !node->implicit &&
2963
+ (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
2964
+ {
2965
+ /* coerce <a> to </a> unless it has some attributes */
2966
+ /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
2967
+ /* other fixes by Dave Raggett */
2968
+ /* if (node->attributes == NULL) */
2969
+ if (node->type != EndTag && node->attributes == NULL)
2970
+ {
2971
+ node->type = EndTag;
2972
+ TY_(ReportError)(doc, element, node, COERCE_TO_ENDTAG);
2973
+ /* TY_(PopInline)( doc, node ); */
2974
+ TY_(UngetToken)( doc );
2975
+ continue;
2976
+ }
2977
+
2978
+ TY_(UngetToken)( doc );
2979
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
2980
+ /* TY_(PopInline)( doc, element ); */
2981
+
2982
+ if (!(mode & Preformatted))
2983
+ TrimSpaces(doc, element);
2984
+
2985
+ return;
2986
+ }
2987
+
2988
+ if (element->tag->model & CM_HEADING)
2989
+ {
2990
+ if ( nodeIsCENTER(node) || nodeIsDIV(node) )
2991
+ {
2992
+ if (!TY_(nodeIsElement)(node))
2993
+ {
2994
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
2995
+ TY_(FreeNode)( doc, node);
2996
+ continue;
2997
+ }
2998
+
2999
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
3000
+
3001
+ /* insert center as parent if heading is empty */
3002
+ if (element->content == NULL)
3003
+ {
3004
+ InsertNodeAsParent(element, node);
3005
+ continue;
3006
+ }
3007
+
3008
+ /* split heading and make center parent of 2nd part */
3009
+ TY_(InsertNodeAfterElement)(element, node);
3010
+
3011
+ if (!(mode & Preformatted))
3012
+ TrimSpaces(doc, element);
3013
+
3014
+ element = TY_(CloneNode)( doc, element );
3015
+ TY_(InsertNodeAtEnd)(node, element);
3016
+ continue;
3017
+ }
3018
+
3019
+ if ( nodeIsHR(node) )
3020
+ {
3021
+ if ( !TY_(nodeIsElement)(node) )
3022
+ {
3023
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
3024
+ TY_(FreeNode)( doc, node);
3025
+ continue;
3026
+ }
3027
+
3028
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
3029
+
3030
+ /* insert hr before heading if heading is empty */
3031
+ if (element->content == NULL)
3032
+ {
3033
+ TY_(InsertNodeBeforeElement)(element, node);
3034
+ continue;
3035
+ }
3036
+
3037
+ /* split heading and insert hr before 2nd part */
3038
+ TY_(InsertNodeAfterElement)(element, node);
3039
+
3040
+ if (!(mode & Preformatted))
3041
+ TrimSpaces(doc, element);
3042
+
3043
+ element = TY_(CloneNode)( doc, element );
3044
+ TY_(InsertNodeAfterElement)(node, element);
3045
+ continue;
3046
+ }
3047
+ }
3048
+
3049
+ if ( nodeIsDT(element) )
3050
+ {
3051
+ if ( nodeIsHR(node) )
3052
+ {
3053
+ Node *dd;
3054
+ if ( !TY_(nodeIsElement)(node) )
3055
+ {
3056
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
3057
+ TY_(FreeNode)( doc, node);
3058
+ continue;
3059
+ }
3060
+
3061
+ TY_(ReportError)(doc, element, node, TAG_NOT_ALLOWED_IN);
3062
+ dd = TY_(InferredTag)(doc, TidyTag_DD);
3063
+
3064
+ /* insert hr within dd before dt if dt is empty */
3065
+ if (element->content == NULL)
3066
+ {
3067
+ TY_(InsertNodeBeforeElement)(element, dd);
3068
+ TY_(InsertNodeAtEnd)(dd, node);
3069
+ continue;
3070
+ }
3071
+
3072
+ /* split dt and insert hr within dd before 2nd part */
3073
+ TY_(InsertNodeAfterElement)(element, dd);
3074
+ TY_(InsertNodeAtEnd)(dd, node);
3075
+
3076
+ if (!(mode & Preformatted))
3077
+ TrimSpaces(doc, element);
3078
+
3079
+ element = TY_(CloneNode)( doc, element );
3080
+ TY_(InsertNodeAfterElement)(dd, element);
3081
+ continue;
3082
+ }
3083
+ }
3084
+
3085
+
3086
+ /*
3087
+ if this is the end tag for an ancestor element
3088
+ then infer end tag for this element
3089
+ */
3090
+ if (node->type == EndTag)
3091
+ {
3092
+ for (parent = element->parent;
3093
+ parent != NULL; parent = parent->parent)
3094
+ {
3095
+ if (node->tag == parent->tag)
3096
+ {
3097
+ if (!(element->tag->model & CM_OPT) && !element->implicit)
3098
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
3099
+
3100
+ if( TY_(IsPushedLast)( doc, element, node ) )
3101
+ TY_(PopInline)( doc, element );
3102
+ TY_(UngetToken)( doc );
3103
+
3104
+ if (!(mode & Preformatted))
3105
+ TrimSpaces(doc, element);
3106
+
3107
+ return;
3108
+ }
3109
+ }
3110
+ }
3111
+
3112
+ /* block level tags end this element */
3113
+ if (!(node->tag->model & CM_INLINE) &&
3114
+ !(element->tag->model & CM_MIXED))
3115
+ {
3116
+ if ( !TY_(nodeIsElement)(node) )
3117
+ {
3118
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
3119
+ TY_(FreeNode)( doc, node);
3120
+ continue;
3121
+ }
3122
+
3123
+ if (!(element->tag->model & CM_OPT))
3124
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_BEFORE);
3125
+
3126
+ if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
3127
+ {
3128
+ MoveToHead(doc, element, node);
3129
+ continue;
3130
+ }
3131
+
3132
+ /*
3133
+ prevent anchors from propagating into block tags
3134
+ except for headings h1 to h6
3135
+ */
3136
+ if ( nodeIsA(element) )
3137
+ {
3138
+ if (node->tag && !(node->tag->model & CM_HEADING))
3139
+ TY_(PopInline)( doc, element );
3140
+ else if (!(element->content))
3141
+ {
3142
+ TY_(DiscardElement)( doc, element );
3143
+ TY_(UngetToken)( doc );
3144
+ return;
3145
+ }
3146
+ }
3147
+
3148
+ TY_(UngetToken)( doc );
3149
+
3150
+ if (!(mode & Preformatted))
3151
+ TrimSpaces(doc, element);
3152
+
3153
+ return;
3154
+ }
3155
+
3156
+ /* parse inline element */
3157
+ if (TY_(nodeIsElement)(node))
3158
+ {
3159
+ if (node->implicit)
3160
+ TY_(ReportError)(doc, element, node, INSERTING_TAG);
3161
+
3162
+ /* trim white space before <br> */
3163
+ if ( nodeIsBR(node) )
3164
+ TrimSpaces(doc, element);
3165
+
3166
+ TY_(InsertNodeAtEnd)(element, node);
3167
+ ParseTag(doc, node, mode);
3168
+ continue;
3169
+ }
3170
+
3171
+ /* discard unexpected tags */
3172
+ TY_(ReportError)(doc, element, node, DISCARDING_UNEXPECTED);
3173
+ TY_(FreeNode)( doc, node );
3174
+ continue;
3175
+ }
3176
+ /* the loop only falls through when GetToken returned NULL, so node is NULL here */
3177
+ if (!(element->tag->model & CM_OPT))
3178
+ TY_(ReportError)(doc, element, node, MISSING_ENDTAG_FOR);
3179
+
3180
+ }
1822
+
1823
+ void TY_(ParseEmpty)(TidyDocImpl* doc, Node *element, GetTokenMode mode)
1824
+ {
1825
+ Lexer* lexer = doc->lexer;
1826
+ if ( lexer->isvoyager )
1827
+ {
1828
+ Node *node = TY_(GetToken)( doc, mode);
1829
+ if ( node )
1830
+ {
1831
+ if ( !(node->type == EndTag && node->tag == element->tag) )
1832
+ {
1833
+ TY_(ReportError)(doc, element, node, ELEMENT_NOT_EMPTY);
1834
+ TY_(UngetToken)( doc );
1835
+ }
1836
+ else
1837
+ {
1838
+ TY_(FreeNode)( doc, node );
1839
+ }
1840
+ }
1841
+ }
1842
+ }
1843
+ /*
+  ParseDefList: parse the content of a definition list ("list").
+
+  Tokens are read with IgnoreWhitespace until the list's own end tag is
+  seen (list->closed is set), a token is pushed back for an enclosing
+  parser to handle, or the token stream ends.
+
+  Text and other non-DT/DD content is pushed back and wrapped in an
+  inferred <dt> (for text) or <dd>; <center> splits the list in two.
+
+  mode is only used when parsing the content of such a <center>.
+ */
1844
+ void TY_(ParseDefList)(TidyDocImpl* doc, Node *list, GetTokenMode mode)
1845
+ {
1846
+ Lexer* lexer = doc->lexer;
1847
+ Node *node, *parent;
1848
+
1849
+ if (list->tag->model & CM_EMPTY)
1850
+ return;
1851
+
1852
+ lexer->insert = NULL; /* defer implicit inline start tags */
1853
+
1854
+ while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
1855
+ {
1856
+ if (node->tag == list->tag && node->type == EndTag)
1857
+ {
1858
+ TY_(FreeNode)( doc, node);
1859
+ list->closed = yes;
1860
+ return;
1861
+ }
1862
+
1863
+ /* deal with comments etc. */
1864
+ if (InsertMisc(list, node))
1865
+ continue;
1866
+ /* bare text: push it back and parse it inside an inferred <dt> */
1867
+ if (TY_(nodeIsText)(node))
1868
+ {
1869
+ TY_(UngetToken)( doc );
1870
+ node = TY_(InferredTag)(doc, TidyTag_DT);
1871
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1872
+ }
1873
+
1874
+ if (node->tag == NULL)
1875
+ {
1876
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1877
+ TY_(FreeNode)( doc, node);
1878
+ continue;
1879
+ }
1880
+
1881
+ /*
1882
+ if this is the end tag for an ancestor element
1883
+ then infer end tag for this element
1884
+ */
1885
+ if (node->type == EndTag)
1886
+ {
1887
+ Bool discardIt = no;
1888
+ if ( nodeIsFORM(node) )
1889
+ {
1890
+ BadForm( doc );
1891
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1892
+ TY_(FreeNode)( doc, node );
1893
+ continue;
1894
+ }
1895
+
1896
+ for (parent = list->parent;
1897
+ parent != NULL; parent = parent->parent)
1898
+ {
1899
+ /* Do not match across BODY to avoid infinite loop
1900
+ between ParseBody and this parser,
1901
+ See http://tidy.sf.net/bug/1098012. */
1902
+ if (nodeIsBODY(parent))
1903
+ {
1904
+ discardIt = yes;
1905
+ break;
1906
+ }
1907
+ if (node->tag == parent->tag)
1908
+ {
1909
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
1910
+
1911
+ TY_(UngetToken)( doc );
1912
+ return;
1913
+ }
1914
+ }
1915
+ if (discardIt)
1916
+ {
1917
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1918
+ TY_(FreeNode)( doc, node);
1919
+ continue;
1920
+ }
1921
+ }
1922
+
1923
+ /* center in a dt or a dl breaks the dl list in two */
1924
+ if ( nodeIsCENTER(node) )
1925
+ {
1926
+ if (list->content)
1927
+ TY_(InsertNodeAfterElement)(list, node);
1928
+ else /* trim empty dl list */
1929
+ {
1930
+ TY_(InsertNodeBeforeElement)(list, node);
1931
+
1932
+ /* #540296 tidy dumps with empty definition list */
1933
+ #if 0
1934
+ TY_(DiscardElement)(list);
1935
+ #endif
1936
+ }
1937
+
1938
+ /* #426885 - fix by Glenn Carroll 19 Apr 00, and
1939
+ Gary Dechaines 11 Aug 00 */
1940
+ /* ParseTag can destroy node, if it finds that
1941
+ * this <center> is followed immediately by </center>.
1942
+ * It's awkward but necessary to determine if this
1943
+ * has happened.
1944
+ */
1945
+ parent = node->parent;
1946
+
1947
+ /* and parse contents of center */
1948
+ lexer->excludeBlocks = no;
1949
+ ParseTag( doc, node, mode);
1950
+ lexer->excludeBlocks = yes;
1951
+
1952
+ /* now create a new dl element,
1953
+ * unless node has been blown away because the
1954
+ * center was empty, as above.
1955
+ */
1956
+ if (parent->last == node)
1957
+ {
1958
+ list = TY_(InferredTag)(doc, TidyTag_DL);
1959
+ TY_(InsertNodeAfterElement)(node, list);
1960
+ }
1961
+ continue;
1962
+ }
1963
+ /* anything other than <dt>/<dd>: push it back, then either give up or wrap it in an inferred <dd> */
1964
+ if ( !(nodeIsDT(node) || nodeIsDD(node)) )
1965
+ {
1966
+ TY_(UngetToken)( doc );
1967
+
1968
+ if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
1969
+ {
1970
+ TY_(ReportError)(doc, list, node, TAG_NOT_ALLOWED_IN);
1971
+ return;
1972
+ }
1973
+
1974
+ /* if DD appeared directly in BODY then exclude blocks */
1975
+ if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
1976
+ return;
1977
+
1978
+ node = TY_(InferredTag)(doc, TidyTag_DD);
1979
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG);
1980
+ }
1981
+
1982
+ if (node->type == EndTag)
1983
+ {
1984
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
1985
+ TY_(FreeNode)( doc, node);
1986
+ continue;
1987
+ }
1988
+
1989
+ /* node should be <DT> or <DD>*/
1990
+ TY_(InsertNodeAtEnd)(list, node);
1991
+ ParseTag( doc, node, IgnoreWhitespace);
1992
+ }
1993
+ /* the while loop only ends when GetToken returned NULL, so node is NULL here */
1994
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
1995
+ }
1996
+
1997
+ static Bool FindLastLI( Node *list, Node **lastli )
1998
+ {
1999
+ Node *node;
2000
+
2001
+ *lastli = NULL;
2002
+ for ( node = list->content; node ; node = node->next )
2003
+ if ( nodeIsLI(node) && node->type == StartTag )
2004
+ *lastli=node;
2005
+ return *lastli ? yes:no;
2006
+ }
2007
+ /*
+  ParseList: parse the content of a list element ("list").
+
+  Tokens are read with IgnoreWhitespace until the list's own end tag is
+  seen (list->closed is set), a token is pushed back for an enclosing
+  parser to handle, or the token stream ends.
+
+  A token that is not an <li> is pushed back and then parsed either inside
+  the last existing <li> (ordered lists only) or inside a newly inferred
+  <li> appended to the list.
+
+  The mode argument is unused.
+ */
2008
+ void TY_(ParseList)(TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode))
2009
+ {
2010
+ Lexer* lexer = doc->lexer;
2011
+ Node *node, *parent, *lastli;
2012
+ Bool wasblock;
2013
+
2014
+ if (list->tag->model & CM_EMPTY)
2015
+ return;
2016
+
2017
+ lexer->insert = NULL; /* defer implicit inline start tags */
2018
+
2019
+ while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
2020
+ {
2021
+ if (node->tag == list->tag && node->type == EndTag)
2022
+ {
2023
+ TY_(FreeNode)( doc, node);
2024
+ list->closed = yes;
2025
+ return;
2026
+ }
2027
+
2028
+ /* deal with comments etc. */
2029
+ if (InsertMisc(list, node))
2030
+ continue;
2031
+
2032
+ if (node->type != TextNode && node->tag == NULL)
2033
+ {
2034
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2035
+ TY_(FreeNode)( doc, node);
2036
+ continue;
2037
+ }
2038
+
2039
+ /*
2040
+ if this is the end tag for an ancestor element
2041
+ then infer end tag for this element
2042
+ */
2043
+ if (node->type == EndTag)
2044
+ {
2045
+ if ( nodeIsFORM(node) )
2046
+ {
2047
+ BadForm( doc );
2048
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2049
+ TY_(FreeNode)( doc, node );
2050
+ continue;
2051
+ }
2052
+
2053
+ if (TY_(nodeHasCM)(node,CM_INLINE))
2054
+ {
2055
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2056
+ TY_(PopInline)( doc, node );
2057
+ TY_(FreeNode)( doc, node);
2058
+ continue;
2059
+ }
2060
+
2061
+ for ( parent = list->parent;
2062
+ parent != NULL; parent = parent->parent )
2063
+ {
2064
+ /* Do not match across BODY to avoid infinite loop
2065
+ between ParseBody and this parser,
2066
+ See http://tidy.sf.net/bug/1053626. */
2067
+ if (nodeIsBODY(parent))
2068
+ break;
2069
+ if (node->tag == parent->tag)
2070
+ {
2071
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2072
+ TY_(UngetToken)( doc );
2073
+ return;
2074
+ }
2075
+ }
2076
+
2077
+ TY_(ReportError)(doc, list, node, DISCARDING_UNEXPECTED);
2078
+ TY_(FreeNode)( doc, node);
2079
+ continue;
2080
+ }
2081
+
2082
+ if ( !nodeIsLI(node) )
2083
+ {
2084
+ TY_(UngetToken)( doc );
2085
+
2086
+ if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
2087
+ {
2088
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_BEFORE);
2089
+ return;
2090
+ }
2091
+ /* http://tidy.sf.net/issue/1316307 */
2092
+ /* In exiled mode, return so table processing can continue. */
2093
+ else if ( lexer->exiled
2094
+ && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
2095
+ || nodeIsTABLE(node)) )
2096
+ return;
2097
+
2098
+ /* http://tidy.sf.net/issue/836462
2099
+ If "list" is an ordered list (nodeIsOL), insert the next tag within
2100
+ the last <li> to preserve the numbering to match the visual
2101
+ rendering of most browsers. */
2102
+ if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
2103
+ {
2104
+ /* Create a node for error reporting */
2105
+ node = TY_(InferredTag)(doc, TidyTag_LI);
2106
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2107
+ TY_(FreeNode)( doc, node);
2108
+ node = lastli;
2109
+ }
2110
+ else
2111
+ {
2112
+ /* Add an inferred <li> */
2113
+ wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
2114
+ node = TY_(InferredTag)(doc, TidyTag_LI);
2115
+ /* Add "display: inline" to avoid a blank line after <li> with
2116
+ Internet Explorer. See http://tidy.sf.net/issue/836462 */
2117
+ TY_(AddStyleProperty)( doc, node,
2118
+ wasblock
2119
+ ? "list-style: none; display: inline"
2120
+ : "list-style: none"
2121
+ );
2122
+ TY_(ReportError)(doc, list, node, MISSING_STARTTAG );
2123
+ TY_(InsertNodeAtEnd)(list,node);
2124
+ }
2125
+ }
2126
+ else
2127
+ /* node is <LI> */
2128
+ TY_(InsertNodeAtEnd)(list,node);
2129
+
2130
+ ParseTag( doc, node, IgnoreWhitespace);
2131
+ }
2132
+ /* the while loop only ends when GetToken returned NULL, so node is NULL here */
2133
+ TY_(ReportError)(doc, list, node, MISSING_ENDTAG_FOR);
2134
+ }
2135
+
2136
+ /*
2137
+ unexpected content in table row is moved to just before
2138
+ the table in accordance with Netscape and IE. This code
2139
+ assumes that node hasn't been inserted into the row.
2140
+ */
2141
+ static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
2142
+ Node *node )
2143
+ {
2144
+ Node *table;
2145
+
2146
+ /* first find the table element */
2147
+ for (table = row->parent; table; table = table->parent)
2148
+ {
2149
+ if ( nodeIsTABLE(table) )
2150
+ {
2151
+ TY_(InsertNodeBeforeElement)( table, node );
2152
+ return;
2153
+ }
2154
+ }
2155
+ /* No table element */
2156
+ TY_(InsertNodeBeforeElement)( row->parent, node );
2157
+ }
2158
+
2159
+ /*
2160
+ if a table row is empty then insert an empty cell
2161
+ this practice is consistent with browser behavior
2162
+ and avoids potential problems with row spanning cells
2163
+ */
2164
+ static void FixEmptyRow(TidyDocImpl* doc, Node *row)
2165
+ {
2166
+ Node *cell;
2167
+
2168
+ if (row->content == NULL)
2169
+ {
2170
+ cell = TY_(InferredTag)(doc, TidyTag_TD);
2171
+ TY_(InsertNodeAtEnd)(row, cell);
2172
+ TY_(ReportError)(doc, row, cell, MISSING_STARTTAG);
2173
+ }
2174
+ }
2175
+
2176
+ void TY_(ParseRow)(TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode))
2177
+ {
2178
+ Lexer* lexer = doc->lexer;
2179
+ Node *node;
2180
+ Bool exclude_state;
2181
+
2182
+ if (row->tag->model & CM_EMPTY)
2183
+ return;
2184
+
2185
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2186
+ {
2187
+ if (node->tag == row->tag)
2188
+ {
2189
+ if (node->type == EndTag)
2190
+ {
2191
+ TY_(FreeNode)( doc, node);
2192
+ row->closed = yes;
2193
+ FixEmptyRow( doc, row);
2194
+ return;
2195
+ }
2196
+
2197
+ /* New row start implies end of current row */
2198
+ TY_(UngetToken)( doc );
2199
+ FixEmptyRow( doc, row);
2200
+ return;
2201
+ }
2202
+
2203
+ /*
2204
+ if this is the end tag for an ancestor element
2205
+ then infer end tag for this element
2206
+ */
2207
+ if ( node->type == EndTag )
2208
+ {
2209
+ if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
2210
+ && DescendantOf(row, TagId(node)) )
2211
+ {
2212
+ TY_(UngetToken)( doc );
2213
+ return;
2214
+ }
2215
+
2216
+ if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2217
+ {
2218
+ if ( nodeIsFORM(node) )
2219
+ BadForm( doc );
2220
+
2221
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2222
+ TY_(FreeNode)( doc, node);
2223
+ continue;
2224
+ }
2225
+
2226
+ if ( nodeIsTD(node) || nodeIsTH(node) )
2227
+ {
2228
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2229
+ TY_(FreeNode)( doc, node);
2230
+ continue;
2231
+ }
2232
+ }
2233
+
2234
+ /* deal with comments etc. */
2235
+ if (InsertMisc(row, node))
2236
+ continue;
2237
+
2238
+ /* discard unknown tags */
2239
+ if (node->tag == NULL && node->type != TextNode)
2240
+ {
2241
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2242
+ TY_(FreeNode)( doc, node);
2243
+ continue;
2244
+ }
2245
+
2246
+ /* discard unexpected <table> element */
2247
+ if ( nodeIsTABLE(node) )
2248
+ {
2249
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2250
+ TY_(FreeNode)( doc, node);
2251
+ continue;
2252
+ }
2253
+
2254
+ /* THEAD, TFOOT or TBODY */
2255
+ if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
2256
+ {
2257
+ TY_(UngetToken)( doc );
2258
+ return;
2259
+ }
2260
+
2261
+ if (node->type == EndTag)
2262
+ {
2263
+ TY_(ReportError)(doc, row, node, DISCARDING_UNEXPECTED);
2264
+ TY_(FreeNode)( doc, node);
2265
+ continue;
2266
+ }
2267
+
2268
+ /*
2269
+ if text or inline or block move before table
2270
+ if head content move to head
2271
+ */
2272
+
2273
+ if (node->type != EndTag)
2274
+ {
2275
+ if ( nodeIsFORM(node) )
2276
+ {
2277
+ TY_(UngetToken)( doc );
2278
+ node = TY_(InferredTag)(doc, TidyTag_TD);
2279
+ TY_(ReportError)(doc, row, node, MISSING_STARTTAG);
2280
+ }
2281
+ else if ( TY_(nodeIsText)(node)
2282
+ || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
2283
+ {
2284
+ MoveBeforeTable( doc, row, node );
2285
+ TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2286
+ lexer->exiled = yes;
2287
+ exclude_state = lexer->excludeBlocks;
2288
+ lexer->excludeBlocks = no;
2289
+
2290
+ if (node->type != TextNode)
2291
+ ParseTag( doc, node, IgnoreWhitespace);
2292
+
2293
+ lexer->exiled = no;
2294
+ lexer->excludeBlocks = exclude_state;
2295
+ continue;
2296
+ }
2297
+ else if (node->tag->model & CM_HEAD)
2298
+ {
2299
+ TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2300
+ MoveToHead( doc, row, node);
2301
+ continue;
2302
+ }
2303
+ }
2304
+
2305
+ if ( !(nodeIsTD(node) || nodeIsTH(node)) )
2306
+ {
2307
+ TY_(ReportError)(doc, row, node, TAG_NOT_ALLOWED_IN);
2308
+ TY_(FreeNode)( doc, node);
2309
+ continue;
2310
+ }
2311
+
2312
+ /* node should be <TD> or <TH> */
2313
+ TY_(InsertNodeAtEnd)(row, node);
2314
+ exclude_state = lexer->excludeBlocks;
2315
+ lexer->excludeBlocks = no;
2316
+ ParseTag( doc, node, IgnoreWhitespace);
2317
+ lexer->excludeBlocks = exclude_state;
2318
+
2319
+ /* pop inline stack */
2320
+
2321
+ while ( lexer->istacksize > lexer->istackbase )
2322
+ TY_(PopInline)( doc, NULL );
2323
+ }
2324
+
2325
+ }
2326
+
2327
+ void TY_(ParseRowGroup)(TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode))
2328
+ {
2329
+ Lexer* lexer = doc->lexer;
2330
+ Node *node, *parent;
2331
+
2332
+ if (rowgroup->tag->model & CM_EMPTY)
2333
+ return;
2334
+
2335
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2336
+ {
2337
+ if (node->tag == rowgroup->tag)
2338
+ {
2339
+ if (node->type == EndTag)
2340
+ {
2341
+ rowgroup->closed = yes;
2342
+ TY_(FreeNode)( doc, node);
2343
+ return;
2344
+ }
2345
+
2346
+ TY_(UngetToken)( doc );
2347
+ return;
2348
+ }
2349
+
2350
+ /* if </table> infer end tag */
2351
+ if ( nodeIsTABLE(node) && node->type == EndTag )
2352
+ {
2353
+ TY_(UngetToken)( doc );
2354
+ return;
2355
+ }
2356
+
2357
+ /* deal with comments etc. */
2358
+ if (InsertMisc(rowgroup, node))
2359
+ continue;
2360
+
2361
+ /* discard unknown tags */
2362
+ if (node->tag == NULL && node->type != TextNode)
2363
+ {
2364
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2365
+ TY_(FreeNode)( doc, node);
2366
+ continue;
2367
+ }
2368
+
2369
+ /*
2370
+ if TD or TH then infer <TR>
2371
+ if text or inline or block move before table
2372
+ if head content move to head
2373
+ */
2374
+
2375
+ if (node->type != EndTag)
2376
+ {
2377
+ if ( nodeIsTD(node) || nodeIsTH(node) )
2378
+ {
2379
+ TY_(UngetToken)( doc );
2380
+ node = TY_(InferredTag)(doc, TidyTag_TR);
2381
+ TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2382
+ }
2383
+ else if ( TY_(nodeIsText)(node)
2384
+ || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2385
+ {
2386
+ MoveBeforeTable( doc, rowgroup, node );
2387
+ TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2388
+ lexer->exiled = yes;
2389
+
2390
+ if (node->type != TextNode)
2391
+ ParseTag(doc, node, IgnoreWhitespace);
2392
+
2393
+ lexer->exiled = no;
2394
+ continue;
2395
+ }
2396
+ else if (node->tag->model & CM_HEAD)
2397
+ {
2398
+ TY_(ReportError)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
2399
+ MoveToHead(doc, rowgroup, node);
2400
+ continue;
2401
+ }
2402
+ }
2403
+
2404
+ /*
2405
+ if this is the end tag for ancestor element
2406
+ then infer end tag for this element
2407
+ */
2408
+ if (node->type == EndTag)
2409
+ {
2410
+ if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2411
+ {
2412
+ if ( nodeIsFORM(node) )
2413
+ BadForm( doc );
2414
+
2415
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2416
+ TY_(FreeNode)( doc, node);
2417
+ continue;
2418
+ }
2419
+
2420
+ if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
2421
+ {
2422
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2423
+ TY_(FreeNode)( doc, node);
2424
+ continue;
2425
+ }
2426
+
2427
+ for ( parent = rowgroup->parent;
2428
+ parent != NULL;
2429
+ parent = parent->parent )
2430
+ {
2431
+ if (node->tag == parent->tag)
2432
+ {
2433
+ TY_(UngetToken)( doc );
2434
+ return;
2435
+ }
2436
+ }
2437
+ }
2438
+
2439
+ /*
2440
+ if THEAD, TFOOT or TBODY then implied end tag
2441
+
2442
+ */
2443
+ if (node->tag->model & CM_ROWGRP)
2444
+ {
2445
+ if (node->type != EndTag)
2446
+ {
2447
+ TY_(UngetToken)( doc );
2448
+ return;
2449
+ }
2450
+ }
2451
+
2452
+ if (node->type == EndTag)
2453
+ {
2454
+ TY_(ReportError)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
2455
+ TY_(FreeNode)( doc, node);
2456
+ continue;
2457
+ }
2458
+
2459
+ if ( !nodeIsTR(node) )
2460
+ {
2461
+ node = TY_(InferredTag)(doc, TidyTag_TR);
2462
+ TY_(ReportError)(doc, rowgroup, node, MISSING_STARTTAG);
2463
+ TY_(UngetToken)( doc );
2464
+ }
2465
+
2466
+ /* node should be <TR> */
2467
+ TY_(InsertNodeAtEnd)(rowgroup, node);
2468
+ ParseTag(doc, node, IgnoreWhitespace);
2469
+ }
2470
+
2471
+ }
2472
+
2473
+ void TY_(ParseColGroup)(TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode))
2474
+ {
2475
+ Node *node, *parent;
2476
+
2477
+ if (colgroup->tag->model & CM_EMPTY)
2478
+ return;
2479
+
2480
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2481
+ {
2482
+ if (node->tag == colgroup->tag && node->type == EndTag)
2483
+ {
2484
+ TY_(FreeNode)( doc, node);
2485
+ colgroup->closed = yes;
2486
+ return;
2487
+ }
2488
+
2489
+ /*
2490
+ if this is the end tag for an ancestor element
2491
+ then infer end tag for this element
2492
+ */
2493
+ if (node->type == EndTag)
2494
+ {
2495
+ if ( nodeIsFORM(node) )
2496
+ {
2497
+ BadForm( doc );
2498
+ TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2499
+ TY_(FreeNode)( doc, node);
2500
+ continue;
2501
+ }
2502
+
2503
+ for ( parent = colgroup->parent;
2504
+ parent != NULL;
2505
+ parent = parent->parent )
2506
+ {
2507
+ if (node->tag == parent->tag)
2508
+ {
2509
+ TY_(UngetToken)( doc );
2510
+ return;
2511
+ }
2512
+ }
2513
+ }
2514
+
2515
+ if (TY_(nodeIsText)(node))
2516
+ {
2517
+ TY_(UngetToken)( doc );
2518
+ return;
2519
+ }
2520
+
2521
+ /* deal with comments etc. */
2522
+ if (InsertMisc(colgroup, node))
2523
+ continue;
2524
+
2525
+ /* discard unknown tags */
2526
+ if (node->tag == NULL)
2527
+ {
2528
+ TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2529
+ TY_(FreeNode)( doc, node);
2530
+ continue;
2531
+ }
2532
+
2533
+ if ( !nodeIsCOL(node) )
2534
+ {
2535
+ TY_(UngetToken)( doc );
2536
+ return;
2537
+ }
2538
+
2539
+ if (node->type == EndTag)
2540
+ {
2541
+ TY_(ReportError)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2542
+ TY_(FreeNode)( doc, node);
2543
+ continue;
2544
+ }
2545
+
2546
+ /* node should be <COL> */
2547
+ TY_(InsertNodeAtEnd)(colgroup, node);
2548
+ ParseTag(doc, node, IgnoreWhitespace);
2549
+ }
2550
+ }
2551
+
2552
+ void TY_(ParseTableTag)(TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode))
2553
+ {
2554
+ Lexer* lexer = doc->lexer;
2555
+ Node *node, *parent;
2556
+ uint istackbase;
2557
+
2558
+ TY_(DeferDup)( doc );
2559
+ istackbase = lexer->istackbase;
2560
+ lexer->istackbase = lexer->istacksize;
2561
+
2562
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2563
+ {
2564
+ if (node->tag == table->tag && node->type == EndTag)
2565
+ {
2566
+ TY_(FreeNode)( doc, node);
2567
+ lexer->istackbase = istackbase;
2568
+ table->closed = yes;
2569
+ return;
2570
+ }
2571
+
2572
+ /* deal with comments etc. */
2573
+ if (InsertMisc(table, node))
2574
+ continue;
2575
+
2576
+ /* discard unknown tags */
2577
+ if (node->tag == NULL && node->type != TextNode)
2578
+ {
2579
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2580
+ TY_(FreeNode)( doc, node);
2581
+ continue;
2582
+ }
2583
+
2584
+ /* if TD or TH or text or inline or block then infer <TR> */
2585
+
2586
+ if (node->type != EndTag)
2587
+ {
2588
+ if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
2589
+ {
2590
+ TY_(UngetToken)( doc );
2591
+ node = TY_(InferredTag)(doc, TidyTag_TR);
2592
+ TY_(ReportError)(doc, table, node, MISSING_STARTTAG);
2593
+ }
2594
+ else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
2595
+ {
2596
+ TY_(InsertNodeBeforeElement)(table, node);
2597
+ TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2598
+ lexer->exiled = yes;
2599
+
2600
+ if (node->type != TextNode)
2601
+ ParseTag(doc, node, IgnoreWhitespace);
2602
+
2603
+ lexer->exiled = no;
2604
+ continue;
2605
+ }
2606
+ else if (node->tag->model & CM_HEAD)
2607
+ {
2608
+ MoveToHead(doc, table, node);
2609
+ continue;
2610
+ }
2611
+ }
2612
+
2613
+ /*
2614
+ if this is the end tag for an ancestor element
2615
+ then infer end tag for this element
2616
+ */
2617
+ if (node->type == EndTag)
2618
+ {
2619
+ if ( nodeIsFORM(node) )
2620
+ {
2621
+ BadForm( doc );
2622
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2623
+ TY_(FreeNode)( doc, node);
2624
+ continue;
2625
+ }
2626
+
2627
+ /* best to discard unexpected block/inline end tags */
2628
+ if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
2629
+ TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
2630
+ {
2631
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2632
+ TY_(FreeNode)( doc, node);
2633
+ continue;
2634
+ }
2635
+
2636
+ for ( parent = table->parent;
2637
+ parent != NULL;
2638
+ parent = parent->parent )
2639
+ {
2640
+ if (node->tag == parent->tag)
2641
+ {
2642
+ TY_(ReportError)(doc, table, node, MISSING_ENDTAG_BEFORE );
2643
+ TY_(UngetToken)( doc );
2644
+ lexer->istackbase = istackbase;
2645
+ return;
2646
+ }
2647
+ }
2648
+ }
2649
+
2650
+ if (!(node->tag->model & CM_TABLE))
2651
+ {
2652
+ TY_(UngetToken)( doc );
2653
+ TY_(ReportError)(doc, table, node, TAG_NOT_ALLOWED_IN);
2654
+ lexer->istackbase = istackbase;
2655
+ return;
2656
+ }
2657
+
2658
+ if (TY_(nodeIsElement)(node))
2659
+ {
2660
+ TY_(InsertNodeAtEnd)(table, node);
2661
+ ParseTag(doc, node, IgnoreWhitespace);
2662
+ continue;
2663
+ }
2664
+
2665
+ /* discard unexpected text nodes and end tags */
2666
+ TY_(ReportError)(doc, table, node, DISCARDING_UNEXPECTED);
2667
+ TY_(FreeNode)( doc, node);
2668
+ }
2669
+
2670
+ TY_(ReportError)(doc, table, node, MISSING_ENDTAG_FOR);
2671
+ lexer->istackbase = istackbase;
2672
+ }
2673
+
2674
+ /* acceptable content for pre elements */
2675
+ static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
2676
+ {
2677
+ /* p is coerced to br's, Text OK too */
2678
+ if ( nodeIsP(node) || TY_(nodeIsText)(node) )
2679
+ return yes;
2680
+
2681
+ if ( node->tag == NULL ||
2682
+ nodeIsPARAM(node) ||
2683
+ !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
2684
+ return no;
2685
+
2686
+ return yes;
2687
+ }
2688
+
2689
+ void TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
2690
+ {
2691
+ Node *node;
2692
+
2693
+ if (pre->tag->model & CM_EMPTY)
2694
+ return;
2695
+
2696
+ TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
2697
+
2698
+ while ((node = TY_(GetToken)(doc, Preformatted)) != NULL)
2699
+ {
2700
+ if ( node->type == EndTag &&
2701
+ (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
2702
+ {
2703
+ if (nodeIsBODY(node) || nodeIsHTML(node))
2704
+ {
2705
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2706
+ TY_(FreeNode)(doc, node);
2707
+ continue;
2708
+ }
2709
+ if (node->tag == pre->tag)
2710
+ {
2711
+ TY_(FreeNode)(doc, node);
2712
+ }
2713
+ else
2714
+ {
2715
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE );
2716
+ TY_(UngetToken)( doc );
2717
+ }
2718
+ pre->closed = yes;
2719
+ TrimSpaces(doc, pre);
2720
+ return;
2721
+ }
2722
+
2723
+ if (TY_(nodeIsText)(node))
2724
+ {
2725
+ TY_(InsertNodeAtEnd)(pre, node);
2726
+ continue;
2727
+ }
2728
+
2729
+ /* deal with comments etc. */
2730
+ if (InsertMisc(pre, node))
2731
+ continue;
2732
+
2733
+ if (node->tag == NULL)
2734
+ {
2735
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2736
+ TY_(FreeNode)(doc, node);
2737
+ continue;
2738
+ }
2739
+
2740
+ /* strip unexpected tags */
2741
+ if ( !PreContent(doc, node) )
2742
+ {
2743
+ Node *newnode;
2744
+
2745
+ /* fix for http://tidy.sf.net/bug/772205 */
2746
+ if (node->type == EndTag)
2747
+ {
2748
+ /* http://tidy.sf.net/issue/1590220 */
2749
+ if ( doc->lexer->exiled
2750
+ && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
2751
+ {
2752
+ TY_(UngetToken)(doc);
2753
+ TrimSpaces(doc, pre);
2754
+ return;
2755
+ }
2756
+
2757
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2758
+ TY_(FreeNode)(doc, node);
2759
+ continue;
2760
+ }
2761
+ /* http://tidy.sf.net/issue/1590220 */
2762
+ else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
2763
+ || nodeIsTABLE(node) )
2764
+ {
2765
+ if (!doc->lexer->exiled)
2766
+ /* No missing close warning if exiled. */
2767
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2768
+
2769
+ TY_(UngetToken)(doc);
2770
+ return;
2771
+ }
2772
+
2773
+ /*
2774
+ This is basically what Tidy 04 August 2000 did and far more accurate
2775
+ with respect to browser behaivour than the code commented out above.
2776
+ Tidy could try to propagate the <pre> into each disallowed child where
2777
+ <pre> is allowed in order to replicate some browsers behaivour, but
2778
+ there are a lot of exceptions, e.g. Internet Explorer does not propagate
2779
+ <pre> into table cells while Mozilla does. Opera 6 never propagates
2780
+ <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
2781
+
2782
+ Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
2783
+ which are handled like Mozilla takes them (Opera6 closes all <pre> after
2784
+ the first </pre>).
2785
+
2786
+ There are similar issues like replacing <p> in <pre> with <br>, for
2787
+ example
2788
+
2789
+ <pre>...<p>...</pre> (Input)
2790
+ <pre>...<br>...</pre> (Tidy)
2791
+ <pre>...<br>...</pre> (Opera 7 and Internet Explorer)
2792
+ <pre>...<br><br>...</pre> (Opera 6 and Mozilla)
2793
+
2794
+ <pre>...<p>...</p>...</pre> (Input)
2795
+ <pre>...<br>......</pre> (Tidy, BUG!)
2796
+ <pre>...<br>...<br>...</pre> (Internet Explorer)
2797
+ <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
2798
+ <pre>...<br>...<br><br>...</pre> (Opera 7)
2799
+
2800
+ or something similar, they could also be closing the <pre> and propagate
2801
+ the <pre> into the newly opened <p>.
2802
+
2803
+ Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
2804
+ dissallowed in <pre>, Tidy neither detects this nor does it perform any
2805
+ cleanup operation. Tidy should at least issue a warning if it encounters
2806
+ such constructs.
2807
+
2808
+ Todo: discarding </p> is abviously a bug, it should be replaced by <br>.
2809
+ */
2810
+ TY_(InsertNodeAfterElement)(pre, node);
2811
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_BEFORE);
2812
+ ParseTag(doc, node, IgnoreWhitespace);
2813
+
2814
+ newnode = TY_(InferredTag)(doc, TidyTag_PRE);
2815
+ TY_(ReportError)(doc, pre, newnode, INSERTING_TAG);
2816
+ pre = newnode;
2817
+ TY_(InsertNodeAfterElement)(node, pre);
2818
+
2819
+ continue;
2820
+ }
2821
+
2822
+ if ( nodeIsP(node) )
2823
+ {
2824
+ if (node->type == StartTag)
2825
+ {
2826
+ TY_(ReportError)(doc, pre, node, USING_BR_INPLACE_OF);
2827
+
2828
+ /* trim white space before <p> in <pre>*/
2829
+ TrimSpaces(doc, pre);
2830
+
2831
+ /* coerce both <p> and </p> to <br> */
2832
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
2833
+ TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
2834
+ TY_(InsertNodeAtEnd)( pre, node );
2835
+ }
2836
+ else
2837
+ {
2838
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2839
+ TY_(FreeNode)( doc, node);
2840
+ }
2841
+ continue;
2842
+ }
2843
+
2844
+ if ( TY_(nodeIsElement)(node) )
2845
+ {
2846
+ /* trim white space before <br> */
2847
+ if ( nodeIsBR(node) )
2848
+ TrimSpaces(doc, pre);
2849
+
2850
+ TY_(InsertNodeAtEnd)(pre, node);
2851
+ ParseTag(doc, node, Preformatted);
2852
+ continue;
2853
+ }
2854
+
2855
+ /* discard unexpected tags */
2856
+ TY_(ReportError)(doc, pre, node, DISCARDING_UNEXPECTED);
2857
+ TY_(FreeNode)( doc, node);
2858
+ }
2859
+
2860
+ TY_(ReportError)(doc, pre, node, MISSING_ENDTAG_FOR);
2861
+ }
2862
+
2863
+ void TY_(ParseOptGroup)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2864
+ {
2865
+ Lexer* lexer = doc->lexer;
2866
+ Node *node;
2867
+
2868
+ lexer->insert = NULL; /* defer implicit inline start tags */
2869
+
2870
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2871
+ {
2872
+ if (node->tag == field->tag && node->type == EndTag)
2873
+ {
2874
+ TY_(FreeNode)( doc, node);
2875
+ field->closed = yes;
2876
+ TrimSpaces(doc, field);
2877
+ return;
2878
+ }
2879
+
2880
+ /* deal with comments etc. */
2881
+ if (InsertMisc(field, node))
2882
+ continue;
2883
+
2884
+ if ( node->type == StartTag &&
2885
+ (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
2886
+ {
2887
+ if ( nodeIsOPTGROUP(node) )
2888
+ TY_(ReportError)(doc, field, node, CANT_BE_NESTED);
2889
+
2890
+ TY_(InsertNodeAtEnd)(field, node);
2891
+ ParseTag(doc, node, MixedContent);
2892
+ continue;
2893
+ }
2894
+
2895
+ /* discard unexpected tags */
2896
+ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED );
2897
+ TY_(FreeNode)( doc, node);
2898
+ }
2899
+ }
2900
+
2901
+
2902
+ void TY_(ParseSelect)(TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode))
2903
+ {
2904
+ Lexer* lexer = doc->lexer;
2905
+ Node *node;
2906
+
2907
+ lexer->insert = NULL; /* defer implicit inline start tags */
2908
+
2909
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2910
+ {
2911
+ if (node->tag == field->tag && node->type == EndTag)
2912
+ {
2913
+ TY_(FreeNode)( doc, node);
2914
+ field->closed = yes;
2915
+ TrimSpaces(doc, field);
2916
+ return;
2917
+ }
2918
+
2919
+ /* deal with comments etc. */
2920
+ if (InsertMisc(field, node))
2921
+ continue;
2922
+
2923
+ if ( node->type == StartTag &&
2924
+ ( nodeIsOPTION(node) ||
2925
+ nodeIsOPTGROUP(node) ||
2926
+ nodeIsSCRIPT(node))
2927
+ )
2928
+ {
2929
+ TY_(InsertNodeAtEnd)(field, node);
2930
+ ParseTag(doc, node, IgnoreWhitespace);
2931
+ continue;
2932
+ }
2933
+
2934
+ /* discard unexpected tags */
2935
+ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2936
+ TY_(FreeNode)( doc, node);
2937
+ }
2938
+
2939
+ TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
2940
+ }
2941
+
2942
+ void TY_(ParseText)(TidyDocImpl* doc, Node *field, GetTokenMode mode)
2943
+ {
2944
+ Lexer* lexer = doc->lexer;
2945
+ Node *node;
2946
+
2947
+ lexer->insert = NULL; /* defer implicit inline start tags */
2948
+
2949
+ if ( nodeIsTEXTAREA(field) )
2950
+ mode = Preformatted;
2951
+ else
2952
+ mode = MixedContent; /* kludge for font tags */
2953
+
2954
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
2955
+ {
2956
+ if (node->tag == field->tag && node->type == EndTag)
2957
+ {
2958
+ TY_(FreeNode)( doc, node);
2959
+ field->closed = yes;
2960
+ TrimSpaces(doc, field);
2961
+ return;
2962
+ }
2963
+
2964
+ /* deal with comments etc. */
2965
+ if (InsertMisc(field, node))
2966
+ continue;
2967
+
2968
+ if (TY_(nodeIsText)(node))
2969
+ {
2970
+ /* only called for 1st child */
2971
+ if (field->content == NULL && !(mode & Preformatted))
2972
+ TrimSpaces(doc, field);
2973
+
2974
+ if (node->start >= node->end)
2975
+ {
2976
+ TY_(FreeNode)( doc, node);
2977
+ continue;
2978
+ }
2979
+
2980
+ TY_(InsertNodeAtEnd)(field, node);
2981
+ continue;
2982
+ }
2983
+
2984
+ /* for textarea should all cases of < and & be escaped? */
2985
+
2986
+ /* discard inline tags e.g. font */
2987
+ if ( node->tag
2988
+ && node->tag->model & CM_INLINE
2989
+ && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
2990
+ {
2991
+ TY_(ReportError)(doc, field, node, DISCARDING_UNEXPECTED);
2992
+ TY_(FreeNode)( doc, node);
2993
+ continue;
2994
+ }
2995
+
2996
+ /* terminate element on other tags */
2997
+ if (!(field->tag->model & CM_OPT))
2998
+ TY_(ReportError)(doc, field, node, MISSING_ENDTAG_BEFORE);
2999
+
3000
+ TY_(UngetToken)( doc );
3001
+ TrimSpaces(doc, field);
3002
+ return;
3003
+ }
3004
+
3005
+ if (!(field->tag->model & CM_OPT))
3006
+ TY_(ReportError)(doc, field, node, MISSING_ENDTAG_FOR);
3007
+ }
3008
+
3009
+
3010
+ void TY_(ParseTitle)(TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode))
3011
+ {
3012
+ Node *node;
3013
+ while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
3014
+ {
3015
+ if (node->tag == title->tag && node->type == StartTag)
3016
+ {
3017
+ TY_(ReportError)(doc, title, node, COERCE_TO_ENDTAG);
3018
+ node->type = EndTag;
3019
+ TY_(UngetToken)( doc );
3020
+ continue;
3021
+ }
3022
+ else if (node->tag == title->tag && node->type == EndTag)
3023
+ {
3024
+ TY_(FreeNode)( doc, node);
3025
+ title->closed = yes;
3026
+ TrimSpaces(doc, title);
3027
+ return;
3028
+ }
3029
+
3030
+ if (TY_(nodeIsText)(node))
3031
+ {
3032
+ /* only called for 1st child */
3033
+ if (title->content == NULL)
3034
+ TrimInitialSpace(doc, title, node);
3035
+
3036
+ if (node->start >= node->end)
3037
+ {
3038
+ TY_(FreeNode)( doc, node);
3039
+ continue;
3040
+ }
3041
+
3042
+ TY_(InsertNodeAtEnd)(title, node);
3043
+ continue;
3044
+ }
3045
+
3046
+ /* deal with comments etc. */
3047
+ if (InsertMisc(title, node))
3048
+ continue;
3049
+
3050
+ /* discard unknown tags */
3051
+ if (node->tag == NULL)
3052
+ {
3053
+ TY_(ReportError)(doc, title, node, DISCARDING_UNEXPECTED);
3054
+ TY_(FreeNode)( doc, node);
3055
+ continue;
3056
+ }
3057
+
3058
+ /* pushback unexpected tokens */
3059
+ TY_(ReportError)(doc, title, node, MISSING_ENDTAG_BEFORE);
3060
+ TY_(UngetToken)( doc );
3061
+ TrimSpaces(doc, title);
3062
+ return;
3063
+ }
3064
+
3065
+ TY_(ReportError)(doc, title, node, MISSING_ENDTAG_FOR);
3066
+ }
3067
+
3068
+ /*
3069
+ This isn't quite right for CDATA content as it recognises
3070
+ tags within the content and parses them accordingly.
3071
+ This will unfortunately screw up scripts which include
3072
+ < + letter, < + !, < + ? or < + / + letter
3073
+ */
3074
+
3075
+ void TY_(ParseScript)(TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode))
3076
+ {
3077
+ Node *node;
3078
+
3079
+ doc->lexer->parent = script;
3080
+ node = TY_(GetToken)(doc, CdataContent);
3081
+ doc->lexer->parent = NULL;
3082
+
3083
+ if (node)
3084
+ {
3085
+ TY_(InsertNodeAtEnd)(script, node);
3086
+ }
3087
+ else
3088
+ {
3089
+ /* handle e.g. a document like "<script>" */
3090
+ TY_(ReportError)(doc, script, NULL, MISSING_ENDTAG_FOR);
3091
+ return;
3092
+ }
3093
+
3094
+ node = TY_(GetToken)(doc, IgnoreWhitespace);
3095
+
3096
+ if (!(node && node->type == EndTag && node->tag &&
3097
+ node->tag->id == script->tag->id))
3098
+ {
3099
+ TY_(ReportError)(doc, script, node, MISSING_ENDTAG_FOR);
3100
+
3101
+ if (node)
3102
+ TY_(UngetToken)(doc);
3103
+ }
3104
+ else
3105
+ {
3106
+ TY_(FreeNode)(doc, node);
3107
+ }
3108
+ }
3109
+
3110
+ Bool TY_(IsJavaScript)(Node *node)
3111
+ {
3112
+ Bool result = no;
3113
+ AttVal *attr;
3114
+
3115
+ if (node->attributes == NULL)
3116
+ return yes;
3117
+
3118
+ for (attr = node->attributes; attr; attr = attr->next)
3119
+ {
3120
+ if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
3121
+ && AttrContains(attr, "javascript") )
3122
+ {
3123
+ result = yes;
3124
+ break;
3125
+ }
3126
+ }
3127
+
3128
+ return result;
3129
+ }
3130
+
3131
+ void TY_(ParseHead)(TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode))
3132
+ {
3133
+ Lexer* lexer = doc->lexer;
3134
+ Node *node;
3135
+ int HasTitle = 0;
3136
+ int HasBase = 0;
3137
+
3138
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3139
+ {
3140
+ if (node->tag == head->tag && node->type == EndTag)
3141
+ {
3142
+ TY_(FreeNode)( doc, node);
3143
+ head->closed = yes;
3144
+ break;
3145
+ }
3146
+
3147
+ /* find and discard multiple <head> elements */
3148
+ /* find and discard <html> in <head> elements */
3149
+ if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
3150
+ {
3151
+ TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3152
+ TY_(FreeNode)(doc, node);
3153
+ continue;
3154
+ }
3155
+
3156
+ if (TY_(nodeIsText)(node))
3157
+ {
3158
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3159
+ TY_(UngetToken)( doc );
3160
+ break;
3161
+ }
3162
+
3163
+ if (node->type == ProcInsTag && node->element &&
3164
+ TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
3165
+ {
3166
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3167
+ TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
3168
+ continue;
3169
+ }
3170
+
3171
+ /* deal with comments etc. */
3172
+ if (InsertMisc(head, node))
3173
+ continue;
3174
+
3175
+ if (node->type == DocTypeTag)
3176
+ {
3177
+ InsertDocType(doc, head, node);
3178
+ continue;
3179
+ }
3180
+
3181
+ /* discard unknown tags */
3182
+ if (node->tag == NULL)
3183
+ {
3184
+ TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3185
+ TY_(FreeNode)( doc, node);
3186
+ continue;
3187
+ }
3188
+
3189
+ /*
3190
+ if it doesn't belong in the head then
3191
+ treat as implicit end of head and deal
3192
+ with as part of the body
3193
+ */
3194
+ if (!(node->tag->model & CM_HEAD))
3195
+ {
3196
+ /* #545067 Implicit closing of head broken - warn only for XHTML input */
3197
+ if ( lexer->isvoyager )
3198
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN );
3199
+ TY_(UngetToken)( doc );
3200
+ break;
3201
+ }
3202
+
3203
+ if (TY_(nodeIsElement)(node))
3204
+ {
3205
+ if ( nodeIsTITLE(node) )
3206
+ {
3207
+ ++HasTitle;
3208
+
3209
+ if (HasTitle > 1)
3210
+ TY_(ReportError)(doc, head, node,
3211
+ head ?
3212
+ TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3213
+ }
3214
+ else if ( nodeIsBASE(node) )
3215
+ {
3216
+ ++HasBase;
3217
+
3218
+ if (HasBase > 1)
3219
+ TY_(ReportError)(doc, head, node,
3220
+ head ?
3221
+ TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
3222
+ }
3223
+ else if ( nodeIsNOSCRIPT(node) )
3224
+ {
3225
+ TY_(ReportError)(doc, head, node, TAG_NOT_ALLOWED_IN);
3226
+ }
3227
+
3228
+ #ifdef AUTO_INPUT_ENCODING
3229
+ else if (nodeIsMETA(node))
3230
+ {
3231
+ AttVal * httpEquiv = AttrGetById(node, TidyAttr_HTTP_EQUIV);
3232
+ AttVal * content = AttrGetById(node, TidyAttr_CONTENT);
3233
+ if (httpEquiv && AttrValueIs(httpEquiv, "Content-Type") && AttrHasValue(content))
3234
+ {
3235
+ tmbstr val, charset;
3236
+ uint end = 0;
3237
+ val = charset = TY_(tmbstrdup)(doc->allocator, content->value);
3238
+ val = TY_(tmbstrtolower)(val);
3239
+ val = strstr(content->value, "charset");
3240
+
3241
+ if (val)
3242
+ val += 7;
3243
+
3244
+ while(val && *val && (TY_(IsWhite)((tchar)*val) ||
3245
+ *val == '=' || *val == '"' || *val == '\''))
3246
+ ++val;
3247
+
3248
+ while(val && val[end] && !(TY_(IsWhite)((tchar)val[end]) ||
3249
+ val[end] == '"' || val[end] == '\'' || val[end] == ';'))
3250
+ ++end;
3251
+
3252
+ if (val && end)
3253
+ {
3254
+ tmbstr encoding = TY_(tmbstrndup)(doc->allocator,val, end);
3255
+ uint id = TY_(GetEncodingIdFromName)(encoding);
3256
+
3257
+ /* todo: detect mismatch with BOM/XMLDecl/declared */
3258
+ /* todo: error for unsupported encodings */
3259
+ /* todo: try to re-init transcoder */
3260
+ /* todo: change input/output encoding settings */
3261
+ /* todo: store id in StreamIn */
3262
+
3263
+ TidyDocFree(doc, encoding);
3264
+ }
3265
+
3266
+ TidyDocFree(doc, charset);
3267
+ }
3268
+ }
3269
+ #endif /* AUTO_INPUT_ENCODING */
3270
+
3271
+ TY_(InsertNodeAtEnd)(head, node);
3272
+ ParseTag(doc, node, IgnoreWhitespace);
3273
+ continue;
3274
+ }
3275
+
3276
+ /* discard unexpected text nodes and end tags */
3277
+ TY_(ReportError)(doc, head, node, DISCARDING_UNEXPECTED);
3278
+ TY_(FreeNode)( doc, node);
3279
+ }
3280
+ }
3281
+
3282
+ void TY_(ParseBody)(TidyDocImpl* doc, Node *body, GetTokenMode mode)
3283
+ {
3284
+ Lexer* lexer = doc->lexer;
3285
+ Node *node;
3286
+ Bool checkstack, iswhitenode;
3287
+
3288
+ mode = IgnoreWhitespace;
3289
+ checkstack = yes;
3290
+
3291
+ TY_(BumpObject)( doc, body->parent );
3292
+
3293
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
3294
+ {
3295
+ /* find and discard multiple <body> elements */
3296
+ if (node->tag == body->tag && node->type == StartTag)
3297
+ {
3298
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3299
+ TY_(FreeNode)(doc, node);
3300
+ continue;
3301
+ }
3302
+
3303
+ /* #538536 Extra endtags not detected */
3304
+ if ( nodeIsHTML(node) )
3305
+ {
3306
+ if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
3307
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3308
+ else
3309
+ lexer->seenEndHtml = 1;
3310
+
3311
+ TY_(FreeNode)( doc, node);
3312
+ continue;
3313
+ }
3314
+
3315
+ if ( lexer->seenEndBody &&
3316
+ ( node->type == StartTag ||
3317
+ node->type == EndTag ||
3318
+ node->type == StartEndTag ) )
3319
+ {
3320
+ TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY );
3321
+ }
3322
+
3323
+ if ( node->tag == body->tag && node->type == EndTag )
3324
+ {
3325
+ body->closed = yes;
3326
+ TrimSpaces(doc, body);
3327
+ TY_(FreeNode)( doc, node);
3328
+ lexer->seenEndBody = 1;
3329
+ mode = IgnoreWhitespace;
3330
+
3331
+ if ( nodeIsNOFRAMES(body->parent) )
3332
+ break;
3333
+
3334
+ continue;
3335
+ }
3336
+
3337
+ if ( nodeIsNOFRAMES(node) )
3338
+ {
3339
+ if (node->type == StartTag)
3340
+ {
3341
+ TY_(InsertNodeAtEnd)(body, node);
3342
+ TY_(ParseBlock)(doc, node, mode);
3343
+ continue;
3344
+ }
3345
+
3346
+ if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
3347
+ {
3348
+ TrimSpaces(doc, body);
3349
+ TY_(UngetToken)( doc );
3350
+ break;
3351
+ }
3352
+ }
3353
+
3354
+ if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
3355
+ && nodeIsNOFRAMES(body->parent) )
3356
+ {
3357
+ TrimSpaces(doc, body);
3358
+ TY_(UngetToken)( doc );
3359
+ break;
3360
+ }
3361
+
3362
+ iswhitenode = no;
3363
+
3364
+ if ( TY_(nodeIsText)(node) &&
3365
+ node->end <= node->start + 1 &&
3366
+ lexer->lexbuf[node->start] == ' ' )
3367
+ iswhitenode = yes;
3368
+
3369
+ /* deal with comments etc. */
3370
+ if (InsertMisc(body, node))
3371
+ continue;
3372
+
3373
+ /* #538536 Extra endtags not detected */
3374
+ #if 0
3375
+ if ( lexer->seenEndBody == 1 && !iswhitenode )
3376
+ {
3377
+ ++lexer->seenEndBody;
3378
+ TY_(ReportError)(doc, body, node, CONTENT_AFTER_BODY);
3379
+ }
3380
+ #endif
3381
+
3382
+ /* mixed content model permits text */
3383
+ if (TY_(nodeIsText)(node))
3384
+ {
3385
+ if (iswhitenode && mode == IgnoreWhitespace)
3386
+ {
3387
+ TY_(FreeNode)( doc, node);
3388
+ continue;
3389
+ }
3390
+
3391
+ /* HTML 2 and HTML4 strict don't allow text here */
3392
+ TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));
3393
+
3394
+ if (checkstack)
3395
+ {
3396
+ checkstack = no;
3397
+
3398
+ if ( TY_(InlineDup)(doc, node) > 0 )
3399
+ continue;
3400
+ }
3401
+
3402
+ TY_(InsertNodeAtEnd)(body, node);
3403
+ mode = MixedContent;
3404
+ continue;
3405
+ }
3406
+
3407
+ if (node->type == DocTypeTag)
3408
+ {
3409
+ InsertDocType(doc, body, node);
3410
+ continue;
3411
+ }
3412
+ /* discard unknown and PARAM tags */
3413
+ if ( node->tag == NULL || nodeIsPARAM(node) )
3414
+ {
3415
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3416
+ TY_(FreeNode)( doc, node);
3417
+ continue;
3418
+ }
3419
+
3420
+ /*
3421
+ Netscape allows LI and DD directly in BODY
3422
+ We infer UL or DL respectively and use this
3423
+ Bool to exclude block-level elements so as
3424
+ to match Netscape's observed behaviour.
3425
+ */
3426
+ lexer->excludeBlocks = no;
3427
+
3428
+ if ( nodeIsINPUT(node) ||
3429
+ (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
3430
+ )
3431
+ {
3432
+ /* avoid this error message being issued twice */
3433
+ if (!(node->tag->model & CM_HEAD))
3434
+ TY_(ReportError)(doc, body, node, TAG_NOT_ALLOWED_IN);
3435
+
3436
+ if (node->tag->model & CM_HTML)
3437
+ {
3438
+ /* copy body attributes if current body was inferred */
3439
+ if ( nodeIsBODY(node) && body->implicit
3440
+ && body->attributes == NULL )
3441
+ {
3442
+ body->attributes = node->attributes;
3443
+ node->attributes = NULL;
3444
+ }
3445
+
3446
+ TY_(FreeNode)( doc, node);
3447
+ continue;
3448
+ }
3449
+
3450
+ if (node->tag->model & CM_HEAD)
3451
+ {
3452
+ MoveToHead(doc, body, node);
3453
+ continue;
3454
+ }
3455
+
3456
+ if (node->tag->model & CM_LIST)
3457
+ {
3458
+ TY_(UngetToken)( doc );
3459
+ node = TY_(InferredTag)(doc, TidyTag_UL);
3460
+ AddClassNoIndent(doc, node);
3461
+ lexer->excludeBlocks = yes;
3462
+ }
3463
+ else if (node->tag->model & CM_DEFLIST)
3464
+ {
3465
+ TY_(UngetToken)( doc );
3466
+ node = TY_(InferredTag)(doc, TidyTag_DL);
3467
+ lexer->excludeBlocks = yes;
3468
+ }
3469
+ else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
3470
+ {
3471
+ /* http://tidy.sf.net/issue/2855621 */
3472
+ if (node->type != EndTag) {
3473
+ TY_(UngetToken)( doc );
3474
+ node = TY_(InferredTag)(doc, TidyTag_TABLE);
3475
+ }
3476
+ lexer->excludeBlocks = yes;
3477
+ }
3478
+ else if ( nodeIsINPUT(node) )
3479
+ {
3480
+ TY_(UngetToken)( doc );
3481
+ node = TY_(InferredTag)(doc, TidyTag_FORM);
3482
+ lexer->excludeBlocks = yes;
3483
+ }
3484
+ else
3485
+ {
3486
+ if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
3487
+ {
3488
+ TY_(UngetToken)( doc );
3489
+ return;
3490
+ }
3491
+
3492
+ /* ignore </td> </th> <option> etc. */
3493
+ TY_(FreeNode)( doc, node );
3494
+ continue;
3495
+ }
3496
+ }
3497
+
3498
+ if (node->type == EndTag)
3499
+ {
3500
+ if ( nodeIsBR(node) )
3501
+ node->type = StartTag;
3502
+ else if ( nodeIsP(node) )
3503
+ {
3504
+ node->type = StartEndTag;
3505
+ node->implicit = yes;
3506
+ #if OBSOLETE
3507
+ TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
3508
+ FreeAttrs( doc, node ); /* discard align attribute etc. */
3509
+ TY_(InsertNodeAtEnd)(body, node);
3510
+ node = TY_(InferredTag)(doc, TidyTag_BR);
3511
+ #endif
3512
+ }
3513
+ else if ( TY_(nodeHasCM)(node, CM_INLINE) )
3514
+ TY_(PopInline)( doc, node );
3515
+ }
3516
+
3517
+ if (TY_(nodeIsElement)(node))
3518
+ {
3519
+ if ( TY_(nodeHasCM)(node, CM_INLINE) && !TY_(nodeHasCM)(node, CM_MIXED) )
3520
+ {
3521
+ /* HTML4 strict doesn't allow inline content here */
3522
+ /* but HTML2 does allow img elements as children of body */
3523
+ if ( nodeIsIMG(node) )
3524
+ TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
3525
+ else
3526
+ TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));
3527
+
3528
+ if (checkstack && !node->implicit)
3529
+ {
3530
+ checkstack = no;
3531
+
3532
+ if ( TY_(InlineDup)(doc, node) > 0 )
3533
+ continue;
3534
+ }
3535
+
3536
+ mode = MixedContent;
3537
+ }
3538
+ else
3539
+ {
3540
+ checkstack = yes;
3541
+ mode = IgnoreWhitespace;
3542
+ }
3543
+
3544
+ if (node->implicit)
3545
+ TY_(ReportError)(doc, body, node, INSERTING_TAG);
3546
+
3547
+ TY_(InsertNodeAtEnd)(body, node);
3548
+ ParseTag(doc, node, mode);
3549
+ continue;
3550
+ }
3551
+
3552
+ /* discard unexpected tags */
3553
+ TY_(ReportError)(doc, body, node, DISCARDING_UNEXPECTED);
3554
+ TY_(FreeNode)( doc, node);
3555
+ }
3556
+ }
3557
+
3558
+ void TY_(ParseNoFrames)(TidyDocImpl* doc, Node *noframes, GetTokenMode mode)
3559
+ {
3560
+ Lexer* lexer = doc->lexer;
3561
+ Node *node;
3562
+
3563
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3564
+ {
3565
+ doc->badAccess |= BA_USING_NOFRAMES;
3566
+ }
3567
+ mode = IgnoreWhitespace;
3568
+
3569
+ while ( (node = TY_(GetToken)(doc, mode)) != NULL )
3570
+ {
3571
+ if ( node->tag == noframes->tag && node->type == EndTag )
3572
+ {
3573
+ TY_(FreeNode)( doc, node);
3574
+ noframes->closed = yes;
3575
+ TrimSpaces(doc, noframes);
3576
+ return;
3577
+ }
3578
+
3579
+ if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
3580
+ {
3581
+ TrimSpaces(doc, noframes);
3582
+ if (node->type == EndTag)
3583
+ {
3584
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3585
+ TY_(FreeNode)( doc, node); /* Throw it away */
3586
+ }
3587
+ else
3588
+ {
3589
+ TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
3590
+ TY_(UngetToken)( doc );
3591
+ }
3592
+ return;
3593
+ }
3594
+
3595
+ if ( nodeIsHTML(node) )
3596
+ {
3597
+ if (TY_(nodeIsElement)(node))
3598
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3599
+
3600
+ TY_(FreeNode)( doc, node);
3601
+ continue;
3602
+ }
3603
+
3604
+ /* deal with comments etc. */
3605
+ if (InsertMisc(noframes, node))
3606
+ continue;
3607
+
3608
+ if ( nodeIsBODY(node) && node->type == StartTag )
3609
+ {
3610
+ Bool seen_body = lexer->seenEndBody;
3611
+ TY_(InsertNodeAtEnd)(noframes, node);
3612
+ ParseTag(doc, node, IgnoreWhitespace /*MixedContent*/);
3613
+
3614
+ /* fix for bug http://tidy.sf.net/bug/887259 */
3615
+ if (seen_body && TY_(FindBody)(doc) != node)
3616
+ {
3617
+ TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
3618
+ MoveNodeToBody(doc, node);
3619
+ }
3620
+ continue;
3621
+ }
3622
+
3623
+ /* implicit body element inferred */
3624
+ if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
3625
+ {
3626
+ Node *body = TY_(FindBody)( doc );
3627
+ if ( body || lexer->seenEndBody )
3628
+ {
3629
+ if ( body == NULL )
3630
+ {
3631
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3632
+ TY_(FreeNode)( doc, node);
3633
+ continue;
3634
+ }
3635
+ if ( TY_(nodeIsText)(node) )
3636
+ {
3637
+ TY_(UngetToken)( doc );
3638
+ node = TY_(InferredTag)(doc, TidyTag_P);
3639
+ TY_(ReportError)(doc, noframes, node, CONTENT_AFTER_BODY );
3640
+ }
3641
+ TY_(InsertNodeAtEnd)( body, node );
3642
+ }
3643
+ else
3644
+ {
3645
+ TY_(UngetToken)( doc );
3646
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3647
+ if ( cfgBool(doc, TidyXmlOut) )
3648
+ TY_(ReportError)(doc, noframes, node, INSERTING_TAG);
3649
+ TY_(InsertNodeAtEnd)( noframes, node );
3650
+ }
3651
+
3652
+ ParseTag( doc, node, IgnoreWhitespace /*MixedContent*/ );
3653
+ continue;
3654
+ }
3655
+
3656
+ /* discard unexpected end tags */
3657
+ TY_(ReportError)(doc, noframes, node, DISCARDING_UNEXPECTED);
3658
+ TY_(FreeNode)( doc, node);
3659
+ }
3660
+
3661
+ TY_(ReportError)(doc, noframes, node, MISSING_ENDTAG_FOR);
3662
+ }
3663
+
3664
+ void TY_(ParseFrameSet)(TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode))
3665
+ {
3666
+ Lexer* lexer = doc->lexer;
3667
+ Node *node;
3668
+
3669
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3670
+ {
3671
+ doc->badAccess |= BA_USING_FRAMES;
3672
+ }
3673
+
3674
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
3675
+ {
3676
+ if (node->tag == frameset->tag && node->type == EndTag)
3677
+ {
3678
+ TY_(FreeNode)( doc, node);
3679
+ frameset->closed = yes;
3680
+ TrimSpaces(doc, frameset);
3681
+ return;
3682
+ }
3683
+
3684
+ /* deal with comments etc. */
3685
+ if (InsertMisc(frameset, node))
3686
+ continue;
3687
+
3688
+ if (node->tag == NULL)
3689
+ {
3690
+ TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3691
+ TY_(FreeNode)( doc, node);
3692
+ continue;
3693
+ }
3694
+
3695
+ if (TY_(nodeIsElement)(node))
3696
+ {
3697
+ if (node->tag && node->tag->model & CM_HEAD)
3698
+ {
3699
+ MoveToHead(doc, frameset, node);
3700
+ continue;
3701
+ }
3702
+ }
3703
+
3704
+ if ( nodeIsBODY(node) )
3705
+ {
3706
+ TY_(UngetToken)( doc );
3707
+ node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3708
+ TY_(ReportError)(doc, frameset, node, INSERTING_TAG);
3709
+ }
3710
+
3711
+ if (node->type == StartTag && (node->tag->model & CM_FRAMES))
3712
+ {
3713
+ TY_(InsertNodeAtEnd)(frameset, node);
3714
+ lexer->excludeBlocks = no;
3715
+ ParseTag(doc, node, MixedContent);
3716
+ continue;
3717
+ }
3718
+ else if (node->type == StartEndTag && (node->tag->model & CM_FRAMES))
3719
+ {
3720
+ TY_(InsertNodeAtEnd)(frameset, node);
3721
+ continue;
3722
+ }
3723
+
3724
+ /* discard unexpected tags */
3725
+ #if SUPPORT_ACCESSIBILITY_CHECKS
3726
+ /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
3727
+ if ( nodeIsA(node) )
3728
+ doc->badAccess |= BA_INVALID_LINK_NOFRAMES;
3729
+ #endif
3730
+
3731
+ TY_(ReportError)(doc, frameset, node, DISCARDING_UNEXPECTED);
3732
+ TY_(FreeNode)( doc, node);
3733
+ }
3734
+
3735
+ TY_(ReportError)(doc, frameset, node, MISSING_ENDTAG_FOR);
3736
+ }
3737
+
3738
+ void TY_(ParseHTML)(TidyDocImpl* doc, Node *html, GetTokenMode mode)
3739
+ {
3740
+ Node *node, *head;
3741
+ Node *frameset = NULL;
3742
+ Node *noframes = NULL;
3743
+
3744
+ TY_(SetOptionBool)( doc, TidyXmlTags, no );
3745
+
3746
+ for (;;)
3747
+ {
3748
+ node = TY_(GetToken)(doc, IgnoreWhitespace);
3749
+
3750
+ if (node == NULL)
3751
+ {
3752
+ node = TY_(InferredTag)(doc, TidyTag_HEAD);
3753
+ break;
3754
+ }
3755
+
3756
+ if ( nodeIsHEAD(node) )
3757
+ break;
3758
+
3759
+ if (node->tag == html->tag && node->type == EndTag)
3760
+ {
3761
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3762
+ TY_(FreeNode)( doc, node);
3763
+ continue;
3764
+ }
3765
+
3766
+ /* find and discard multiple <html> elements */
3767
+ if (node->tag == html->tag && node->type == StartTag)
3768
+ {
3769
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3770
+ TY_(FreeNode)(doc, node);
3771
+ continue;
3772
+ }
3773
+
3774
+ /* deal with comments etc. */
3775
+ if (InsertMisc(html, node))
3776
+ continue;
3777
+
3778
+ TY_(UngetToken)( doc );
3779
+ node = TY_(InferredTag)(doc, TidyTag_HEAD);
3780
+ break;
3781
+ }
3782
+
3783
+ head = node;
3784
+ TY_(InsertNodeAtEnd)(html, head);
3785
+ TY_(ParseHead)(doc, head, mode);
3786
+
3787
+ for (;;)
3788
+ {
3789
+ node = TY_(GetToken)(doc, IgnoreWhitespace);
3790
+
3791
+ if (node == NULL)
3792
+ {
3793
+ if (frameset == NULL) /* implied body */
3794
+ {
3795
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3796
+ TY_(InsertNodeAtEnd)(html, node);
3797
+ TY_(ParseBody)(doc, node, mode);
3798
+ }
3799
+
3800
+ return;
3801
+ }
3802
+
3803
+ /* robustly handle html tags */
3804
+ if (node->tag == html->tag)
3805
+ {
3806
+ if (node->type != StartTag && frameset == NULL)
3807
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3808
+
3809
+ TY_(FreeNode)( doc, node);
3810
+ continue;
3811
+ }
3812
+
3813
+ /* deal with comments etc. */
3814
+ if (InsertMisc(html, node))
3815
+ continue;
3816
+
3817
+ /* if frameset document coerce <body> to <noframes> */
3818
+ if ( nodeIsBODY(node) )
3819
+ {
3820
+ if (node->type != StartTag)
3821
+ {
3822
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3823
+ TY_(FreeNode)( doc, node);
3824
+ continue;
3825
+ }
3826
+
3827
+ if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
3828
+ {
3829
+ if (frameset != NULL)
3830
+ {
3831
+ TY_(UngetToken)( doc );
3832
+
3833
+ if (noframes == NULL)
3834
+ {
3835
+ noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3836
+ TY_(InsertNodeAtEnd)(frameset, noframes);
3837
+ TY_(ReportError)(doc, html, noframes, INSERTING_TAG);
3838
+ }
3839
+ else
3840
+ {
3841
+ if (noframes->type == StartEndTag)
3842
+ noframes->type = StartTag;
3843
+ }
3844
+
3845
+ ParseTag(doc, noframes, mode);
3846
+ continue;
3847
+ }
3848
+ }
3849
+
3850
+ TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3851
+ break; /* to parse body */
3852
+ }
3853
+
3854
+ /* flag an error if we see more than one frameset */
3855
+ if ( nodeIsFRAMESET(node) )
3856
+ {
3857
+ if (node->type != StartTag)
3858
+ {
3859
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3860
+ TY_(FreeNode)( doc, node);
3861
+ continue;
3862
+ }
3863
+
3864
+ if (frameset != NULL)
3865
+ TY_(ReportFatal)(doc, html, node, DUPLICATE_FRAMESET);
3866
+ else
3867
+ frameset = node;
3868
+
3869
+ TY_(InsertNodeAtEnd)(html, node);
3870
+ ParseTag(doc, node, mode);
3871
+
3872
+ /*
3873
+ see if it includes a noframes element so
3874
+ that we can merge subsequent noframes elements
3875
+ */
3876
+
3877
+ for (node = frameset->content; node; node = node->next)
3878
+ {
3879
+ if ( nodeIsNOFRAMES(node) )
3880
+ noframes = node;
3881
+ }
3882
+ continue;
3883
+ }
3884
+
3885
+ /* if not a frameset document coerce <noframes> to <body> */
3886
+ if ( nodeIsNOFRAMES(node) )
3887
+ {
3888
+ if (node->type != StartTag)
3889
+ {
3890
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3891
+ TY_(FreeNode)( doc, node);
3892
+ continue;
3893
+ }
3894
+
3895
+ if (frameset == NULL)
3896
+ {
3897
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3898
+ TY_(FreeNode)( doc, node);
3899
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3900
+ break;
3901
+ }
3902
+
3903
+ if (noframes == NULL)
3904
+ {
3905
+ noframes = node;
3906
+ TY_(InsertNodeAtEnd)(frameset, noframes);
3907
+ }
3908
+ else
3909
+ TY_(FreeNode)( doc, node);
3910
+
3911
+ ParseTag(doc, noframes, mode);
3912
+ continue;
3913
+ }
3914
+
3915
+ if (TY_(nodeIsElement)(node))
3916
+ {
3917
+ if (node->tag && node->tag->model & CM_HEAD)
3918
+ {
3919
+ MoveToHead(doc, html, node);
3920
+ continue;
3921
+ }
3922
+
3923
+ /* discard illegal frame element following a frameset */
3924
+ if ( frameset != NULL && nodeIsFRAME(node) )
3925
+ {
3926
+ TY_(ReportError)(doc, html, node, DISCARDING_UNEXPECTED);
3927
+ TY_(FreeNode)(doc, node);
3928
+ continue;
3929
+ }
3930
+ }
3931
+
3932
+ TY_(UngetToken)( doc );
3933
+
3934
+ /* insert other content into noframes element */
3935
+
3936
+ if (frameset)
3937
+ {
3938
+ if (noframes == NULL)
3939
+ {
3940
+ noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3941
+ TY_(InsertNodeAtEnd)(frameset, noframes);
3942
+ }
3943
+ else
3944
+ {
3945
+ TY_(ReportError)(doc, html, node, NOFRAMES_CONTENT);
3946
+ if (noframes->type == StartEndTag)
3947
+ noframes->type = StartTag;
3948
+ }
3949
+
3950
+ TY_(ConstrainVersion)(doc, VERS_FRAMESET);
3951
+ ParseTag(doc, noframes, mode);
3952
+ continue;
3953
+ }
3954
+
3955
+ node = TY_(InferredTag)(doc, TidyTag_BODY);
3956
+ TY_(ReportError)(doc, html, node, INSERTING_TAG );
3957
+ TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3958
+ break;
3959
+ }
3960
+
3961
+ /* node must be body */
3962
+
3963
+ TY_(InsertNodeAtEnd)(html, node);
3964
+ ParseTag(doc, node, mode);
3965
+ }
3966
+
3967
+ static Bool nodeCMIsOnlyInline( Node* node )
3968
+ {
3969
+ return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
3970
+ }
3971
+
3972
+ static void EncloseBodyText(TidyDocImpl* doc)
3973
+ {
3974
+ Node* node;
3975
+ Node* body = TY_(FindBody)(doc);
3976
+
3977
+ if (!body)
3978
+ return;
3979
+
3980
+ node = body->content;
3981
+
3982
+ while (node)
3983
+ {
3984
+ if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) ||
3985
+ (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node)))
3986
+ {
3987
+ Node* p = TY_(InferredTag)(doc, TidyTag_P);
3988
+ TY_(InsertNodeBeforeElement)(node, p);
3989
+ while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node)))
3990
+ {
3991
+ Node* next = node->next;
3992
+ TY_(RemoveNode)(node);
3993
+ TY_(InsertNodeAtEnd)(p, node);
3994
+ node = next;
3995
+ }
3996
+ TrimSpaces(doc, p);
3997
+ continue;
3998
+ }
3999
+ node = node->next;
4000
+ }
4001
+ }
4002
+
4003
+ /* <form>, <blockquote> and <noscript> do not allow #PCDATA in
4004
+ HTML 4.01 Strict (%block; model instead of %flow;).
4005
+ When requested, text nodes in these elements are wrapped in <p>. */
4006
+ static void EncloseBlockText(TidyDocImpl* doc, Node* node)
4007
+ {
4008
+ Node *next;
4009
+ Node *block;
4010
+
4011
+ while (node)
4012
+ {
4013
+ next = node->next;
4014
+
4015
+ if (node->content)
4016
+ EncloseBlockText(doc, node->content);
4017
+
4018
+ if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
4019
+ nodeIsBLOCKQUOTE(node))
4020
+ || !node->content)
4021
+ {
4022
+ node = next;
4023
+ continue;
4024
+ }
4025
+
4026
+ block = node->content;
4027
+
4028
+ if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
4029
+ (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
4030
+ {
4031
+ Node* p = TY_(InferredTag)(doc, TidyTag_P);
4032
+ TY_(InsertNodeBeforeElement)(block, p);
4033
+ while (block &&
4034
+ (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
4035
+ {
4036
+ Node* tempNext = block->next;
4037
+ TY_(RemoveNode)(block);
4038
+ TY_(InsertNodeAtEnd)(p, block);
4039
+ block = tempNext;
4040
+ }
4041
+ TrimSpaces(doc, p);
4042
+ continue;
4043
+ }
4044
+
4045
+ node = next;
4046
+ }
4047
+ }
4048
+
4049
+ static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
4050
+ {
4051
+ Node *next;
4052
+
4053
+ while (node)
4054
+ {
4055
+ next = node->next;
4056
+
4057
+ if (nodeIsDIR(node) || nodeIsMENU(node))
4058
+ TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes);
4059
+
4060
+ if (nodeIsXMP(node) || nodeIsLISTING(node) ||
4061
+ (node->tag && node->tag->id == TidyTag_PLAINTEXT))
4062
+ TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes);
4063
+
4064
+ if (node->content)
4065
+ ReplaceObsoleteElements(doc, node->content);
4066
+
4067
+ node = next;
4068
+ }
4069
+ }
4070
+
4071
+ static void AttributeChecks(TidyDocImpl* doc, Node* node)
4072
+ {
4073
+ Node *next;
4074
+
4075
+ while (node)
4076
+ {
4077
+ next = node->next;
4078
+
4079
+ if (TY_(nodeIsElement)(node))
4080
+ {
4081
+ if (node->tag->chkattrs)
4082
+ node->tag->chkattrs(doc, node);
4083
+ else
4084
+ TY_(CheckAttributes)(doc, node);
4085
+ }
4086
+
4087
+ if (node->content)
4088
+ AttributeChecks(doc, node->content);
4089
+
4090
+ assert( next != node ); /* http://tidy.sf.net/issue/1603538 */
4091
+ node = next;
4092
+ }
4093
+ }
4094
+
4095
+ /*
4096
+ HTML is the top level element
4097
+ */
4098
+ void TY_(ParseDocument)(TidyDocImpl* doc)
4099
+ {
4100
+ Node *node, *html, *doctype = NULL;
4101
+
4102
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4103
+ {
4104
+ if (node->type == XmlDecl)
4105
+ {
4106
+ if (TY_(FindXmlDecl)(doc) && doc->root.content)
4107
+ {
4108
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4109
+ TY_(FreeNode)(doc, node);
4110
+ continue;
4111
+ }
4112
+ if (node->line != 1 || (node->line == 1 && node->column != 1))
4113
+ {
4114
+ TY_(ReportError)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
4115
+ }
4116
+ }
4117
+ #ifdef AUTO_INPUT_ENCODING
4118
+ if (node->type == XmlDecl)
4119
+ {
4120
+ AttVal* encoding = GetAttrByName(node, "encoding");
4121
+ if (AttrHasValue(encoding))
4122
+ {
4123
+ uint id = TY_(GetEncodingIdFromName)(encoding->value);
4124
+
4125
+ /* todo: detect mismatch with BOM/XMLDecl/declared */
4126
+ /* todo: error for unsupported encodings */
4127
+ /* todo: try to re-init transcoder */
4128
+ /* todo: change input/output encoding settings */
4129
+ /* todo: store id in StreamIn */
4130
+ }
4131
+ }
4132
+ #endif /* AUTO_INPUT_ENCODING */
4133
+
4134
+ /* deal with comments etc. */
4135
+ if (InsertMisc( &doc->root, node ))
4136
+ continue;
4137
+
4138
+ if (node->type == DocTypeTag)
4139
+ {
4140
+ if (doctype == NULL)
4141
+ {
4142
+ TY_(InsertNodeAtEnd)( &doc->root, node);
4143
+ doctype = node;
4144
+ }
4145
+ else
4146
+ {
4147
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4148
+ TY_(FreeNode)( doc, node);
4149
+ }
4150
+ continue;
4151
+ }
4152
+
4153
+ if (node->type == EndTag)
4154
+ {
4155
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4156
+ TY_(FreeNode)( doc, node);
4157
+ continue;
4158
+ }
4159
+
4160
+ if (node->type == StartTag && nodeIsHTML(node))
4161
+ {
4162
+ AttVal *xmlns;
4163
+
4164
+ xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);
4165
+
4166
+ if (AttrValueIs(xmlns, XHTML_NAMESPACE))
4167
+ {
4168
+ Bool htmlOut = cfgBool( doc, TidyHtmlOut );
4169
+ doc->lexer->isvoyager = yes; /* Unless plain HTML */
4170
+ TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
4171
+ TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */
4172
+
4173
+ /* adjust other config options, just as in config.c */
4174
+ if ( !htmlOut )
4175
+ {
4176
+ TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
4177
+ TY_(SetOptionBool)( doc, TidyUpperCaseAttrs, no );
4178
+ }
4179
+ }
4180
+ }
4181
+
4182
+ if ( node->type != StartTag || !nodeIsHTML(node) )
4183
+ {
4184
+ TY_(UngetToken)( doc );
4185
+ html = TY_(InferredTag)(doc, TidyTag_HTML);
4186
+ }
4187
+ else
4188
+ html = node;
4189
+
4190
+ if (!TY_(FindDocType)(doc))
4191
+ TY_(ReportError)(doc, NULL, NULL, MISSING_DOCTYPE);
4192
+
4193
+ TY_(InsertNodeAtEnd)( &doc->root, html);
4194
+ TY_(ParseHTML)( doc, html, IgnoreWhitespace );
4195
+ break;
4196
+ }
4197
+
4198
+ #if SUPPORT_ACCESSIBILITY_CHECKS
4199
+ /* do this before any more document fixes */
4200
+ if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
4201
+ TY_(AccessibilityChecks)( doc );
4202
+ #endif /* #if SUPPORT_ACCESSIBILITY_CHECKS */
4203
+
4204
+ if (!TY_(FindHTML)(doc))
4205
+ {
4206
+ /* a later check should complain if <body> is empty */
4207
+ html = TY_(InferredTag)(doc, TidyTag_HTML);
4208
+ TY_(InsertNodeAtEnd)( &doc->root, html);
4209
+ TY_(ParseHTML)(doc, html, IgnoreWhitespace);
4210
+ }
4211
+
4212
+ if (!TY_(FindTITLE)(doc))
4213
+ {
4214
+ Node* head = TY_(FindHEAD)(doc);
4215
+ TY_(ReportError)(doc, head, NULL, MISSING_TITLE_ELEMENT);
4216
+ TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
4217
+ }
4218
+
4219
+ AttributeChecks(doc, &doc->root);
4220
+ ReplaceObsoleteElements(doc, &doc->root);
4221
+ TY_(DropEmptyElements)(doc, &doc->root);
4222
+ CleanSpaces(doc, &doc->root);
4223
+
4224
+ if (cfgBool(doc, TidyEncloseBodyText))
4225
+ EncloseBodyText(doc);
4226
+ if (cfgBool(doc, TidyEncloseBlockText))
4227
+ EncloseBlockText(doc, &doc->root);
4228
+ }
4229
+
4230
+ Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
4231
+ {
4232
+ AttVal *attribute;
4233
+
4234
+ /* search attributes for xml:space */
4235
+ for (attribute = element->attributes; attribute; attribute = attribute->next)
4236
+ {
4237
+ if (attrIsXML_SPACE(attribute))
4238
+ {
4239
+ if (AttrValueIs(attribute, "preserve"))
4240
+ return yes;
4241
+
4242
+ return no;
4243
+ }
4244
+ }
4245
+
4246
+ if (element->element == NULL)
4247
+ return no;
4248
+
4249
+ /* kludge for html docs without explicit xml:space attribute */
4250
+ if (nodeIsPRE(element) ||
4251
+ nodeIsSCRIPT(element) ||
4252
+ nodeIsSTYLE(element) ||
4253
+ TY_(FindParser)(doc, element) == TY_(ParsePre))
4254
+ return yes;
4255
+
4256
+ /* kludge for XSL docs */
4257
+ if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
4258
+ return yes;
4259
+
4260
+ return no;
4261
+ }
4262
+
4263
+ /*
4264
+ XML documents
4265
+ */
4266
+ static void ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
4267
+ {
4268
+ Lexer* lexer = doc->lexer;
4269
+ Node *node;
4270
+
4271
+ /* if node is pre or has xml:space="preserve" then do so */
4272
+
4273
+ if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
4274
+ mode = Preformatted;
4275
+
4276
+ while ((node = TY_(GetToken)(doc, mode)) != NULL)
4277
+ {
4278
+ if (node->type == EndTag &&
4279
+ node->element && element->element &&
4280
+ TY_(tmbstrcmp)(node->element, element->element) == 0)
4281
+ {
4282
+ TY_(FreeNode)( doc, node);
4283
+ element->closed = yes;
4284
+ break;
4285
+ }
4286
+
4287
+ /* discard unexpected end tags */
4288
+ if (node->type == EndTag)
4289
+ {
4290
+ if (element)
4291
+ TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG_IN);
4292
+ else
4293
+ TY_(ReportFatal)(doc, element, node, UNEXPECTED_ENDTAG);
4294
+
4295
+ TY_(FreeNode)( doc, node);
4296
+ continue;
4297
+ }
4298
+
4299
+ /* parse content on seeing start tag */
4300
+ if (node->type == StartTag)
4301
+ ParseXMLElement( doc, node, mode );
4302
+
4303
+ TY_(InsertNodeAtEnd)(element, node);
4304
+ }
4305
+
4306
+ /*
4307
+ if first child is text then trim initial space and
4308
+ delete text node if it is empty.
4309
+ */
4310
+
4311
+ node = element->content;
4312
+
4313
+ if (TY_(nodeIsText)(node) && mode != Preformatted)
4314
+ {
4315
+ if ( lexer->lexbuf[node->start] == ' ' )
4316
+ {
4317
+ node->start++;
4318
+
4319
+ if (node->start >= node->end)
4320
+ TY_(DiscardElement)( doc, node );
4321
+ }
4322
+ }
4323
+
4324
+ /*
4325
+ if last child is text then trim final space and
4326
+ delete the text node if it is empty
4327
+ */
4328
+
4329
+ node = element->last;
4330
+
4331
+ if (TY_(nodeIsText)(node) && mode != Preformatted)
4332
+ {
4333
+ if ( lexer->lexbuf[node->end - 1] == ' ' )
4334
+ {
4335
+ node->end--;
4336
+
4337
+ if (node->start >= node->end)
4338
+ TY_(DiscardElement)( doc, node );
4339
+ }
4340
+ }
4341
+ }
4342
+
4343
+ void TY_(ParseXMLDocument)(TidyDocImpl* doc)
4344
+ {
4345
+ Node *node, *doctype = NULL;
4346
+
4347
+ TY_(SetOptionBool)( doc, TidyXmlTags, yes );
4348
+
4349
+ while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4350
+ {
4351
+ /* discard unexpected end tags */
4352
+ if (node->type == EndTag)
4353
+ {
4354
+ TY_(ReportError)(doc, NULL, node, UNEXPECTED_ENDTAG);
4355
+ TY_(FreeNode)( doc, node);
4356
+ continue;
4357
+ }
4358
+
4359
+ /* deal with comments etc. */
4360
+ if (InsertMisc( &doc->root, node))
4361
+ continue;
4362
+
4363
+ if (node->type == DocTypeTag)
4364
+ {
4365
+ if (doctype == NULL)
4366
+ {
4367
+ TY_(InsertNodeAtEnd)( &doc->root, node);
4368
+ doctype = node;
4369
+ }
4370
+ else
4371
+ {
4372
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4373
+ TY_(FreeNode)( doc, node);
4374
+ }
4375
+ continue;
4376
+ }
4377
+
4378
+ if (node->type == StartEndTag)
4379
+ {
4380
+ TY_(InsertNodeAtEnd)( &doc->root, node);
4381
+ continue;
4382
+ }
4383
+
4384
+ /* if start tag then parse element's content */
4385
+ if (node->type == StartTag)
4386
+ {
4387
+ TY_(InsertNodeAtEnd)( &doc->root, node );
4388
+ ParseXMLElement( doc, node, IgnoreWhitespace );
4389
+ continue;
4390
+ }
4391
+
4392
+ TY_(ReportError)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
4393
+ TY_(FreeNode)( doc, node);
4394
+ }
4395
+
4396
+ /* ensure presence of initial <?xml version="1.0"?> */
4397
+ if ( cfgBool(doc, TidyXmlDecl) )
4398
+ TY_(FixXmlDecl)( doc );
4399
+ }
4400
+
4401
+ /*
4402
+ * local variables:
4403
+ * mode: c
4404
+ * indent-tabs-mode: nil
4405
+ * c-basic-offset: 4
4406
+ * eval: (c-set-offset 'substatement-open 0)
4407
+ * end:
4408
+ */