tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/clean.c ADDED
@@ -0,0 +1,2674 @@
1
+ /*
2
+ clean.c -- clean up misuse of presentation markup
3
+
4
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5
+ See tidy.h for the copyright notice.
6
+
7
+ CVS Info :
8
+
9
+ $Author: arnaud02 $
10
+ $Date: 2008/10/14 12:18:10 $
11
+ $Revision: 1.111 $
12
+
13
+ Filters from other formats such as Microsoft Word
14
+ often make excessive use of presentation markup such
15
+ as font tags, B, I, and the align attribute. By applying
16
+ a set of production rules, it is straightforward to
17
+ transform this to use CSS.
18
+
19
+ Some rules replace some of the children of an element by
20
+ style properties on the element, e.g.
21
+
22
+ <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23
+
24
+ Such rules are applied to the element's content and then
25
+ to the element itself until no more rules apply.
26
+ Having applied all the rules to an element, it will have
27
+ a style attribute with one or more properties.
28
+
29
+ Other rules strip the element they apply to, replacing
30
+ it by style properties on the contents, e.g.
31
+
32
+ <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
33
+
34
+ These rules are applied to an element before processing
35
+ its content and replace the current element by the first
36
+ element in the exposed content.
37
+
38
+ After applying both sets of rules, you can replace the
39
+ style attribute by a class value and style rule in the
40
+ document head. To support this, an association of styles
41
+ and class names is built.
42
+
43
+ A naive approach is to rely on string matching to test
44
+ when two property lists are the same. A better approach
45
+ would be to first sort the properties before matching.
46
+
47
+ */
48
+
49
+ #include <stdio.h>
50
+ #include <stdlib.h>
51
+ #include <string.h>
52
+
53
+ #include "tidy-int.h"
54
+ #include "clean.h"
55
+ #include "lexer.h"
56
+ #include "parser.h"
57
+ #include "attrs.h"
58
+ #include "message.h"
59
+ #include "tmbstr.h"
60
+ #include "utf8.h"
61
+
62
+ static Node* CleanNode( TidyDocImpl* doc, Node *node );
63
+
64
+ static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
65
+ {
66
+ const Dict* dict = TY_(LookupTagDef)( tid );
67
+ TidyDocFree( doc, node->element );
68
+ node->element = TY_(tmbstrdup)( doc->allocator, dict->name );
69
+ node->tag = dict;
70
+ }
71
+
72
+ static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
73
+ {
74
+ StyleProp *next;
75
+
76
+ while (props)
77
+ {
78
+ next = props->next;
79
+ TidyDocFree(doc, props->name);
80
+ TidyDocFree(doc, props->value);
81
+ TidyDocFree(doc, props);
82
+ props = next;
83
+ }
84
+ }
85
+
86
+ static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
87
+ {
88
+ StyleProp *first, *prev, *prop;
89
+ int cmp;
90
+
91
+ prev = NULL;
92
+ first = props;
93
+
94
+ while (props)
95
+ {
96
+ cmp = TY_(tmbstrcmp)(props->name, name);
97
+
98
+ if (cmp == 0)
99
+ {
100
+ /* this property is already defined, ignore new value */
101
+ return first;
102
+ }
103
+
104
+ if (cmp > 0)
105
+ {
106
+ /* insert before this */
107
+
108
+ prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
109
+ prop->name = TY_(tmbstrdup)(doc->allocator, name);
110
+ prop->value = TY_(tmbstrdup)(doc->allocator, value);
111
+ prop->next = props;
112
+
113
+ if (prev)
114
+ prev->next = prop;
115
+ else
116
+ first = prop;
117
+
118
+ return first;
119
+ }
120
+
121
+ prev = props;
122
+ props = props->next;
123
+ }
124
+
125
+ prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
126
+ prop->name = TY_(tmbstrdup)(doc->allocator, name);
127
+ prop->value = TY_(tmbstrdup)(doc->allocator, value);
128
+ prop->next = NULL;
129
+
130
+ if (prev)
131
+ prev->next = prop;
132
+ else
133
+ first = prop;
134
+
135
+ return first;
136
+ }
137
+
138
+ /*
139
+ Create sorted linked list of properties from style string
140
+ It temporarily places nulls in place of ':' and ';' to
141
+ delimit the strings for the property name and value.
142
+ Some systems don't allow you to NULL literal strings,
143
+ so to avoid this, a copy is made first.
144
+ */
145
+ static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
146
+ {
147
+ tmbstr name, value = NULL, name_end, value_end, line;
148
+ Bool more;
149
+
150
+ line = TY_(tmbstrdup)(doc->allocator, style);
151
+ name = line;
152
+
153
+ while (*name)
154
+ {
155
+ while (*name == ' ')
156
+ ++name;
157
+
158
+ name_end = name;
159
+
160
+ while (*name_end)
161
+ {
162
+ if (*name_end == ':')
163
+ {
164
+ value = name_end + 1;
165
+ break;
166
+ }
167
+
168
+ ++name_end;
169
+ }
170
+
171
+ if (*name_end != ':')
172
+ break;
173
+
174
+ while ( value && *value == ' ')
175
+ ++value;
176
+
177
+ value_end = value;
178
+ more = no;
179
+
180
+ while (*value_end)
181
+ {
182
+ if (*value_end == ';')
183
+ {
184
+ more = yes;
185
+ break;
186
+ }
187
+
188
+ ++value_end;
189
+ }
190
+
191
+ *name_end = '\0';
192
+ *value_end = '\0';
193
+
194
+ prop = InsertProperty(doc, prop, name, value);
195
+ *name_end = ':';
196
+
197
+ if (more)
198
+ {
199
+ *value_end = ';';
200
+ name = value_end + 1;
201
+ continue;
202
+ }
203
+
204
+ break;
205
+ }
206
+
207
+ TidyDocFree(doc, line); /* free temporary copy */
208
+ return prop;
209
+ }
210
+
211
+ static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
212
+ {
213
+ tmbstr style, p, s;
214
+ uint len;
215
+ StyleProp *prop;
216
+
217
+ /* compute length */
218
+
219
+ for (len = 0, prop = props; prop; prop = prop->next)
220
+ {
221
+ len += TY_(tmbstrlen)(prop->name) + 2;
222
+ if (prop->value)
223
+ len += TY_(tmbstrlen)(prop->value) + 2;
224
+ }
225
+
226
+ style = (tmbstr) TidyDocAlloc(doc, len+1);
227
+ style[0] = '\0';
228
+
229
+ for (p = style, prop = props; prop; prop = prop->next)
230
+ {
231
+ s = prop->name;
232
+
233
+ while((*p++ = *s++))
234
+ continue;
235
+
236
+ if (prop->value)
237
+ {
238
+ *--p = ':';
239
+ *++p = ' ';
240
+ ++p;
241
+
242
+ s = prop->value;
243
+ while((*p++ = *s++))
244
+ continue;
245
+ }
246
+ if (prop->next == NULL)
247
+ break;
248
+
249
+ *--p = ';';
250
+ *++p = ' ';
251
+ ++p;
252
+ }
253
+
254
+ return style;
255
+ }
256
+
257
+ /*
258
+ create string with merged properties
259
+ static tmbstr AddProperty( ctmbstr style, ctmbstr property )
260
+ {
261
+ tmbstr line;
262
+ StyleProp *prop;
263
+
264
+ prop = CreateProps(doc, NULL, style);
265
+ prop = CreateProps(doc, prop, property);
266
+ line = CreatePropString(doc, prop);
267
+ FreeStyleProps(doc, prop);
268
+ return line;
269
+ }
270
+ */
271
+
272
+ void TY_(FreeStyles)( TidyDocImpl* doc )
273
+ {
274
+ Lexer* lexer = doc->lexer;
275
+ if ( lexer )
276
+ {
277
+ TagStyle *style, *next;
278
+ for ( style = lexer->styles; style; style = next )
279
+ {
280
+ next = style->next;
281
+ TidyDocFree( doc, style->tag );
282
+ TidyDocFree( doc, style->tag_class );
283
+ TidyDocFree( doc, style->properties );
284
+ TidyDocFree( doc, style );
285
+ }
286
+ }
287
+ }
288
+
289
+ static tmbstr GensymClass( TidyDocImpl* doc )
290
+ {
291
+ tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
292
+ ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
293
+ if ( pfx == NULL || *pfx == 0 )
294
+ pfx = "c";
295
+
296
+ TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
297
+ return TY_(tmbstrdup)(doc->allocator, buf);
298
+ }
299
+
300
+ static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
301
+ {
302
+ Lexer* lexer = doc->lexer;
303
+ TagStyle* style;
304
+
305
+ for (style = lexer->styles; style; style=style->next)
306
+ {
307
+ if (TY_(tmbstrcmp)(style->tag, tag) == 0 &&
308
+ TY_(tmbstrcmp)(style->properties, properties) == 0)
309
+ return style->tag_class;
310
+ }
311
+
312
+ style = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
313
+ style->tag = TY_(tmbstrdup)(doc->allocator, tag);
314
+ style->tag_class = GensymClass( doc );
315
+ style->properties = TY_(tmbstrdup)( doc->allocator, properties );
316
+ style->next = lexer->styles;
317
+ lexer->styles = style;
318
+ return style->tag_class;
319
+ }
320
+
321
+ /*
322
+ Add class="foo" to node
323
+ */
324
+ static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
325
+ {
326
+ AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
327
+
328
+ /*
329
+ if there already is a class attribute
330
+ then append class name after a space.
331
+ */
332
+ if (classattr)
333
+ TY_(AppendToClassAttr)( doc, classattr, classname );
334
+ else /* create new class attribute */
335
+ TY_(AddAttribute)( doc, node, "class", classname );
336
+ }
337
+
338
+ void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
339
+ {
340
+ ctmbstr classname;
341
+
342
+ classname = FindStyle( doc, node->element, stylevalue );
343
+ AddClass( doc, node, classname);
344
+ }
345
+
346
+ /*
347
+ Find style attribute in node, and replace it
348
+ by corresponding class attribute. Search for
349
+ class in style dictionary otherwise gensym
350
+ new class and add to dictionary.
351
+
352
+ Assumes that node doesn't have a class attribute
353
+ */
354
+ static void Style2Rule( TidyDocImpl* doc, Node *node)
355
+ {
356
+ AttVal *styleattr, *classattr;
357
+ ctmbstr classname;
358
+
359
+ styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
360
+
361
+ if (styleattr)
362
+ {
363
+ /* fix for http://tidy.sf.net/bug/850215 */
364
+ if (!styleattr->value)
365
+ {
366
+ TY_(RemoveAttribute)(doc, node, styleattr);
367
+ return;
368
+ }
369
+
370
+ classname = FindStyle( doc, node->element, styleattr->value );
371
+ classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
372
+
373
+ /*
374
+ if there already is a class attribute
375
+ then append class name after an underscore
376
+ */
377
+ if (classattr)
378
+ {
379
+ TY_(AppendToClassAttr)( doc, classattr, classname );
380
+ TY_(RemoveAttribute)( doc, node, styleattr );
381
+ }
382
+ else /* reuse style attribute for class attribute */
383
+ {
384
+ TidyDocFree(doc, styleattr->attribute);
385
+ TidyDocFree(doc, styleattr->value);
386
+ styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
387
+ styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
388
+ }
389
+ }
390
+ }
391
+
392
+ static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
393
+ {
394
+ if ( selector && color )
395
+ {
396
+ TY_(AddStringLiteral)(lexer, selector);
397
+ TY_(AddStringLiteral)(lexer, " { color: ");
398
+ TY_(AddStringLiteral)(lexer, color);
399
+ TY_(AddStringLiteral)(lexer, " }\n");
400
+ }
401
+ }
402
+
403
+ /*
404
+ move presentation attribs from body to style element
405
+
406
+ background="foo" -> body { background-image: url(foo) }
407
+ bgcolor="foo" -> body { background-color: foo }
408
+ text="foo" -> body { color: foo }
409
+ link="foo" -> :link { color: foo }
410
+ vlink="foo" -> :visited { color: foo }
411
+ alink="foo" -> :active { color: foo }
412
+ */
413
+ static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
414
+ {
415
+ Lexer* lexer = doc->lexer;
416
+ tmbstr bgurl = NULL;
417
+ tmbstr bgcolor = NULL;
418
+ tmbstr color = NULL;
419
+ AttVal* attr;
420
+
421
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
422
+ {
423
+ bgurl = attr->value;
424
+ attr->value = NULL;
425
+ TY_(RemoveAttribute)( doc, body, attr );
426
+ }
427
+
428
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
429
+ {
430
+ bgcolor = attr->value;
431
+ attr->value = NULL;
432
+ TY_(RemoveAttribute)( doc, body, attr );
433
+ }
434
+
435
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
436
+ {
437
+ color = attr->value;
438
+ attr->value = NULL;
439
+ TY_(RemoveAttribute)( doc, body, attr );
440
+ }
441
+
442
+ if ( bgurl || bgcolor || color )
443
+ {
444
+ TY_(AddStringLiteral)(lexer, " body {\n");
445
+ if (bgurl)
446
+ {
447
+ TY_(AddStringLiteral)(lexer, " background-image: url(");
448
+ TY_(AddStringLiteral)(lexer, bgurl);
449
+ TY_(AddStringLiteral)(lexer, ");\n");
450
+ TidyDocFree(doc, bgurl);
451
+ }
452
+ if (bgcolor)
453
+ {
454
+ TY_(AddStringLiteral)(lexer, " background-color: ");
455
+ TY_(AddStringLiteral)(lexer, bgcolor);
456
+ TY_(AddStringLiteral)(lexer, ";\n");
457
+ TidyDocFree(doc, bgcolor);
458
+ }
459
+ if (color)
460
+ {
461
+ TY_(AddStringLiteral)(lexer, " color: ");
462
+ TY_(AddStringLiteral)(lexer, color);
463
+ TY_(AddStringLiteral)(lexer, ";\n");
464
+ TidyDocFree(doc, color);
465
+ }
466
+
467
+ TY_(AddStringLiteral)(lexer, " }\n");
468
+ }
469
+
470
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
471
+ {
472
+ AddColorRule(lexer, " :link", attr->value);
473
+ TY_(RemoveAttribute)( doc, body, attr );
474
+ }
475
+
476
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
477
+ {
478
+ AddColorRule(lexer, " :visited", attr->value);
479
+ TY_(RemoveAttribute)( doc, body, attr );
480
+ }
481
+
482
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
483
+ {
484
+ AddColorRule(lexer, " :active", attr->value);
485
+ TY_(RemoveAttribute)( doc, body, attr );
486
+ }
487
+ }
488
+
489
+ static Bool NiceBody( TidyDocImpl* doc )
490
+ {
491
+ Node* node = TY_(FindBody)(doc);
492
+ if (node)
493
+ {
494
+ if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
495
+ TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
496
+ TY_(AttrGetById)(node, TidyAttr_TEXT) ||
497
+ TY_(AttrGetById)(node, TidyAttr_LINK) ||
498
+ TY_(AttrGetById)(node, TidyAttr_VLINK) ||
499
+ TY_(AttrGetById)(node, TidyAttr_ALINK))
500
+ {
501
+ doc->badLayout |= USING_BODY;
502
+ return no;
503
+ }
504
+ }
505
+
506
+ return yes;
507
+ }
508
+
509
+ /* create style element using rules from dictionary */
510
+ static void CreateStyleElement( TidyDocImpl* doc )
511
+ {
512
+ Lexer* lexer = doc->lexer;
513
+ Node *node, *head, *body;
514
+ TagStyle *style;
515
+ AttVal *av;
516
+
517
+ if ( lexer->styles == NULL && NiceBody(doc) )
518
+ return;
519
+
520
+ node = TY_(NewNode)( doc->allocator, lexer );
521
+ node->type = StartTag;
522
+ node->implicit = yes;
523
+ node->element = TY_(tmbstrdup)(doc->allocator, "style");
524
+ TY_(FindTag)( doc, node );
525
+
526
+ /* insert type attribute */
527
+ av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
528
+ TY_(InsertAttributeAtStart)( node, av );
529
+
530
+ body = TY_(FindBody)( doc );
531
+ lexer->txtstart = lexer->lexsize;
532
+ if ( body )
533
+ CleanBodyAttrs( doc, body );
534
+
535
+ for (style = lexer->styles; style; style = style->next)
536
+ {
537
+ TY_(AddCharToLexer)(lexer, ' ');
538
+ TY_(AddStringLiteral)(lexer, style->tag);
539
+ TY_(AddCharToLexer)(lexer, '.');
540
+ TY_(AddStringLiteral)(lexer, style->tag_class);
541
+ TY_(AddCharToLexer)(lexer, ' ');
542
+ TY_(AddCharToLexer)(lexer, '{');
543
+ TY_(AddStringLiteral)(lexer, style->properties);
544
+ TY_(AddCharToLexer)(lexer, '}');
545
+ TY_(AddCharToLexer)(lexer, '\n');
546
+ }
547
+
548
+ lexer->txtend = lexer->lexsize;
549
+
550
+ TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
551
+
552
+ /*
553
+ now insert style element into document head
554
+
555
+ doc is root node. search its children for html node
556
+ the head node should be first child of html node
557
+ */
558
+ if ( NULL != (head = TY_(FindHEAD)( doc )) )
559
+ TY_(InsertNodeAtEnd)( head, node );
560
+ }
561
+
562
+
563
+ /* ensure bidirectional links are consistent */
564
+ void TY_(FixNodeLinks)(Node *node)
565
+ {
566
+ Node *child;
567
+
568
+ if (node->prev)
569
+ node->prev->next = node;
570
+ else
571
+ node->parent->content = node;
572
+
573
+ if (node->next)
574
+ node->next->prev = node;
575
+ else
576
+ node->parent->last = node;
577
+
578
+ for (child = node->content; child; child = child->next)
579
+ child->parent = node;
580
+ }
581
+
582
+ /*
583
+ used to strip child of node when
584
+ the node has one and only one child
585
+ */
586
+ static void StripOnlyChild(TidyDocImpl* doc, Node *node)
587
+ {
588
+ Node *child;
589
+
590
+ child = node->content;
591
+ node->content = child->content;
592
+ node->last = child->last;
593
+ child->content = NULL;
594
+ TY_(FreeNode)(doc, child);
595
+
596
+ for (child = node->content; child; child = child->next)
597
+ child->parent = node;
598
+ }
599
+
600
+ /*
601
+ used to strip font start and end tags.
602
+ Extricate "element", replace it by its content and delete it.
603
+ */
604
+ static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
605
+ {
606
+ if (element->content)
607
+ {
608
+ Node *node, *parent = element->parent;
609
+
610
+ element->last->next = element->next;
611
+
612
+ if (element->next)
613
+ {
614
+ element->next->prev = element->last;
615
+ }
616
+ else
617
+ parent->last = element->last;
618
+
619
+ if (element->prev)
620
+ {
621
+ element->content->prev = element->prev;
622
+ element->prev->next = element->content;
623
+ }
624
+ else
625
+ parent->content = element->content;
626
+
627
+ for (node = element->content; node; node = node->next)
628
+ node->parent = parent;
629
+
630
+ *pnode = element->content;
631
+
632
+ element->next = element->content = NULL;
633
+ TY_(FreeNode)(doc, element);
634
+ }
635
+ else
636
+ {
637
+ *pnode = TY_(DiscardElement)(doc, element);
638
+ }
639
+ }
640
+
641
+ /*
642
+ Create new string that consists of the
643
+ combined style properties in s1 and s2
644
+
645
+ To merge property lists, we build a linked
646
+ list of property/values and insert properties
647
+ into the list in order, merging values for
648
+ the same property name.
649
+ */
650
+ static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
651
+ {
652
+ tmbstr s;
653
+ StyleProp *prop;
654
+
655
+ prop = CreateProps(doc, NULL, s1);
656
+ prop = CreateProps(doc, prop, s2);
657
+ s = CreatePropString(doc, prop);
658
+ FreeStyleProps(doc, prop);
659
+ return s;
660
+ }
661
+
662
+ /*
663
+ Add style property to element, creating style
664
+ attribute as needed and adding ; delimiter
665
+ */
666
+ void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
667
+ {
668
+ AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
669
+
670
+ /* if style attribute already exists then insert property */
671
+
672
+ if ( av )
673
+ {
674
+ if (av->value != NULL)
675
+ {
676
+ tmbstr s = MergeProperties( doc, av->value, property );
677
+ TidyDocFree( doc, av->value );
678
+ av->value = s;
679
+ }
680
+ else
681
+ {
682
+ av->value = TY_(tmbstrdup)( doc->allocator, property );
683
+ }
684
+ }
685
+ else /* else create new style attribute */
686
+ {
687
+ av = TY_(NewAttributeEx)( doc, "style", property, '"' );
688
+ TY_(InsertAttributeAtStart)( node, av );
689
+ }
690
+ }
691
+
692
+ static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
693
+ {
694
+ AttVal *av;
695
+ tmbstr s1, s2, names;
696
+
697
+ for (s2 = NULL, av = child->attributes; av; av = av->next)
698
+ {
699
+ if (attrIsCLASS(av))
700
+ {
701
+ s2 = av->value;
702
+ break;
703
+ }
704
+ }
705
+
706
+ for (s1 = NULL, av = node->attributes; av; av = av->next)
707
+ {
708
+ if (attrIsCLASS(av))
709
+ {
710
+ s1 = av->value;
711
+ break;
712
+ }
713
+ }
714
+
715
+ if (s1)
716
+ {
717
+ if (s2) /* merge class names from both */
718
+ {
719
+ uint l1, l2;
720
+ l1 = TY_(tmbstrlen)(s1);
721
+ l2 = TY_(tmbstrlen)(s2);
722
+ names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
723
+ TY_(tmbstrcpy)(names, s1);
724
+ names[l1] = ' ';
725
+ TY_(tmbstrcpy)(names+l1+1, s2);
726
+ TidyDocFree(doc, av->value);
727
+ av->value = names;
728
+ }
729
+ }
730
+ else if (s2) /* copy class names from child */
731
+ {
732
+ av = TY_(NewAttributeEx)( doc, "class", s2, '"' );
733
+ TY_(InsertAttributeAtStart)( node, av );
734
+ }
735
+ }
736
+
737
+ static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
738
+ {
739
+ AttVal *av;
740
+ tmbstr s1, s2, style;
741
+
742
+ /*
743
+ the child may have a class attribute used
744
+ for attaching styles, if so the class name
745
+ needs to be copied to node's class
746
+ */
747
+ MergeClasses(doc, node, child);
748
+
749
+ for (s2 = NULL, av = child->attributes; av; av = av->next)
750
+ {
751
+ if (attrIsSTYLE(av))
752
+ {
753
+ s2 = av->value;
754
+ break;
755
+ }
756
+ }
757
+
758
+ for (s1 = NULL, av = node->attributes; av; av = av->next)
759
+ {
760
+ if (attrIsSTYLE(av))
761
+ {
762
+ s1 = av->value;
763
+ break;
764
+ }
765
+ }
766
+
767
+ if (s1)
768
+ {
769
+ if (s2) /* merge styles from both */
770
+ {
771
+ style = MergeProperties(doc, s1, s2);
772
+ TidyDocFree(doc, av->value);
773
+ av->value = style;
774
+ }
775
+ }
776
+ else if (s2) /* copy style of child */
777
+ {
778
+ av = TY_(NewAttributeEx)( doc, "style", s2, '"' );
779
+ TY_(InsertAttributeAtStart)( node, av );
780
+ }
781
+ }
782
+
783
+ static ctmbstr FontSize2Name(ctmbstr size)
784
+ {
785
+ static const ctmbstr sizes[7] =
786
+ {
787
+ "60%", "70%", "80%", NULL,
788
+ "120%", "150%", "200%"
789
+ };
790
+
791
+ /* increment of 0.8 */
792
+ static const ctmbstr minussizes[] =
793
+ {
794
+ "100%", "80%", "64%", "51%",
795
+ "40%", "32%", "26%"
796
+ };
797
+
798
+ /* increment of 1.2 */
799
+ static const ctmbstr plussizes[] =
800
+ {
801
+ "100%", "120%", "144%", "172%",
802
+ "207%", "248%", "298%"
803
+ };
804
+
805
+ if (size[0] == '\0')
806
+ return NULL;
807
+
808
+ if ('0' <= size[0] && size[0] <= '6')
809
+ {
810
+ int n = size[0] - '0';
811
+ return sizes[n];
812
+ }
813
+
814
+ if (size[0] == '-')
815
+ {
816
+ if ('0' <= size[1] && size[1] <= '6')
817
+ {
818
+ int n = size[1] - '0';
819
+ return minussizes[n];
820
+ }
821
+ return "smaller"; /*"70%"; */
822
+ }
823
+
824
+ if ('0' <= size[1] && size[1] <= '6')
825
+ {
826
+ int n = size[1] - '0';
827
+ return plussizes[n];
828
+ }
829
+
830
+ return "larger"; /* "140%" */
831
+ }
832
+
833
+ static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
834
+ {
835
+ tmbchar buf[256];
836
+ TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face );
837
+ TY_(AddStyleProperty)( doc, node, buf );
838
+ }
839
+
840
+ static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
841
+ {
842
+ ctmbstr value = NULL;
843
+
844
+ if (nodeIsP(node))
845
+ {
846
+ if (TY_(tmbstrcmp)(size, "6") == 0)
847
+ value = "h1";
848
+ else if (TY_(tmbstrcmp)(size, "5") == 0)
849
+ value = "h2";
850
+ else if (TY_(tmbstrcmp)(size, "4") == 0)
851
+ value = "h3";
852
+
853
+ if (value)
854
+ {
855
+ TidyDocFree(doc, node->element);
856
+ node->element = TY_(tmbstrdup)(doc->allocator, value);
857
+ TY_(FindTag)(doc, node);
858
+ return;
859
+ }
860
+ }
861
+
862
+ value = FontSize2Name(size);
863
+
864
+ if (value)
865
+ {
866
+ tmbchar buf[64];
867
+ TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value);
868
+ TY_(AddStyleProperty)( doc, node, buf );
869
+ }
870
+ }
871
+
872
+ static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
873
+ {
874
+ tmbchar buf[128];
875
+ TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color);
876
+ TY_(AddStyleProperty)( doc, node, buf );
877
+ }
878
+
879
+ /* force alignment value to lower case */
880
+ static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
881
+ {
882
+ uint i;
883
+ tmbchar buf[128];
884
+
885
+ TY_(tmbstrcpy)( buf, "text-align: " );
886
+ for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
887
+ {
888
+ if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
889
+ break;
890
+ }
891
+ buf[i] = '\0';
892
+ TY_(AddStyleProperty)( doc, node, buf );
893
+ }
894
+
895
+ /*
896
+ add style properties to node corresponding to
897
+ the font face, size and color attributes
898
+ */
899
+ static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
900
+ {
901
+ while (av)
902
+ {
903
+ if (AttrHasValue(av))
904
+ {
905
+ if (attrIsFACE(av))
906
+ AddFontFace( doc, node, av->value );
907
+ else if (attrIsSIZE(av))
908
+ AddFontSize( doc, node, av->value );
909
+ else if (attrIsCOLOR(av))
910
+ AddFontColor( doc, node, av->value );
911
+ }
912
+ av = av->next;
913
+ }
914
+ }
915
+
916
+ /*
917
+ Symptom: <p align=center>
918
+ Action: <p style="text-align: center">
919
+ */
920
+ static void TextAlign( TidyDocImpl* doc, Node* node )
921
+ {
922
+ AttVal *av, *prev;
923
+
924
+ prev = NULL;
925
+
926
+ for (av = node->attributes; av; av = av->next)
927
+ {
928
+ if (attrIsALIGN(av))
929
+ {
930
+ if (prev)
931
+ prev->next = av->next;
932
+ else
933
+ node->attributes = av->next;
934
+
935
+ if (av->value)
936
+ AddAlign( doc, node, av->value );
937
+
938
+ TY_(FreeAttribute)(doc, av);
939
+ break;
940
+ }
941
+
942
+ prev = av;
943
+ }
944
+ }
945
+
946
+ /*
947
+ Symptom: <table bgcolor="red">
948
+ Action: <table style="background-color: red">
949
+ */
950
+ static void TableBgColor( TidyDocImpl* doc, Node* node )
951
+ {
952
+ AttVal* attr;
953
+ tmbchar buf[256];
954
+
955
+ if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
956
+ {
957
+ TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
958
+ TY_(RemoveAttribute)( doc, node, attr );
959
+ TY_(AddStyleProperty)( doc, node, buf );
960
+ }
961
+ }
962
+
963
+ /*
964
+ The clean up rules use the pnode argument to return the
965
+ next node when the original node has been deleted
966
+ */
967
+
968
+ /*
969
+ Symptom: <dir> <li> where <li> is only child
970
+ Action: coerce <dir> <li> to <div> with indent.
971
+ */
972
+
973
+ static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
974
+ {
975
+ Node *child;
976
+
977
+ if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
978
+ {
979
+ child = node->content;
980
+
981
+ if (child == NULL)
982
+ return no;
983
+
984
+ /* check child has no peers */
985
+
986
+ if (child->next)
987
+ return no;
988
+
989
+ if ( !nodeIsLI(child) )
990
+ return no;
991
+
992
+ if ( !child->implicit )
993
+ return no;
994
+
995
+ /* coerce dir to div */
996
+ node->tag = TY_(LookupTagDef)( TidyTag_DIV );
997
+ TidyDocFree( doc, node->element );
998
+ node->element = TY_(tmbstrdup)(doc->allocator, "div");
999
+ TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
1000
+ StripOnlyChild( doc, node );
1001
+ return yes;
1002
+ }
1003
+
1004
+ return no;
1005
+ }
1006
+
1007
+ /*
1008
+ Symptom: <center>
1009
+ Action: replace <center> by <div style="text-align: center">
1010
+ */
1011
+
1012
+ static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1013
+ {
1014
+ if ( nodeIsCENTER(node) )
1015
+ {
1016
+ if ( cfgBool(doc, TidyDropFontTags) )
1017
+ {
1018
+ if (node->content)
1019
+ {
1020
+ Node *last = node->last;
1021
+ DiscardContainer( doc, node, pnode );
1022
+
1023
+ node = TY_(InferredTag)(doc, TidyTag_BR);
1024
+ TY_(InsertNodeAfterElement)(last, node);
1025
+ }
1026
+ else
1027
+ {
1028
+ Node *prev = node->prev, *next = node->next,
1029
+ *parent = node->parent;
1030
+ DiscardContainer( doc, node, pnode );
1031
+
1032
+ node = TY_(InferredTag)(doc, TidyTag_BR);
1033
+ if (next)
1034
+ TY_(InsertNodeBeforeElement)(next, node);
1035
+ else if (prev)
1036
+ TY_(InsertNodeAfterElement)(prev, node);
1037
+ else
1038
+ TY_(InsertNodeAtStart)(parent, node);
1039
+ }
1040
+
1041
+ return yes;
1042
+ }
1043
+
1044
+ RenameElem( doc, node, TidyTag_DIV );
1045
+ TY_(AddStyleProperty)( doc, node, "text-align: center" );
1046
+ return yes;
1047
+ }
1048
+
1049
+ return no;
1050
+ }
1051
+
1052
+ /* Copy child attributes to node. Duplicate attributes are overwritten.
1053
+ Unique attributes (such as ID) disable the action.
1054
+ Attributes style and class are not dealt with. A call to MergeStyles
1055
+ will do that.
1056
+ */
1057
+ static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1058
+ {
1059
+ AttVal *av1, *av2;
1060
+ TidyAttrId id;
1061
+
1062
+ /* Detect attributes that cannot be merged or overwritten. */
1063
+ if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1064
+ && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1065
+ return no;
1066
+
1067
+ /* Move child attributes to node. Attributes in node
1068
+ can be overwritten or merged. */
1069
+ for (av2 = child->attributes; av2; )
1070
+ {
1071
+ /* Dealt by MergeStyles. */
1072
+ if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1073
+ {
1074
+ av2 = av2->next;
1075
+ continue;
1076
+ }
1077
+ /* Avoid duplicates in node */
1078
+ if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1079
+ && (av1=TY_(AttrGetById)(node, id))!= NULL)
1080
+ TY_(RemoveAttribute)( doc, node, av1 );
1081
+
1082
+ /* Move attribute from child to node */
1083
+ TY_(DetachAttribute)( child, av2 );
1084
+ av1 = av2;
1085
+ av2 = av2->next;
1086
+ av1->next = NULL;
1087
+ TY_(InsertAttributeAtEnd)( node, av1 );
1088
+ }
1089
+
1090
+ return yes;
1091
+ }
1092
+
1093
+ /*
1094
+ Symptom <XX><XX>...</XX></XX>
1095
+ Action: merge the two XXs
1096
+
1097
+ For instance, this is useful after nested <dir>s used by Word
1098
+ for indenting have been converted to <div>s
1099
+
1100
+ If state is "no", no merging.
1101
+ If state is "yes", inner element is discarded. Only Style and Class
1102
+ attributes are merged using MergeStyles().
1103
+ If state is "auto", atttibutes are merged as described in CopyAttrs().
1104
+ Style and Class attributes are merged using MergeStyles().
1105
+ */
1106
+ static Bool MergeNestedElements( TidyDocImpl* doc,
1107
+ TidyTagId Id, TidyTriState state, Node *node,
1108
+ Node **ARG_UNUSED(pnode))
1109
+ {
1110
+ Node *child;
1111
+
1112
+ if ( state == TidyNoState
1113
+ || !TagIsId(node, Id) )
1114
+ return no;
1115
+
1116
+ child = node->content;
1117
+
1118
+ if ( child == NULL
1119
+ || child->next != NULL
1120
+ || !TagIsId(child, Id) )
1121
+ return no;
1122
+
1123
+ if ( state == TidyAutoState
1124
+ && CopyAttrs(doc, node, child) == no )
1125
+ return no;
1126
+
1127
+ MergeStyles( doc, node, child );
1128
+ StripOnlyChild( doc, node );
1129
+ return yes;
1130
+ }
1131
+
1132
+ /*
1133
+ Symptom: <ul><li><ul>...</ul></li></ul>
1134
+ Action: discard outer list
1135
+ */
1136
+
1137
+ static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1138
+ {
1139
+ Node *child, *list;
1140
+
1141
+ if ( nodeIsUL(node) || nodeIsOL(node) )
1142
+ {
1143
+ child = node->content;
1144
+
1145
+ if (child == NULL)
1146
+ return no;
1147
+
1148
+ /* check child has no peers */
1149
+
1150
+ if (child->next)
1151
+ return no;
1152
+
1153
+ list = child->content;
1154
+
1155
+ if (!list)
1156
+ return no;
1157
+
1158
+ if (list->tag != node->tag)
1159
+ return no;
1160
+
1161
+ /* check list has no peers */
1162
+ if (list->next)
1163
+ return no;
1164
+
1165
+ *pnode = list; /* Set node to resume iteration */
1166
+
1167
+ /* move inner list node into position of outer node */
1168
+ list->prev = node->prev;
1169
+ list->next = node->next;
1170
+ list->parent = node->parent;
1171
+ TY_(FixNodeLinks)(list);
1172
+
1173
+ /* get rid of outer ul and its li */
1174
+ child->content = NULL;
1175
+ TY_(FreeNode)( doc, child ); /* See test #427841. */
1176
+ child = NULL;
1177
+ node->content = NULL;
1178
+ node->next = NULL;
1179
+ TY_(FreeNode)( doc, node );
1180
+ node = NULL;
1181
+
1182
+ /*
1183
+ If prev node was a list the chances are this node
1184
+ should be appended to that list. Word has no way of
1185
+ recognizing nested lists and just uses indents
1186
+ */
1187
+
1188
+ if (list->prev)
1189
+ {
1190
+ if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1191
+ && list->prev->last )
1192
+ {
1193
+ node = list;
1194
+ list = node->prev;
1195
+
1196
+ child = list->last; /* <li> */
1197
+
1198
+ list->next = node->next;
1199
+ TY_(FixNodeLinks)(list);
1200
+
1201
+ node->parent = child;
1202
+ node->next = NULL;
1203
+ node->prev = child->last;
1204
+ TY_(FixNodeLinks)(node);
1205
+ CleanNode( doc, node );
1206
+ }
1207
+ }
1208
+
1209
+ return yes;
1210
+ }
1211
+
1212
+ return no;
1213
+ }
1214
+
1215
+ /* Find CSS equivalent in a SPAN element */
1216
+ static
1217
+ Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1218
+ {
1219
+ struct
1220
+ {
1221
+ TidyTagId id;
1222
+ ctmbstr CSSeq;
1223
+ Bool deprecated;
1224
+ }
1225
+ const CSS_SpanEq[] =
1226
+ {
1227
+ { TidyTag_B, "font-weight: bold", no },
1228
+ { TidyTag_I, "font-style: italic", no },
1229
+ { TidyTag_S, "text-decoration: line-through", yes},
1230
+ { TidyTag_STRIKE, "text-decoration: line-through", yes},
1231
+ { TidyTag_U, "text-decoration: underline", yes},
1232
+ { TidyTag_UNKNOWN, NULL, no }
1233
+ };
1234
+ uint i;
1235
+
1236
+ for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1237
+ if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1238
+ && TagIsId(node, CSS_SpanEq[i].id) )
1239
+ {
1240
+ *s = CSS_SpanEq[i].CSSeq;
1241
+ return yes;
1242
+ }
1243
+ return no;
1244
+ }
1245
+
1246
+ /* Necessary conditions to apply BlockStyle(). */
1247
+ static Bool CanApplyBlockStyle( Node *node )
1248
+ {
1249
+ if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1250
+ && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1251
+ {
1252
+ return yes;
1253
+ }
1254
+ return no;
1255
+ }
1256
+
1257
+ /*
1258
+ Symptom: the only child of a block-level element is a
1259
+ presentation element such as B, I or FONT
1260
+
1261
+ Action: add style "font-weight: bold" to the block and
1262
+ strip the <b> element, leaving its children.
1263
+
1264
+ example:
1265
+
1266
+ <p>
1267
+ <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1268
+ </p>
1269
+
1270
+ becomes:
1271
+
1272
+ <p style="font-weight: bold; font-family: Arial; font-size: 6">
1273
+ Draft Recommended Practice
1274
+ </p>
1275
+
1276
+ This code also replaces the align attribute by a style attribute.
1277
+ However, to avoid CSS problems with Navigator 4, this isn't done
1278
+ for the elements: caption, tr and table
1279
+ */
1280
+ static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1281
+ {
1282
+ Node *child;
1283
+ ctmbstr CSSeq;
1284
+
1285
+ /* check for bgcolor */
1286
+ if ( nodeIsTABLE(node)
1287
+ || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1288
+ TableBgColor( doc, node );
1289
+
1290
+ if (CanApplyBlockStyle(node))
1291
+ {
1292
+ /* check for align attribute */
1293
+ if ( !nodeIsCAPTION(node) )
1294
+ TextAlign( doc, node );
1295
+
1296
+ child = node->content;
1297
+ if (child == NULL)
1298
+ return no;
1299
+
1300
+ /* check child has no peers */
1301
+ if (child->next)
1302
+ return no;
1303
+
1304
+ if ( FindCSSSpanEq(child, &CSSeq, no) )
1305
+ {
1306
+ MergeStyles( doc, node, child );
1307
+ TY_(AddStyleProperty)( doc, node, CSSeq );
1308
+ StripOnlyChild( doc, node );
1309
+ return yes;
1310
+ }
1311
+ else if ( nodeIsFONT(child) )
1312
+ {
1313
+ MergeStyles( doc, node, child );
1314
+ AddFontStyles( doc, node, child->attributes );
1315
+ StripOnlyChild( doc, node );
1316
+ return yes;
1317
+ }
1318
+ }
1319
+
1320
+ return no;
1321
+ }
1322
+
1323
+ /* Necessary conditions to apply InlineStyle(). */
1324
+ static Bool CanApplyInlineStyle( Node *node )
1325
+ {
1326
+ return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1327
+ }
1328
+
1329
+ /* the only child of table cell or an inline element such as em */
1330
+ static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1331
+ {
1332
+ Node *child;
1333
+ ctmbstr CSSeq;
1334
+
1335
+ if ( CanApplyInlineStyle(node) )
1336
+ {
1337
+ child = node->content;
1338
+
1339
+ if (child == NULL)
1340
+ return no;
1341
+
1342
+ /* check child has no peers */
1343
+
1344
+ if (child->next)
1345
+ return no;
1346
+
1347
+ if ( FindCSSSpanEq(child, &CSSeq, no) )
1348
+ {
1349
+ MergeStyles( doc, node, child );
1350
+ TY_(AddStyleProperty)( doc, node, CSSeq );
1351
+ StripOnlyChild( doc, node );
1352
+ return yes;
1353
+ }
1354
+ else if ( nodeIsFONT(child) )
1355
+ {
1356
+ MergeStyles( doc, node, child );
1357
+ AddFontStyles( doc, node, child->attributes );
1358
+ StripOnlyChild( doc, node );
1359
+ return yes;
1360
+ }
1361
+ }
1362
+
1363
+ return no;
1364
+ }
1365
+
1366
+ /*
1367
+ Transform element to equivalent CSS
1368
+ */
1369
+ static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1370
+ Node **ARG_UNUSED(pnode) )
1371
+ {
1372
+ ctmbstr CSSeq;
1373
+
1374
+ /* if node is the only child of parent element then leave alone
1375
+ Do so only if BlockStyle may be succesful. */
1376
+ if ( node->parent->content == node && node->next == NULL &&
1377
+ (CanApplyBlockStyle(node->parent)
1378
+ || CanApplyInlineStyle(node->parent)) )
1379
+ return no;
1380
+
1381
+ if ( FindCSSSpanEq(node, &CSSeq, yes) )
1382
+ {
1383
+ RenameElem( doc, node, TidyTag_SPAN );
1384
+ TY_(AddStyleProperty)( doc, node, CSSeq );
1385
+ return yes;
1386
+ }
1387
+ return no;
1388
+ }
1389
+
1390
+ /*
1391
+ Replace font elements by span elements, deleting
1392
+ the font element's attributes and replacing them
1393
+ by a single style attribute.
1394
+ */
1395
+ static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1396
+ {
1397
+ AttVal *av, *style, *next;
1398
+
1399
+ if ( nodeIsFONT(node) )
1400
+ {
1401
+ if ( cfgBool(doc, TidyDropFontTags) )
1402
+ {
1403
+ DiscardContainer( doc, node, pnode );
1404
+ return yes;
1405
+ }
1406
+
1407
+ /* if node is the only child of parent element then leave alone
1408
+ Do so only if BlockStyle may be succesful. */
1409
+ if ( node->parent->content == node && node->next == NULL &&
1410
+ CanApplyBlockStyle(node->parent) )
1411
+ return no;
1412
+
1413
+ AddFontStyles( doc, node, node->attributes );
1414
+
1415
+ /* extract style attribute and free the rest */
1416
+ av = node->attributes;
1417
+ style = NULL;
1418
+
1419
+ while (av)
1420
+ {
1421
+ next = av->next;
1422
+
1423
+ if (attrIsSTYLE(av))
1424
+ {
1425
+ av->next = NULL;
1426
+ style = av;
1427
+ }
1428
+ else
1429
+ {
1430
+ TY_(FreeAttribute)( doc, av );
1431
+ }
1432
+ av = next;
1433
+ }
1434
+
1435
+ node->attributes = style;
1436
+ RenameElem( doc, node, TidyTag_SPAN );
1437
+ return yes;
1438
+ }
1439
+
1440
+ return no;
1441
+ }
1442
+
1443
+ /*
1444
+ Applies all matching rules to a node.
1445
+ */
1446
+ Node* CleanNode( TidyDocImpl* doc, Node *node )
1447
+ {
1448
+ Node *next = NULL;
1449
+ TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1450
+ TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1451
+
1452
+ for (next = node; TY_(nodeIsElement)(node); node = next)
1453
+ {
1454
+ if ( Dir2Div(doc, node, &next) )
1455
+ continue;
1456
+
1457
+ /* Special case: true result means
1458
+ ** that arg node and its parent no longer exist.
1459
+ ** So we must jump back up the CreateStyleProperties()
1460
+ ** call stack until we have a valid node reference.
1461
+ */
1462
+ if ( NestedList(doc, node, &next) )
1463
+ return next;
1464
+
1465
+ if ( Center2Div(doc, node, &next) )
1466
+ continue;
1467
+
1468
+ if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1469
+ continue;
1470
+
1471
+ if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1472
+ continue;
1473
+
1474
+ if ( BlockStyle(doc, node, &next) )
1475
+ continue;
1476
+
1477
+ if ( InlineStyle(doc, node, &next) )
1478
+ continue;
1479
+
1480
+ if ( InlineElementToCSS(doc, node, &next) )
1481
+ continue;
1482
+
1483
+ if ( Font2Span(doc, node, &next) )
1484
+ continue;
1485
+
1486
+ break;
1487
+ }
1488
+
1489
+ return next;
1490
+ }
1491
+
1492
+ /* Special case: if the current node is destroyed by
1493
+ ** CleanNode() lower in the tree, this node and its parent
1494
+ ** no longer exist. So we must jump back up the CleanTree()
1495
+ ** call stack until we have a valid node reference.
1496
+ */
1497
+
1498
+ static Node* CleanTree( TidyDocImpl* doc, Node *node )
1499
+ {
1500
+ if (node->content)
1501
+ {
1502
+ Node *child;
1503
+ for (child = node->content; child != NULL; child = child->next)
1504
+ {
1505
+ child = CleanTree( doc, child );
1506
+ if ( !child )
1507
+ break;
1508
+ }
1509
+ }
1510
+
1511
+ return CleanNode( doc, node );
1512
+ }
1513
+
1514
+ static void DefineStyleRules( TidyDocImpl* doc, Node *node )
1515
+ {
1516
+ Node *child;
1517
+
1518
+ if (node->content)
1519
+ {
1520
+ for (child = node->content;
1521
+ child != NULL; child = child->next)
1522
+ {
1523
+ DefineStyleRules( doc, child );
1524
+ }
1525
+ }
1526
+
1527
+ Style2Rule( doc, node );
1528
+ }
1529
+
1530
+ void TY_(CleanDocument)( TidyDocImpl* doc )
1531
+ {
1532
+ /* placeholder. CleanTree()/CleanNode() will not
1533
+ ** zap root element
1534
+ */
1535
+ CleanTree( doc, &doc->root );
1536
+
1537
+ if ( cfgBool(doc, TidyMakeClean) )
1538
+ {
1539
+ DefineStyleRules( doc, &doc->root );
1540
+ CreateStyleElement( doc );
1541
+ }
1542
+ }
1543
+
1544
+ /* simplifies <b><b> ... </b> ...</b> etc. */
1545
+ void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1546
+ {
1547
+ Node *next;
1548
+
1549
+ while (node)
1550
+ {
1551
+ next = node->next;
1552
+
1553
+ if ( (nodeIsB(node) || nodeIsI(node))
1554
+ && node->parent && node->parent->tag == node->tag)
1555
+ {
1556
+ /* strip redundant inner element */
1557
+ DiscardContainer( doc, node, &next );
1558
+ node = next;
1559
+ continue;
1560
+ }
1561
+
1562
+ if ( node->content )
1563
+ TY_(NestedEmphasis)( doc, node->content );
1564
+
1565
+ node = next;
1566
+ }
1567
+ }
1568
+
1569
+
1570
+
1571
+ /* replace i by em and b by strong */
1572
+ void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1573
+ {
1574
+ while (node)
1575
+ {
1576
+ if ( nodeIsI(node) )
1577
+ RenameElem( doc, node, TidyTag_EM );
1578
+ else if ( nodeIsB(node) )
1579
+ RenameElem( doc, node, TidyTag_STRONG );
1580
+
1581
+ if ( node->content )
1582
+ TY_(EmFromI)( doc, node->content );
1583
+
1584
+ node = node->next;
1585
+ }
1586
+ }
1587
+
1588
+ static Bool HasOneChild(Node *node)
1589
+ {
1590
+ return (node->content && node->content->next == NULL);
1591
+ }
1592
+
1593
+ /*
1594
+ Some people use dir or ul without an li
1595
+ to indent the content. The pattern to
1596
+ look for is a list with a single implicit
1597
+ li. This is recursively replaced by an
1598
+ implicit blockquote.
1599
+ */
1600
+ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1601
+ {
1602
+ while (node)
1603
+ {
1604
+ if (node->content)
1605
+ TY_(List2BQ)( doc, node->content );
1606
+
1607
+ if ( node->tag && node->tag->parser == TY_(ParseList) &&
1608
+ HasOneChild(node) && node->content->implicit )
1609
+ {
1610
+ StripOnlyChild( doc, node );
1611
+ RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1612
+ node->implicit = yes;
1613
+ }
1614
+
1615
+ node = node->next;
1616
+ }
1617
+ }
1618
+
1619
+
1620
+ /*
1621
+ Replace implicit blockquote by div with an indent
1622
+ taking care to reduce nested blockquotes to a single
1623
+ div with the indent set to match the nesting depth
1624
+ */
1625
+ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1626
+ {
1627
+ tmbchar indent_buf[ 32 ];
1628
+ uint indent;
1629
+
1630
+ while (node)
1631
+ {
1632
+ if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1633
+ {
1634
+ indent = 1;
1635
+
1636
+ while( HasOneChild(node) &&
1637
+ nodeIsBLOCKQUOTE(node->content) &&
1638
+ node->implicit)
1639
+ {
1640
+ ++indent;
1641
+ StripOnlyChild( doc, node );
1642
+ }
1643
+
1644
+ if (node->content)
1645
+ TY_(BQ2Div)( doc, node->content );
1646
+
1647
+ TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1648
+ 2*indent);
1649
+
1650
+ RenameElem( doc, node, TidyTag_DIV );
1651
+ TY_(AddStyleProperty)(doc, node, indent_buf );
1652
+ }
1653
+ else if (node->content)
1654
+ TY_(BQ2Div)( doc, node->content );
1655
+
1656
+ node = node->next;
1657
+ }
1658
+ }
1659
+
1660
+
1661
+ static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
1662
+ {
1663
+ Node *check;
1664
+
1665
+ for ( check=node; check; check = check->parent )
1666
+ {
1667
+ if ( nodeIsTD(check) )
1668
+ return check;
1669
+ }
1670
+ return NULL;
1671
+ }
1672
+
1673
+ /* node is <![if ...]> prune up to <![endif]> */
1674
+ static Node* PruneSection( TidyDocImpl* doc, Node *node )
1675
+ {
1676
+ Lexer* lexer = doc->lexer;
1677
+
1678
+ for (;;)
1679
+ {
1680
+ ctmbstr lexbuf = lexer->lexbuf + node->start;
1681
+ if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1682
+ {
1683
+ Node* cell = FindEnclosingCell( doc, node );
1684
+ if ( cell )
1685
+ {
1686
+ /* Need to put &nbsp; into cell so it doesn't look weird
1687
+ */
1688
+ Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1689
+ assert( (byte)'\240' == (byte)160 );
1690
+ TY_(InsertNodeBeforeElement)( node, nbsp );
1691
+ }
1692
+ }
1693
+
1694
+ /* discard node and returns next, unless it is a text node */
1695
+ if ( node->type == TextNode )
1696
+ node = node->next;
1697
+ else
1698
+ node = TY_(DiscardElement)( doc, node );
1699
+
1700
+ if (node == NULL)
1701
+ return NULL;
1702
+
1703
+ if (node->type == SectionTag)
1704
+ {
1705
+ if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1706
+ {
1707
+ node = PruneSection( doc, node );
1708
+ continue;
1709
+ }
1710
+
1711
+ if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1712
+ {
1713
+ node = TY_(DiscardElement)( doc, node );
1714
+ break;
1715
+ }
1716
+ }
1717
+ }
1718
+
1719
+ return node;
1720
+ }
1721
+
1722
+ void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1723
+ {
1724
+ Lexer* lexer = doc->lexer;
1725
+ while (node)
1726
+ {
1727
+ if (node->type == SectionTag)
1728
+ {
1729
+ /* prune up to matching endif */
1730
+ if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1731
+ (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1732
+ {
1733
+ node = PruneSection( doc, node );
1734
+ continue;
1735
+ }
1736
+
1737
+ /* discard others as well */
1738
+ node = TY_(DiscardElement)( doc, node );
1739
+ continue;
1740
+ }
1741
+
1742
+ if (node->content)
1743
+ TY_(DropSections)( doc, node->content );
1744
+
1745
+ node = node->next;
1746
+ }
1747
+ }
1748
+
1749
+ static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
1750
+ {
1751
+ AttVal *attr, *next, *prev = NULL;
1752
+
1753
+ for ( attr = node->attributes; attr; attr = next )
1754
+ {
1755
+ next = attr->next;
1756
+
1757
+ /* special check for class="Code" denoting pre text */
1758
+ /* Pass thru user defined styles as HTML class names */
1759
+ if (attrIsCLASS(attr))
1760
+ {
1761
+ if (AttrValueIs(attr, "Code") ||
1762
+ TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
1763
+ {
1764
+ prev = attr;
1765
+ continue;
1766
+ }
1767
+ }
1768
+
1769
+ if (attrIsCLASS(attr) ||
1770
+ attrIsSTYLE(attr) ||
1771
+ attrIsLANG(attr) ||
1772
+ ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
1773
+ (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
1774
+ (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
1775
+ {
1776
+ if (prev)
1777
+ prev->next = next;
1778
+ else
1779
+ node->attributes = next;
1780
+
1781
+ TY_(FreeAttribute)( doc, attr );
1782
+ }
1783
+ else
1784
+ prev = attr;
1785
+ }
1786
+ }
1787
+
1788
+ /* Word2000 uses span excessively, so we strip span out */
1789
+ static Node* StripSpan( TidyDocImpl* doc, Node* span )
1790
+ {
1791
+ Node *node, *prev = NULL, *content;
1792
+
1793
+ /*
1794
+ deal with span elements that have content
1795
+ by splicing the content in place of the span
1796
+ after having processed it
1797
+ */
1798
+
1799
+ TY_(CleanWord2000)( doc, span->content );
1800
+ content = span->content;
1801
+
1802
+ if (span->prev)
1803
+ prev = span->prev;
1804
+ else if (content)
1805
+ {
1806
+ node = content;
1807
+ content = content->next;
1808
+ TY_(RemoveNode)(node);
1809
+ TY_(InsertNodeBeforeElement)(span, node);
1810
+ prev = node;
1811
+ }
1812
+
1813
+ while (content)
1814
+ {
1815
+ node = content;
1816
+ content = content->next;
1817
+ TY_(RemoveNode)(node);
1818
+ TY_(InsertNodeAfterElement)(prev, node);
1819
+ prev = node;
1820
+ }
1821
+
1822
+ if (span->next == NULL)
1823
+ span->parent->last = prev;
1824
+
1825
+ node = span->next;
1826
+ span->content = NULL;
1827
+ TY_(DiscardElement)( doc, span );
1828
+ return node;
1829
+ }
1830
+
1831
+ /* map non-breaking spaces to regular spaces */
1832
+ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1833
+ {
1834
+ while ( node )
1835
+ {
1836
+ if ( node->content )
1837
+ TY_(NormalizeSpaces)( lexer, node->content );
1838
+
1839
+ if (TY_(nodeIsText)(node))
1840
+ {
1841
+ uint i, c;
1842
+ tmbstr p = lexer->lexbuf + node->start;
1843
+
1844
+ for (i = node->start; i < node->end; ++i)
1845
+ {
1846
+ c = (byte) lexer->lexbuf[i];
1847
+
1848
+ /* look for UTF-8 multibyte character */
1849
+ if ( c > 0x7F )
1850
+ i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1851
+
1852
+ if ( c == 160 )
1853
+ c = ' ';
1854
+
1855
+ p = TY_(PutUTF8)(p, c);
1856
+ }
1857
+ node->end = p - lexer->lexbuf;
1858
+ }
1859
+
1860
+ node = node->next;
1861
+ }
1862
+ }
1863
+
1864
+ /* used to hunt for hidden preformatted sections */
1865
+ static Bool NoMargins(Node *node)
1866
+ {
1867
+ AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1868
+
1869
+ if ( !AttrHasValue(attval) )
1870
+ return no;
1871
+
1872
+ /* search for substring "margin-top: 0" */
1873
+ if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1874
+ return no;
1875
+
1876
+ /* search for substring "margin-bottom: 0" */
1877
+ if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1878
+ return no;
1879
+
1880
+ return yes;
1881
+ }
1882
+
1883
+ /* does element have a single space as its content? */
1884
+ static Bool SingleSpace( Lexer* lexer, Node* node )
1885
+ {
1886
+ if ( node->content )
1887
+ {
1888
+ node = node->content;
1889
+
1890
+ if ( node->next != NULL )
1891
+ return no;
1892
+
1893
+ if ( node->type != TextNode )
1894
+ return no;
1895
+
1896
+ if ( (node->end - node->start) == 1 &&
1897
+ lexer->lexbuf[node->start] == ' ' )
1898
+ return yes;
1899
+
1900
+ if ( (node->end - node->start) == 2 )
1901
+ {
1902
+ uint c = 0;
1903
+ TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1904
+ if ( c == 160 )
1905
+ return yes;
1906
+ }
1907
+ }
1908
+
1909
+ return no;
1910
+ }
1911
+
1912
+ /*
1913
+ This is a major clean up to strip out all the extra stuff you get
1914
+ when you save as web page from Word 2000. It doesn't yet know what
1915
+ to do with VML tags, but these will appear as errors unless you
1916
+ declare them as new tags, such as o:p which needs to be declared
1917
+ as inline.
1918
+ */
1919
+ void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1920
+ {
1921
+ /* used to a list from a sequence of bulletted p's */
1922
+ Lexer* lexer = doc->lexer;
1923
+ Node* list = NULL;
1924
+
1925
+ while ( node )
1926
+ {
1927
+ /* get rid of Word's xmlns attributes */
1928
+ if ( nodeIsHTML(node) )
1929
+ {
1930
+ /* check that it's a Word 2000 document */
1931
+ if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
1932
+ !cfgBool(doc, TidyMakeBare) )
1933
+ return;
1934
+
1935
+ TY_(FreeAttrs)( doc, node );
1936
+ }
1937
+
1938
+ /* fix up preformatted sections by looking for a
1939
+ ** sequence of paragraphs with zero top/bottom margin
1940
+ */
1941
+ if ( nodeIsP(node) )
1942
+ {
1943
+ if (NoMargins(node))
1944
+ {
1945
+ Node *pre, *next;
1946
+ TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1947
+
1948
+ PurgeWord2000Attributes( doc, node );
1949
+
1950
+ if (node->content)
1951
+ TY_(CleanWord2000)( doc, node->content );
1952
+
1953
+ pre = node;
1954
+ node = node->next;
1955
+
1956
+ /* continue to strip p's */
1957
+
1958
+ while ( nodeIsP(node) && NoMargins(node) )
1959
+ {
1960
+ next = node->next;
1961
+ TY_(RemoveNode)(node);
1962
+ TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1963
+ TY_(InsertNodeAtEnd)(pre, node);
1964
+ StripSpan( doc, node );
1965
+ node = next;
1966
+ }
1967
+
1968
+ if (node == NULL)
1969
+ break;
1970
+ }
1971
+ }
1972
+
1973
+ if (node->tag && (node->tag->model & CM_BLOCK)
1974
+ && SingleSpace(lexer, node))
1975
+ {
1976
+ node = StripSpan( doc, node );
1977
+ continue;
1978
+ }
1979
+ /* discard Word's style verbiage */
1980
+ if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1981
+ node->type == CommentTag )
1982
+ {
1983
+ node = TY_(DiscardElement)( doc, node );
1984
+ continue;
1985
+ }
1986
+
1987
+ /* strip out all span and font tags Word scatters so liberally! */
1988
+ if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1989
+ {
1990
+ node = StripSpan( doc, node );
1991
+ continue;
1992
+ }
1993
+
1994
+ if ( nodeIsLINK(node) )
1995
+ {
1996
+ AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1997
+
1998
+ if (AttrValueIs(attr, "File-List"))
1999
+ {
2000
+ node = TY_(DiscardElement)( doc, node );
2001
+ continue;
2002
+ }
2003
+ }
2004
+
2005
+ /* discards <o:p> which encodes the paragraph mark */
2006
+ if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
2007
+ {
2008
+ Node* next;
2009
+ DiscardContainer( doc, node, &next );
2010
+ node = next;
2011
+ continue;
2012
+ }
2013
+
2014
+ /* discard empty paragraphs */
2015
+
2016
+ if ( node->content == NULL && nodeIsP(node) )
2017
+ {
2018
+ /* Use the existing function to ensure consistency */
2019
+ Node *next = TY_(TrimEmptyElement)( doc, node );
2020
+ node = next;
2021
+ continue;
2022
+ }
2023
+
2024
+ if ( nodeIsP(node) )
2025
+ {
2026
+ AttVal *attr, *atrStyle;
2027
+
2028
+ attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2029
+ atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2030
+ /*
2031
+ (JES) Sometimes Word marks a list item with the following hokie syntax
2032
+ <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2033
+ translate these into <li>
2034
+ */
2035
+ /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2036
+ /* map <p class="MsoListNumber"> to <ol>...</ol> */
2037
+ if ( AttrValueIs(attr, "MsoListBullet") ||
2038
+ AttrValueIs(attr, "MsoListNumber") ||
2039
+ AttrContains(atrStyle, "mso-list:") )
2040
+ {
2041
+ TidyTagId listType = TidyTag_UL;
2042
+ if (AttrValueIs(attr, "MsoListNumber"))
2043
+ listType = TidyTag_OL;
2044
+
2045
+ TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2046
+
2047
+ if ( !list || TagId(list) != listType )
2048
+ {
2049
+ const Dict* tag = TY_(LookupTagDef)( listType );
2050
+ list = TY_(InferredTag)(doc, tag->id);
2051
+ TY_(InsertNodeBeforeElement)(node, list);
2052
+ }
2053
+
2054
+ PurgeWord2000Attributes( doc, node );
2055
+
2056
+ if ( node->content )
2057
+ TY_(CleanWord2000)( doc, node->content );
2058
+
2059
+ /* remove node and append to contents of list */
2060
+ TY_(RemoveNode)(node);
2061
+ TY_(InsertNodeAtEnd)(list, node);
2062
+ node = list;
2063
+ }
2064
+ /* map sequence of <p class="Code"> to <pre>...</pre> */
2065
+ else if (AttrValueIs(attr, "Code"))
2066
+ {
2067
+ Node *br = TY_(NewLineNode)(lexer);
2068
+ TY_(NormalizeSpaces)(lexer, node->content);
2069
+
2070
+ if ( !list || TagId(list) != TidyTag_PRE )
2071
+ {
2072
+ list = TY_(InferredTag)(doc, TidyTag_PRE);
2073
+ TY_(InsertNodeBeforeElement)(node, list);
2074
+ }
2075
+
2076
+ /* remove node and append to contents of list */
2077
+ TY_(RemoveNode)(node);
2078
+ TY_(InsertNodeAtEnd)(list, node);
2079
+ StripSpan( doc, node );
2080
+ TY_(InsertNodeAtEnd)(list, br);
2081
+ node = list->next;
2082
+ }
2083
+ else
2084
+ list = NULL;
2085
+ }
2086
+ else
2087
+ list = NULL;
2088
+
2089
+ if (!node)
2090
+ return;
2091
+
2092
+ /* strip out style and class attributes */
2093
+ if (TY_(nodeIsElement)(node))
2094
+ PurgeWord2000Attributes( doc, node );
2095
+
2096
+ if (node->content)
2097
+ TY_(CleanWord2000)( doc, node->content );
2098
+
2099
+ node = node->next;
2100
+ }
2101
+ }
2102
+
2103
+ Bool TY_(IsWord2000)( TidyDocImpl* doc )
2104
+ {
2105
+ AttVal *attval;
2106
+ Node *node, *head;
2107
+ Node *html = TY_(FindHTML)( doc );
2108
+
2109
+ if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2110
+ return yes;
2111
+
2112
+ /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2113
+ head = TY_(FindHEAD)( doc );
2114
+
2115
+ if (head)
2116
+ {
2117
+ for (node = head->content; node; node = node->next)
2118
+ {
2119
+ if ( !nodeIsMETA(node) )
2120
+ continue;
2121
+
2122
+ attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2123
+
2124
+ if ( !AttrValueIs(attval, "generator") )
2125
+ continue;
2126
+
2127
+ attval = TY_(AttrGetById)( node, TidyAttr_CONTENT );
2128
+
2129
+ if ( AttrContains(attval, "Microsoft") )
2130
+ return yes;
2131
+ }
2132
+ }
2133
+
2134
+ return no;
2135
+ }
2136
+
2137
+ /* where appropriate move object elements from head to body */
2138
+ void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2139
+ {
2140
+ Node *node, *next, *head = NULL, *body = NULL;
2141
+
2142
+ if (!html)
2143
+ return;
2144
+
2145
+ for ( node = html->content; node != NULL; node = node->next )
2146
+ {
2147
+ if ( nodeIsHEAD(node) )
2148
+ head = node;
2149
+
2150
+ if ( nodeIsBODY(node) )
2151
+ body = node;
2152
+ }
2153
+
2154
+ if ( head != NULL && body != NULL )
2155
+ {
2156
+ for (node = head->content; node != NULL; node = next)
2157
+ {
2158
+ next = node->next;
2159
+
2160
+ if ( nodeIsOBJECT(node) )
2161
+ {
2162
+ Node *child;
2163
+ Bool bump = no;
2164
+
2165
+ for (child = node->content; child != NULL; child = child->next)
2166
+ {
2167
+ /* bump to body unless content is param */
2168
+ if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2169
+ || !nodeIsPARAM(child) )
2170
+ {
2171
+ bump = yes;
2172
+ break;
2173
+ }
2174
+ }
2175
+
2176
+ if ( bump )
2177
+ {
2178
+ TY_(RemoveNode)( node );
2179
+ TY_(InsertNodeAtStart)( body, node );
2180
+ }
2181
+ }
2182
+ }
2183
+ }
2184
+ }
2185
+
2186
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/* Moves trailing <br> elements out of block-level elements (the first
   attribute-less one is discarded instead), working bottom-up, and
   then trims the parent if that left it empty. */
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *pChild, *pLast;
    Bool bBRDeleted = no;

    if ( pParent == NULL )
        return;

    /* First, check the status of All My Children */
    for ( pChild = pParent->content; pChild != NULL; )
    {
        /* The node may get trimmed, so save the next pointer, if any */
        Node *pFollowing = pChild->next;
        FixBrakes( pDoc, pChild );
        pChild = pFollowing;
    }

    if ( !nodeCMIsBlock( pParent ) )
        return;

    /* As long as my last child is a <br />, move it to my last peer */
    pLast = pParent->last;
    while ( pLast != NULL && nodeIsBR( pLast ) )
    {
        if ( pLast->attributes == NULL && bBRDeleted == no )
        {
            TY_(DiscardElement)( pDoc, pLast );
            bBRDeleted = yes;
        }
        else
        {
            TY_(RemoveNode)( pLast );
            TY_(InsertNodeAfterElement)( pParent, pLast );
        }
        pLast = pParent->last;
    }

    TY_(TrimEmptyElement)( pDoc, pParent );
}
#endif
2229
+
2230
+ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2231
+ {
2232
+ Node *pNode;
2233
+ StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2234
+ tmbstr s, pszBegin, pszEnd;
2235
+ ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2236
+
2237
+ if (!enc)
2238
+ return;
2239
+
2240
+ if (!nodeIsHEAD(head))
2241
+ head = TY_(FindHEAD)(doc);
2242
+
2243
+ if (!head)
2244
+ return;
2245
+
2246
+ /* Find any <meta http-equiv='Content-Type' content='...' /> */
2247
+ for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2248
+ {
2249
+ AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2250
+ AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2251
+
2252
+ if ( !nodeIsMETA(pNode) || !metaContent ||
2253
+ !AttrValueIs(httpEquiv, "Content-Type") )
2254
+ continue;
2255
+
2256
+ pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2257
+ while (pszBegin && *pszBegin)
2258
+ {
2259
+ while (isspace( *pszBegin ))
2260
+ pszBegin++;
2261
+ pszEnd = pszBegin;
2262
+ while ('\0' != *pszEnd && ';' != *pszEnd)
2263
+ pszEnd++;
2264
+ if (';' == *pszEnd )
2265
+ *(pszEnd++) = '\0';
2266
+ if (pszEnd > pszBegin)
2267
+ {
2268
+ prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2269
+ prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2270
+ prop->value = NULL;
2271
+ prop->next = NULL;
2272
+
2273
+ if (NULL != pLastProp)
2274
+ pLastProp->next = prop;
2275
+ else
2276
+ pFirstProp = prop;
2277
+
2278
+ pLastProp = prop;
2279
+ pszBegin = pszEnd;
2280
+ }
2281
+ }
2282
+ TidyDocFree( doc, s );
2283
+
2284
+ /* find the charset property */
2285
+ for (prop = pFirstProp; NULL != prop; prop = prop->next)
2286
+ {
2287
+ if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2288
+ continue;
2289
+
2290
+ TidyDocFree( doc, prop->name );
2291
+ prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2292
+ TY_(tmbstrcpy)(prop->name, "charset=");
2293
+ TY_(tmbstrcpy)(prop->name+8, enc);
2294
+ s = CreatePropString( doc, pFirstProp );
2295
+ TidyDocFree( doc, metaContent->value );
2296
+ metaContent->value = s;
2297
+ break;
2298
+ }
2299
+ /* #718127, prevent memory leakage */
2300
+ FreeStyleProps(doc, pFirstProp);
2301
+ pFirstProp = NULL;
2302
+ pLastProp = NULL;
2303
+ }
2304
+ }
2305
+
2306
+ void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2307
+ {
2308
+ Node* next;
2309
+
2310
+ while (node)
2311
+ {
2312
+ next = node->next;
2313
+
2314
+ if (node->type == CommentTag)
2315
+ {
2316
+ TY_(RemoveNode)(node);
2317
+ TY_(FreeNode)(doc, node);
2318
+ node = next;
2319
+ continue;
2320
+ }
2321
+
2322
+ if (node->content)
2323
+ TY_(DropComments)(doc, node->content);
2324
+
2325
+ node = next;
2326
+ }
2327
+ }
2328
+
2329
+ void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2330
+ {
2331
+ Node* next;
2332
+
2333
+ while (node)
2334
+ {
2335
+ next = node->next;
2336
+
2337
+ if (nodeIsFONT(node))
2338
+ {
2339
+ DiscardContainer(doc, node, &next);
2340
+ node = next;
2341
+ continue;
2342
+ }
2343
+
2344
+ if (node->content)
2345
+ TY_(DropFontElements)(doc, node->content, &next);
2346
+
2347
+ node = next;
2348
+ }
2349
+ }
2350
+
2351
+ void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2352
+ {
2353
+ Node* next;
2354
+
2355
+ while (node)
2356
+ {
2357
+ next = node->next;
2358
+
2359
+ if (nodeIsWBR(node))
2360
+ {
2361
+ Node* text;
2362
+ text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2363
+ TY_(InsertNodeAfterElement)(node, text);
2364
+ TY_(RemoveNode)(node);
2365
+ TY_(FreeNode)(doc, node);
2366
+ node = next;
2367
+ continue;
2368
+ }
2369
+
2370
+ if (node->content)
2371
+ TY_(WbrToSpace)(doc, node->content);
2372
+
2373
+ node = next;
2374
+ }
2375
+ }
2376
+
2377
+ /*
2378
+ Filters from Word and PowerPoint often use smart
2379
+ quotes resulting in character codes between 128
2380
+ and 159. Unfortunately, the corresponding HTML 4.0
2381
+ entities for these are not widely supported. The
2382
+ following converts dashes and quotation marks to
2383
+ the nearest ASCII equivalent. My thanks to
2384
+ Andrzej Novosiolov for his help with this code.
2385
+
2386
+ Note: The old code in the pretty printer applied
2387
+ this to all node types and attribute values while
2388
+ this routine applies it only to text nodes. First,
2389
+ Microsoft Office products rarely put the relevant
2390
+ characters into these tokens, second support for
2391
+ them is much better now and last but not least, it
2392
+ can be harmful to replace these characters since
2393
+ US-ASCII quote marks are often used as syntax
2394
+ characters, a simple
2395
+
2396
+ <a onmouseover="alert('&#x2018;')">...</a>
2397
+
2398
+ would be broken if the U+2018 is replaced by "'".
2399
+ The old code would neither take care whether the
2400
+ quote mark is already used as delimiter,
2401
+
2402
+ <p title='&#x2018;'>...</p>
2403
+
2404
+ got
2405
+
2406
+ <p title='''>...</p>
2407
+
2408
+ Since browser support is much better nowadays and
2409
+ high-quality typography is better than ASCII it'd
2410
+ be probably a good idea to drop the feature...
2411
+ */
2412
+ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2413
+ {
2414
+ Node* next;
2415
+ Lexer* lexer = doc->lexer;
2416
+
2417
+ while (node)
2418
+ {
2419
+ next = node->next;
2420
+
2421
+ if (TY_(nodeIsText)(node))
2422
+ {
2423
+ uint i, c;
2424
+ tmbstr p = lexer->lexbuf + node->start;
2425
+
2426
+ for (i = node->start; i < node->end; ++i)
2427
+ {
2428
+ c = (unsigned char) lexer->lexbuf[i];
2429
+
2430
+ if (c > 0x7F)
2431
+ i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2432
+
2433
+ if (c >= 0x2013 && c <= 0x201E)
2434
+ {
2435
+ switch (c)
2436
+ {
2437
+ case 0x2013: /* en dash */
2438
+ case 0x2014: /* em dash */
2439
+ c = '-';
2440
+ break;
2441
+ case 0x2018: /* left single quotation mark */
2442
+ case 0x2019: /* right single quotation mark */
2443
+ case 0x201A: /* single low-9 quotation mark */
2444
+ c = '\'';
2445
+ break;
2446
+ case 0x201C: /* left double quotation mark */
2447
+ case 0x201D: /* right double quotation mark */
2448
+ case 0x201E: /* double low-9 quotation mark */
2449
+ c = '"';
2450
+ break;
2451
+ }
2452
+ }
2453
+
2454
+ p = TY_(PutUTF8)(p, c);
2455
+ }
2456
+
2457
+ node->end = p - lexer->lexbuf;
2458
+ }
2459
+
2460
+ if (node->content)
2461
+ TY_(DowngradeTypography)(doc, node->content);
2462
+
2463
+ node = next;
2464
+ }
2465
+ }
2466
+
2467
+ void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2468
+ {
2469
+ Node* next;
2470
+
2471
+ while (node)
2472
+ {
2473
+ next = node->next;
2474
+
2475
+ if (node->tag && node->tag->parser == TY_(ParsePre))
2476
+ {
2477
+ TY_(NormalizeSpaces)(doc->lexer, node->content);
2478
+ node = next;
2479
+ continue;
2480
+ }
2481
+
2482
+ if (node->content)
2483
+ TY_(ReplacePreformattedSpaces)(doc, node->content);
2484
+
2485
+ node = next;
2486
+ }
2487
+ }
2488
+
2489
+ void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2490
+ {
2491
+ Node* next;
2492
+
2493
+ while (node)
2494
+ {
2495
+ next = node->next;
2496
+
2497
+ if (node->type == CDATATag)
2498
+ node->type = TextNode;
2499
+
2500
+ if (node->content)
2501
+ TY_(ConvertCDATANodes)(doc, node->content);
2502
+
2503
+ node = next;
2504
+ }
2505
+ }
2506
+
2507
+ /*
2508
+ FixLanguageInformation ensures that the document contains (only)
2509
+ the attributes for language information desired by the output
2510
+ document type. For example, for XHTML 1.0 documents both
2511
+ 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2512
+ is desired and for HTML 4.01 only 'lang' is desired.
2513
+ */
2514
+ void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2515
+ {
2516
+ Node* next;
2517
+
2518
+ while (node)
2519
+ {
2520
+ next = node->next;
2521
+
2522
+ /* todo: report modifications made here to the report system */
2523
+
2524
+ if (TY_(nodeIsElement)(node))
2525
+ {
2526
+ AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2527
+ AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2528
+
2529
+ if (lang && xmlLang)
2530
+ {
2531
+ /*
2532
+ todo: check whether both attributes are in sync,
2533
+ here or elsewhere, where elsewhere is probably
2534
+ preferable.
2535
+ AD - March 2005: not mandatory according the standards.
2536
+ */
2537
+ }
2538
+ else if (lang && wantXmlLang)
2539
+ {
2540
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2541
+ & doc->lexer->versionEmitted)
2542
+ TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2543
+ }
2544
+ else if (xmlLang && wantLang)
2545
+ {
2546
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2547
+ & doc->lexer->versionEmitted)
2548
+ TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2549
+ }
2550
+
2551
+ if (lang && !wantLang)
2552
+ TY_(RemoveAttribute)(doc, node, lang);
2553
+
2554
+ if (xmlLang && !wantXmlLang)
2555
+ TY_(RemoveAttribute)(doc, node, xmlLang);
2556
+ }
2557
+
2558
+ if (node->content)
2559
+ TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2560
+
2561
+ node = next;
2562
+ }
2563
+ }
2564
+
2565
+ /*
2566
+ Set/fix/remove <html xmlns='...'>
2567
+ */
2568
+ void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2569
+ {
2570
+ Node* html = TY_(FindHTML)(doc);
2571
+ AttVal* xmlns;
2572
+
2573
+ if (!html)
2574
+ return;
2575
+
2576
+ xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2577
+
2578
+ if (wantXmlns)
2579
+ {
2580
+ if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2581
+ TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2582
+ }
2583
+ else if (xmlns)
2584
+ {
2585
+ TY_(RemoveAttribute)(doc, html, xmlns);
2586
+ }
2587
+ }
2588
+
2589
+ /*
2590
+ ...
2591
+ */
2592
+ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2593
+ {
2594
+ Node* next;
2595
+
2596
+ while (node)
2597
+ {
2598
+ next = node->next;
2599
+
2600
+ if (TY_(IsAnchorElement)(doc, node))
2601
+ {
2602
+ AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2603
+ AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2604
+ Bool hadName = name!=NULL;
2605
+ Bool hadId = id!=NULL;
2606
+ Bool IdEmitted = no;
2607
+ Bool NameEmitted = no;
2608
+
2609
+ /* todo: how are empty name/id attributes handled? */
2610
+
2611
+ if (name && id)
2612
+ {
2613
+ Bool NameHasValue = AttrHasValue(name);
2614
+ Bool IdHasValue = AttrHasValue(id);
2615
+ if ( (NameHasValue != IdHasValue) ||
2616
+ (NameHasValue && IdHasValue &&
2617
+ TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2618
+ TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2619
+ }
2620
+ else if (name && wantId)
2621
+ {
2622
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2623
+ & doc->lexer->versionEmitted)
2624
+ {
2625
+ if (TY_(IsValidHTMLID)(name->value))
2626
+ {
2627
+ TY_(RepairAttrValue)(doc, node, "id", name->value);
2628
+ IdEmitted = yes;
2629
+ }
2630
+ else
2631
+ TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2632
+ }
2633
+ }
2634
+ else if (id && wantName)
2635
+ {
2636
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2637
+ & doc->lexer->versionEmitted)
2638
+ {
2639
+ /* todo: do not assume id is valid */
2640
+ TY_(RepairAttrValue)(doc, node, "name", id->value);
2641
+ NameEmitted = yes;
2642
+ }
2643
+ }
2644
+
2645
+ if (id && !wantId
2646
+ /* make sure that Name has been emitted if requested */
2647
+ && (hadName || !wantName || NameEmitted) )
2648
+ TY_(RemoveAttribute)(doc, node, id);
2649
+
2650
+ if (name && !wantName
2651
+ /* make sure that Id has been emitted if requested */
2652
+ && (hadId || !wantId || IdEmitted) )
2653
+ TY_(RemoveAttribute)(doc, node, name);
2654
+
2655
+ if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2656
+ TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2657
+ TY_(RemoveAnchorByNode)(doc, node);
2658
+ }
2659
+
2660
+ if (node->content)
2661
+ TY_(FixAnchors)(doc, node->content, wantName, wantId);
2662
+
2663
+ node = next;
2664
+ }
2665
+ }
2666
+
2667
+ /*
2668
+ * local variables:
2669
+ * mode: c
2670
+ * indent-tabs-mode: nil
2671
+ * c-basic-offset: 4
2672
+ * eval: (c-set-offset 'substatement-open 0)
2673
+ * end:
2674
+ */