tidy-ext 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/clean.c ADDED
@@ -0,0 +1,2674 @@
1
+ /*
2
+ clean.c -- clean up misuse of presentation markup
3
+
4
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5
+ See tidy.h for the copyright notice.
6
+
7
+ CVS Info :
8
+
9
+ $Author: arnaud02 $
10
+ $Date: 2008/10/14 12:18:10 $
11
+ $Revision: 1.111 $
12
+
13
+ Filters from other formats such as Microsoft Word
14
+ often make excessive use of presentation markup such
15
+ as font tags, B, I, and the align attribute. By applying
16
+ a set of production rules, it is straightforward to
17
+ transform this to use CSS.
18
+
19
+ Some rules replace some of the children of an element by
20
+ style properties on the element, e.g.
21
+
22
+ <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
23
+
24
+ Such rules are applied to the element's content and then
25
+ to the element itself until no more of the rules apply.
26
+ Having applied all the rules to an element, it will have
27
+ a style attribute with one or more properties.
28
+
29
+ Other rules strip the element they apply to, replacing
30
+ it by style properties on the contents, e.g.
31
+
32
+ <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
33
+
34
+ These rules are applied to an element before processing
35
+ its content and replace the current element by the first
36
+ element in the exposed content.
37
+
38
+ After applying both sets of rules, you can replace the
39
+ style attribute by a class value and style rule in the
40
+ document head. To support this, an association of styles
41
+ and class names is built.
42
+
43
+ A naive approach is to rely on string matching to test
44
+ when two property lists are the same. A better approach
45
+ would be to first sort the properties before matching.
46
+
47
+ */
48
+
49
+ #include <stdio.h>
50
+ #include <stdlib.h>
51
+ #include <string.h>
52
+
53
+ #include "tidy-int.h"
54
+ #include "clean.h"
55
+ #include "lexer.h"
56
+ #include "parser.h"
57
+ #include "attrs.h"
58
+ #include "message.h"
59
+ #include "tmbstr.h"
60
+ #include "utf8.h"
61
+
62
+ static Node* CleanNode( TidyDocImpl* doc, Node *node );
63
+
64
+ static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
65
+ {
66
+ const Dict* dict = TY_(LookupTagDef)( tid );
67
+ TidyDocFree( doc, node->element );
68
+ node->element = TY_(tmbstrdup)( doc->allocator, dict->name );
69
+ node->tag = dict;
70
+ }
71
+
72
+ static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
73
+ {
74
+ StyleProp *next;
75
+
76
+ while (props)
77
+ {
78
+ next = props->next;
79
+ TidyDocFree(doc, props->name);
80
+ TidyDocFree(doc, props->value);
81
+ TidyDocFree(doc, props);
82
+ props = next;
83
+ }
84
+ }
85
+
86
+ static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
87
+ {
88
+ StyleProp *first, *prev, *prop;
89
+ int cmp;
90
+
91
+ prev = NULL;
92
+ first = props;
93
+
94
+ while (props)
95
+ {
96
+ cmp = TY_(tmbstrcmp)(props->name, name);
97
+
98
+ if (cmp == 0)
99
+ {
100
+ /* this property is already defined, ignore new value */
101
+ return first;
102
+ }
103
+
104
+ if (cmp > 0)
105
+ {
106
+ /* insert before this */
107
+
108
+ prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
109
+ prop->name = TY_(tmbstrdup)(doc->allocator, name);
110
+ prop->value = TY_(tmbstrdup)(doc->allocator, value);
111
+ prop->next = props;
112
+
113
+ if (prev)
114
+ prev->next = prop;
115
+ else
116
+ first = prop;
117
+
118
+ return first;
119
+ }
120
+
121
+ prev = props;
122
+ props = props->next;
123
+ }
124
+
125
+ prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
126
+ prop->name = TY_(tmbstrdup)(doc->allocator, name);
127
+ prop->value = TY_(tmbstrdup)(doc->allocator, value);
128
+ prop->next = NULL;
129
+
130
+ if (prev)
131
+ prev->next = prop;
132
+ else
133
+ first = prop;
134
+
135
+ return first;
136
+ }
137
+
138
+ /*
139
+ Create sorted linked list of properties from style string
140
+ It temporarily places nulls in place of ':' and ';' to
141
+ delimit the strings for the property name and value.
142
+ Some systems don't allow you to NULL literal strings,
143
+ so to avoid this, a copy is made first.
144
+ */
145
+ static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
146
+ {
147
+ tmbstr name, value = NULL, name_end, value_end, line;
148
+ Bool more;
149
+
150
+ line = TY_(tmbstrdup)(doc->allocator, style);
151
+ name = line;
152
+
153
+ while (*name)
154
+ {
155
+ while (*name == ' ')
156
+ ++name;
157
+
158
+ name_end = name;
159
+
160
+ while (*name_end)
161
+ {
162
+ if (*name_end == ':')
163
+ {
164
+ value = name_end + 1;
165
+ break;
166
+ }
167
+
168
+ ++name_end;
169
+ }
170
+
171
+ if (*name_end != ':')
172
+ break;
173
+
174
+ while ( value && *value == ' ')
175
+ ++value;
176
+
177
+ value_end = value;
178
+ more = no;
179
+
180
+ while (*value_end)
181
+ {
182
+ if (*value_end == ';')
183
+ {
184
+ more = yes;
185
+ break;
186
+ }
187
+
188
+ ++value_end;
189
+ }
190
+
191
+ *name_end = '\0';
192
+ *value_end = '\0';
193
+
194
+ prop = InsertProperty(doc, prop, name, value);
195
+ *name_end = ':';
196
+
197
+ if (more)
198
+ {
199
+ *value_end = ';';
200
+ name = value_end + 1;
201
+ continue;
202
+ }
203
+
204
+ break;
205
+ }
206
+
207
+ TidyDocFree(doc, line); /* free temporary copy */
208
+ return prop;
209
+ }
210
+
211
+ static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
212
+ {
213
+ tmbstr style, p, s;
214
+ uint len;
215
+ StyleProp *prop;
216
+
217
+ /* compute length */
218
+
219
+ for (len = 0, prop = props; prop; prop = prop->next)
220
+ {
221
+ len += TY_(tmbstrlen)(prop->name) + 2;
222
+ if (prop->value)
223
+ len += TY_(tmbstrlen)(prop->value) + 2;
224
+ }
225
+
226
+ style = (tmbstr) TidyDocAlloc(doc, len+1);
227
+ style[0] = '\0';
228
+
229
+ for (p = style, prop = props; prop; prop = prop->next)
230
+ {
231
+ s = prop->name;
232
+
233
+ while((*p++ = *s++))
234
+ continue;
235
+
236
+ if (prop->value)
237
+ {
238
+ *--p = ':';
239
+ *++p = ' ';
240
+ ++p;
241
+
242
+ s = prop->value;
243
+ while((*p++ = *s++))
244
+ continue;
245
+ }
246
+ if (prop->next == NULL)
247
+ break;
248
+
249
+ *--p = ';';
250
+ *++p = ' ';
251
+ ++p;
252
+ }
253
+
254
+ return style;
255
+ }
256
+
257
+ /*
258
+ create string with merged properties
259
+ static tmbstr AddProperty( ctmbstr style, ctmbstr property )
260
+ {
261
+ tmbstr line;
262
+ StyleProp *prop;
263
+
264
+ prop = CreateProps(doc, NULL, style);
265
+ prop = CreateProps(doc, prop, property);
266
+ line = CreatePropString(doc, prop);
267
+ FreeStyleProps(doc, prop);
268
+ return line;
269
+ }
270
+ */
271
+
272
+ void TY_(FreeStyles)( TidyDocImpl* doc )
273
+ {
274
+ Lexer* lexer = doc->lexer;
275
+ if ( lexer )
276
+ {
277
+ TagStyle *style, *next;
278
+ for ( style = lexer->styles; style; style = next )
279
+ {
280
+ next = style->next;
281
+ TidyDocFree( doc, style->tag );
282
+ TidyDocFree( doc, style->tag_class );
283
+ TidyDocFree( doc, style->properties );
284
+ TidyDocFree( doc, style );
285
+ }
286
+ }
287
+ }
288
+
289
+ static tmbstr GensymClass( TidyDocImpl* doc )
290
+ {
291
+ tmbchar buf[512]; /* CSSPrefix is limited to 256 characters */
292
+ ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
293
+ if ( pfx == NULL || *pfx == 0 )
294
+ pfx = "c";
295
+
296
+ TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
297
+ return TY_(tmbstrdup)(doc->allocator, buf);
298
+ }
299
+
300
+ static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
301
+ {
302
+ Lexer* lexer = doc->lexer;
303
+ TagStyle* style;
304
+
305
+ for (style = lexer->styles; style; style=style->next)
306
+ {
307
+ if (TY_(tmbstrcmp)(style->tag, tag) == 0 &&
308
+ TY_(tmbstrcmp)(style->properties, properties) == 0)
309
+ return style->tag_class;
310
+ }
311
+
312
+ style = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
313
+ style->tag = TY_(tmbstrdup)(doc->allocator, tag);
314
+ style->tag_class = GensymClass( doc );
315
+ style->properties = TY_(tmbstrdup)( doc->allocator, properties );
316
+ style->next = lexer->styles;
317
+ lexer->styles = style;
318
+ return style->tag_class;
319
+ }
320
+
321
+ /*
322
+ Add class="foo" to node
323
+ */
324
+ static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
325
+ {
326
+ AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
327
+
328
+ /*
329
+ if there already is a class attribute
330
+ then append class name after a space.
331
+ */
332
+ if (classattr)
333
+ TY_(AppendToClassAttr)( doc, classattr, classname );
334
+ else /* create new class attribute */
335
+ TY_(AddAttribute)( doc, node, "class", classname );
336
+ }
337
+
338
+ void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
339
+ {
340
+ ctmbstr classname;
341
+
342
+ classname = FindStyle( doc, node->element, stylevalue );
343
+ AddClass( doc, node, classname);
344
+ }
345
+
346
+ /*
347
+ Find style attribute in node, and replace it
348
+ by corresponding class attribute. Search for
349
+ class in style dictionary otherwise gensym
350
+ new class and add to dictionary.
351
+
352
+ Assumes that node doesn't have a class attribute
353
+ */
354
+ static void Style2Rule( TidyDocImpl* doc, Node *node)
355
+ {
356
+ AttVal *styleattr, *classattr;
357
+ ctmbstr classname;
358
+
359
+ styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
360
+
361
+ if (styleattr)
362
+ {
363
+ /* fix for http://tidy.sf.net/bug/850215 */
364
+ if (!styleattr->value)
365
+ {
366
+ TY_(RemoveAttribute)(doc, node, styleattr);
367
+ return;
368
+ }
369
+
370
+ classname = FindStyle( doc, node->element, styleattr->value );
371
+ classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
372
+
373
+ /*
374
+ if there already is a class attribute
375
+ then append class name after an underscore
376
+ */
377
+ if (classattr)
378
+ {
379
+ TY_(AppendToClassAttr)( doc, classattr, classname );
380
+ TY_(RemoveAttribute)( doc, node, styleattr );
381
+ }
382
+ else /* reuse style attribute for class attribute */
383
+ {
384
+ TidyDocFree(doc, styleattr->attribute);
385
+ TidyDocFree(doc, styleattr->value);
386
+ styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
387
+ styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
388
+ }
389
+ }
390
+ }
391
+
392
+ static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
393
+ {
394
+ if ( selector && color )
395
+ {
396
+ TY_(AddStringLiteral)(lexer, selector);
397
+ TY_(AddStringLiteral)(lexer, " { color: ");
398
+ TY_(AddStringLiteral)(lexer, color);
399
+ TY_(AddStringLiteral)(lexer, " }\n");
400
+ }
401
+ }
402
+
403
+ /*
404
+ move presentation attribs from body to style element
405
+
406
+ background="foo" -> body { background-image: url(foo) }
407
+ bgcolor="foo" -> body { background-color: foo }
408
+ text="foo" -> body { color: foo }
409
+ link="foo" -> :link { color: foo }
410
+ vlink="foo" -> :visited { color: foo }
411
+ alink="foo" -> :active { color: foo }
412
+ */
413
+ static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
414
+ {
415
+ Lexer* lexer = doc->lexer;
416
+ tmbstr bgurl = NULL;
417
+ tmbstr bgcolor = NULL;
418
+ tmbstr color = NULL;
419
+ AttVal* attr;
420
+
421
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
422
+ {
423
+ bgurl = attr->value;
424
+ attr->value = NULL;
425
+ TY_(RemoveAttribute)( doc, body, attr );
426
+ }
427
+
428
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
429
+ {
430
+ bgcolor = attr->value;
431
+ attr->value = NULL;
432
+ TY_(RemoveAttribute)( doc, body, attr );
433
+ }
434
+
435
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
436
+ {
437
+ color = attr->value;
438
+ attr->value = NULL;
439
+ TY_(RemoveAttribute)( doc, body, attr );
440
+ }
441
+
442
+ if ( bgurl || bgcolor || color )
443
+ {
444
+ TY_(AddStringLiteral)(lexer, " body {\n");
445
+ if (bgurl)
446
+ {
447
+ TY_(AddStringLiteral)(lexer, " background-image: url(");
448
+ TY_(AddStringLiteral)(lexer, bgurl);
449
+ TY_(AddStringLiteral)(lexer, ");\n");
450
+ TidyDocFree(doc, bgurl);
451
+ }
452
+ if (bgcolor)
453
+ {
454
+ TY_(AddStringLiteral)(lexer, " background-color: ");
455
+ TY_(AddStringLiteral)(lexer, bgcolor);
456
+ TY_(AddStringLiteral)(lexer, ";\n");
457
+ TidyDocFree(doc, bgcolor);
458
+ }
459
+ if (color)
460
+ {
461
+ TY_(AddStringLiteral)(lexer, " color: ");
462
+ TY_(AddStringLiteral)(lexer, color);
463
+ TY_(AddStringLiteral)(lexer, ";\n");
464
+ TidyDocFree(doc, color);
465
+ }
466
+
467
+ TY_(AddStringLiteral)(lexer, " }\n");
468
+ }
469
+
470
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
471
+ {
472
+ AddColorRule(lexer, " :link", attr->value);
473
+ TY_(RemoveAttribute)( doc, body, attr );
474
+ }
475
+
476
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
477
+ {
478
+ AddColorRule(lexer, " :visited", attr->value);
479
+ TY_(RemoveAttribute)( doc, body, attr );
480
+ }
481
+
482
+ if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
483
+ {
484
+ AddColorRule(lexer, " :active", attr->value);
485
+ TY_(RemoveAttribute)( doc, body, attr );
486
+ }
487
+ }
488
+
489
+ static Bool NiceBody( TidyDocImpl* doc )
490
+ {
491
+ Node* node = TY_(FindBody)(doc);
492
+ if (node)
493
+ {
494
+ if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
495
+ TY_(AttrGetById)(node, TidyAttr_BGCOLOR) ||
496
+ TY_(AttrGetById)(node, TidyAttr_TEXT) ||
497
+ TY_(AttrGetById)(node, TidyAttr_LINK) ||
498
+ TY_(AttrGetById)(node, TidyAttr_VLINK) ||
499
+ TY_(AttrGetById)(node, TidyAttr_ALINK))
500
+ {
501
+ doc->badLayout |= USING_BODY;
502
+ return no;
503
+ }
504
+ }
505
+
506
+ return yes;
507
+ }
508
+
509
+ /* create style element using rules from dictionary */
510
+ static void CreateStyleElement( TidyDocImpl* doc )
511
+ {
512
+ Lexer* lexer = doc->lexer;
513
+ Node *node, *head, *body;
514
+ TagStyle *style;
515
+ AttVal *av;
516
+
517
+ if ( lexer->styles == NULL && NiceBody(doc) )
518
+ return;
519
+
520
+ node = TY_(NewNode)( doc->allocator, lexer );
521
+ node->type = StartTag;
522
+ node->implicit = yes;
523
+ node->element = TY_(tmbstrdup)(doc->allocator, "style");
524
+ TY_(FindTag)( doc, node );
525
+
526
+ /* insert type attribute */
527
+ av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
528
+ TY_(InsertAttributeAtStart)( node, av );
529
+
530
+ body = TY_(FindBody)( doc );
531
+ lexer->txtstart = lexer->lexsize;
532
+ if ( body )
533
+ CleanBodyAttrs( doc, body );
534
+
535
+ for (style = lexer->styles; style; style = style->next)
536
+ {
537
+ TY_(AddCharToLexer)(lexer, ' ');
538
+ TY_(AddStringLiteral)(lexer, style->tag);
539
+ TY_(AddCharToLexer)(lexer, '.');
540
+ TY_(AddStringLiteral)(lexer, style->tag_class);
541
+ TY_(AddCharToLexer)(lexer, ' ');
542
+ TY_(AddCharToLexer)(lexer, '{');
543
+ TY_(AddStringLiteral)(lexer, style->properties);
544
+ TY_(AddCharToLexer)(lexer, '}');
545
+ TY_(AddCharToLexer)(lexer, '\n');
546
+ }
547
+
548
+ lexer->txtend = lexer->lexsize;
549
+
550
+ TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
551
+
552
+ /*
553
+ now insert style element into document head
554
+
555
+ doc is root node. search its children for html node
556
+ the head node should be first child of html node
557
+ */
558
+ if ( NULL != (head = TY_(FindHEAD)( doc )) )
559
+ TY_(InsertNodeAtEnd)( head, node );
560
+ }
561
+
562
+
563
+ /* ensure bidirectional links are consistent */
564
+ void TY_(FixNodeLinks)(Node *node)
565
+ {
566
+ Node *child;
567
+
568
+ if (node->prev)
569
+ node->prev->next = node;
570
+ else
571
+ node->parent->content = node;
572
+
573
+ if (node->next)
574
+ node->next->prev = node;
575
+ else
576
+ node->parent->last = node;
577
+
578
+ for (child = node->content; child; child = child->next)
579
+ child->parent = node;
580
+ }
581
+
582
+ /*
583
+ used to strip child of node when
584
+ the node has one and only one child
585
+ */
586
+ static void StripOnlyChild(TidyDocImpl* doc, Node *node)
587
+ {
588
+ Node *child;
589
+
590
+ child = node->content;
591
+ node->content = child->content;
592
+ node->last = child->last;
593
+ child->content = NULL;
594
+ TY_(FreeNode)(doc, child);
595
+
596
+ for (child = node->content; child; child = child->next)
597
+ child->parent = node;
598
+ }
599
+
600
+ /*
601
+ used to strip font start and end tags.
602
+ Extricate "element", replace it by its content and delete it.
603
+ */
604
+ static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
605
+ {
606
+ if (element->content)
607
+ {
608
+ Node *node, *parent = element->parent;
609
+
610
+ element->last->next = element->next;
611
+
612
+ if (element->next)
613
+ {
614
+ element->next->prev = element->last;
615
+ }
616
+ else
617
+ parent->last = element->last;
618
+
619
+ if (element->prev)
620
+ {
621
+ element->content->prev = element->prev;
622
+ element->prev->next = element->content;
623
+ }
624
+ else
625
+ parent->content = element->content;
626
+
627
+ for (node = element->content; node; node = node->next)
628
+ node->parent = parent;
629
+
630
+ *pnode = element->content;
631
+
632
+ element->next = element->content = NULL;
633
+ TY_(FreeNode)(doc, element);
634
+ }
635
+ else
636
+ {
637
+ *pnode = TY_(DiscardElement)(doc, element);
638
+ }
639
+ }
640
+
641
+ /*
642
+ Create new string that consists of the
643
+ combined style properties in s1 and s2
644
+
645
+ To merge property lists, we build a linked
646
+ list of property/values and insert properties
647
+ into the list in order, merging values for
648
+ the same property name.
649
+ */
650
+ static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
651
+ {
652
+ tmbstr s;
653
+ StyleProp *prop;
654
+
655
+ prop = CreateProps(doc, NULL, s1);
656
+ prop = CreateProps(doc, prop, s2);
657
+ s = CreatePropString(doc, prop);
658
+ FreeStyleProps(doc, prop);
659
+ return s;
660
+ }
661
+
662
+ /*
663
+ Add style property to element, creating style
664
+ attribute as needed and adding ; delimiter
665
+ */
666
+ void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
667
+ {
668
+ AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
669
+
670
+ /* if style attribute already exists then insert property */
671
+
672
+ if ( av )
673
+ {
674
+ if (av->value != NULL)
675
+ {
676
+ tmbstr s = MergeProperties( doc, av->value, property );
677
+ TidyDocFree( doc, av->value );
678
+ av->value = s;
679
+ }
680
+ else
681
+ {
682
+ av->value = TY_(tmbstrdup)( doc->allocator, property );
683
+ }
684
+ }
685
+ else /* else create new style attribute */
686
+ {
687
+ av = TY_(NewAttributeEx)( doc, "style", property, '"' );
688
+ TY_(InsertAttributeAtStart)( node, av );
689
+ }
690
+ }
691
+
692
+ static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
693
+ {
694
+ AttVal *av;
695
+ tmbstr s1, s2, names;
696
+
697
+ for (s2 = NULL, av = child->attributes; av; av = av->next)
698
+ {
699
+ if (attrIsCLASS(av))
700
+ {
701
+ s2 = av->value;
702
+ break;
703
+ }
704
+ }
705
+
706
+ for (s1 = NULL, av = node->attributes; av; av = av->next)
707
+ {
708
+ if (attrIsCLASS(av))
709
+ {
710
+ s1 = av->value;
711
+ break;
712
+ }
713
+ }
714
+
715
+ if (s1)
716
+ {
717
+ if (s2) /* merge class names from both */
718
+ {
719
+ uint l1, l2;
720
+ l1 = TY_(tmbstrlen)(s1);
721
+ l2 = TY_(tmbstrlen)(s2);
722
+ names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
723
+ TY_(tmbstrcpy)(names, s1);
724
+ names[l1] = ' ';
725
+ TY_(tmbstrcpy)(names+l1+1, s2);
726
+ TidyDocFree(doc, av->value);
727
+ av->value = names;
728
+ }
729
+ }
730
+ else if (s2) /* copy class names from child */
731
+ {
732
+ av = TY_(NewAttributeEx)( doc, "class", s2, '"' );
733
+ TY_(InsertAttributeAtStart)( node, av );
734
+ }
735
+ }
736
+
737
+ static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
738
+ {
739
+ AttVal *av;
740
+ tmbstr s1, s2, style;
741
+
742
+ /*
743
+ the child may have a class attribute used
744
+ for attaching styles, if so the class name
745
+ needs to be copied to node's class
746
+ */
747
+ MergeClasses(doc, node, child);
748
+
749
+ for (s2 = NULL, av = child->attributes; av; av = av->next)
750
+ {
751
+ if (attrIsSTYLE(av))
752
+ {
753
+ s2 = av->value;
754
+ break;
755
+ }
756
+ }
757
+
758
+ for (s1 = NULL, av = node->attributes; av; av = av->next)
759
+ {
760
+ if (attrIsSTYLE(av))
761
+ {
762
+ s1 = av->value;
763
+ break;
764
+ }
765
+ }
766
+
767
+ if (s1)
768
+ {
769
+ if (s2) /* merge styles from both */
770
+ {
771
+ style = MergeProperties(doc, s1, s2);
772
+ TidyDocFree(doc, av->value);
773
+ av->value = style;
774
+ }
775
+ }
776
+ else if (s2) /* copy style of child */
777
+ {
778
+ av = TY_(NewAttributeEx)( doc, "style", s2, '"' );
779
+ TY_(InsertAttributeAtStart)( node, av );
780
+ }
781
+ }
782
+
783
+ static ctmbstr FontSize2Name(ctmbstr size)
784
+ {
785
+ static const ctmbstr sizes[7] =
786
+ {
787
+ "60%", "70%", "80%", NULL,
788
+ "120%", "150%", "200%"
789
+ };
790
+
791
+ /* increment of 0.8 */
792
+ static const ctmbstr minussizes[] =
793
+ {
794
+ "100%", "80%", "64%", "51%",
795
+ "40%", "32%", "26%"
796
+ };
797
+
798
+ /* increment of 1.2 */
799
+ static const ctmbstr plussizes[] =
800
+ {
801
+ "100%", "120%", "144%", "172%",
802
+ "207%", "248%", "298%"
803
+ };
804
+
805
+ if (size[0] == '\0')
806
+ return NULL;
807
+
808
+ if ('0' <= size[0] && size[0] <= '6')
809
+ {
810
+ int n = size[0] - '0';
811
+ return sizes[n];
812
+ }
813
+
814
+ if (size[0] == '-')
815
+ {
816
+ if ('0' <= size[1] && size[1] <= '6')
817
+ {
818
+ int n = size[1] - '0';
819
+ return minussizes[n];
820
+ }
821
+ return "smaller"; /*"70%"; */
822
+ }
823
+
824
+ if ('0' <= size[1] && size[1] <= '6')
825
+ {
826
+ int n = size[1] - '0';
827
+ return plussizes[n];
828
+ }
829
+
830
+ return "larger"; /* "140%" */
831
+ }
832
+
833
+ static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
834
+ {
835
+ tmbchar buf[256];
836
+ TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face );
837
+ TY_(AddStyleProperty)( doc, node, buf );
838
+ }
839
+
840
+ static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
841
+ {
842
+ ctmbstr value = NULL;
843
+
844
+ if (nodeIsP(node))
845
+ {
846
+ if (TY_(tmbstrcmp)(size, "6") == 0)
847
+ value = "h1";
848
+ else if (TY_(tmbstrcmp)(size, "5") == 0)
849
+ value = "h2";
850
+ else if (TY_(tmbstrcmp)(size, "4") == 0)
851
+ value = "h3";
852
+
853
+ if (value)
854
+ {
855
+ TidyDocFree(doc, node->element);
856
+ node->element = TY_(tmbstrdup)(doc->allocator, value);
857
+ TY_(FindTag)(doc, node);
858
+ return;
859
+ }
860
+ }
861
+
862
+ value = FontSize2Name(size);
863
+
864
+ if (value)
865
+ {
866
+ tmbchar buf[64];
867
+ TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value);
868
+ TY_(AddStyleProperty)( doc, node, buf );
869
+ }
870
+ }
871
+
872
+ static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
873
+ {
874
+ tmbchar buf[128];
875
+ TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color);
876
+ TY_(AddStyleProperty)( doc, node, buf );
877
+ }
878
+
879
+ /* force alignment value to lower case */
880
+ static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
881
+ {
882
+ uint i;
883
+ tmbchar buf[128];
884
+
885
+ TY_(tmbstrcpy)( buf, "text-align: " );
886
+ for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
887
+ {
888
+ if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
889
+ break;
890
+ }
891
+ buf[i] = '\0';
892
+ TY_(AddStyleProperty)( doc, node, buf );
893
+ }
894
+
895
+ /*
896
+ add style properties to node corresponding to
897
+ the font face, size and color attributes
898
+ */
899
+ static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
900
+ {
901
+ while (av)
902
+ {
903
+ if (AttrHasValue(av))
904
+ {
905
+ if (attrIsFACE(av))
906
+ AddFontFace( doc, node, av->value );
907
+ else if (attrIsSIZE(av))
908
+ AddFontSize( doc, node, av->value );
909
+ else if (attrIsCOLOR(av))
910
+ AddFontColor( doc, node, av->value );
911
+ }
912
+ av = av->next;
913
+ }
914
+ }
915
+
916
+ /*
917
+ Symptom: <p align=center>
918
+ Action: <p style="text-align: center">
919
+ */
920
+ static void TextAlign( TidyDocImpl* doc, Node* node )
921
+ {
922
+ AttVal *av, *prev;
923
+
924
+ prev = NULL;
925
+
926
+ for (av = node->attributes; av; av = av->next)
927
+ {
928
+ if (attrIsALIGN(av))
929
+ {
930
+ if (prev)
931
+ prev->next = av->next;
932
+ else
933
+ node->attributes = av->next;
934
+
935
+ if (av->value)
936
+ AddAlign( doc, node, av->value );
937
+
938
+ TY_(FreeAttribute)(doc, av);
939
+ break;
940
+ }
941
+
942
+ prev = av;
943
+ }
944
+ }
945
+
946
+ /*
947
+ Symptom: <table bgcolor="red">
948
+ Action: <table style="background-color: red">
949
+ */
950
+ static void TableBgColor( TidyDocImpl* doc, Node* node )
951
+ {
952
+ AttVal* attr;
953
+ tmbchar buf[256];
954
+
955
+ if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
956
+ {
957
+ TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
958
+ TY_(RemoveAttribute)( doc, node, attr );
959
+ TY_(AddStyleProperty)( doc, node, buf );
960
+ }
961
+ }
962
+
963
+ /*
964
+ The clean up rules use the pnode argument to return the
965
+ next node when the original node has been deleted
966
+ */
967
+
968
+ /*
969
+ Symptom: <dir> <li> where <li> is only child
970
+ Action: coerce <dir> <li> to <div> with indent.
971
+ */
972
+
973
+ static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
974
+ {
975
+ Node *child;
976
+
977
+ if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
978
+ {
979
+ child = node->content;
980
+
981
+ if (child == NULL)
982
+ return no;
983
+
984
+ /* check child has no peers */
985
+
986
+ if (child->next)
987
+ return no;
988
+
989
+ if ( !nodeIsLI(child) )
990
+ return no;
991
+
992
+ if ( !child->implicit )
993
+ return no;
994
+
995
+ /* coerce dir to div */
996
+ node->tag = TY_(LookupTagDef)( TidyTag_DIV );
997
+ TidyDocFree( doc, node->element );
998
+ node->element = TY_(tmbstrdup)(doc->allocator, "div");
999
+ TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
1000
+ StripOnlyChild( doc, node );
1001
+ return yes;
1002
+ }
1003
+
1004
+ return no;
1005
+ }
1006
+
1007
+ /*
1008
+ Symptom: <center>
1009
+ Action: replace <center> by <div style="text-align: center">
1010
+ */
1011
+
1012
+ static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1013
+ {
1014
+ if ( nodeIsCENTER(node) )
1015
+ {
1016
+ if ( cfgBool(doc, TidyDropFontTags) )
1017
+ {
1018
+ if (node->content)
1019
+ {
1020
+ Node *last = node->last;
1021
+ DiscardContainer( doc, node, pnode );
1022
+
1023
+ node = TY_(InferredTag)(doc, TidyTag_BR);
1024
+ TY_(InsertNodeAfterElement)(last, node);
1025
+ }
1026
+ else
1027
+ {
1028
+ Node *prev = node->prev, *next = node->next,
1029
+ *parent = node->parent;
1030
+ DiscardContainer( doc, node, pnode );
1031
+
1032
+ node = TY_(InferredTag)(doc, TidyTag_BR);
1033
+ if (next)
1034
+ TY_(InsertNodeBeforeElement)(next, node);
1035
+ else if (prev)
1036
+ TY_(InsertNodeAfterElement)(prev, node);
1037
+ else
1038
+ TY_(InsertNodeAtStart)(parent, node);
1039
+ }
1040
+
1041
+ return yes;
1042
+ }
1043
+
1044
+ RenameElem( doc, node, TidyTag_DIV );
1045
+ TY_(AddStyleProperty)( doc, node, "text-align: center" );
1046
+ return yes;
1047
+ }
1048
+
1049
+ return no;
1050
+ }
1051
+
1052
+ /* Copy child attributes to node. Duplicate attributes are overwritten.
1053
+ Unique attributes (such as ID) disable the action.
1054
+ Attributes style and class are not dealt with. A call to MergeStyles
1055
+ will do that.
1056
+ */
1057
+ static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1058
+ {
1059
+ AttVal *av1, *av2;
1060
+ TidyAttrId id;
1061
+
1062
+ /* Detect attributes that cannot be merged or overwritten. */
1063
+ if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1064
+ && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1065
+ return no;
1066
+
1067
+ /* Move child attributes to node. Attributes in node
1068
+ can be overwritten or merged. */
1069
+ for (av2 = child->attributes; av2; )
1070
+ {
1071
+ /* Dealt by MergeStyles. */
1072
+ if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1073
+ {
1074
+ av2 = av2->next;
1075
+ continue;
1076
+ }
1077
+ /* Avoid duplicates in node */
1078
+ if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1079
+ && (av1=TY_(AttrGetById)(node, id))!= NULL)
1080
+ TY_(RemoveAttribute)( doc, node, av1 );
1081
+
1082
+ /* Move attribute from child to node */
1083
+ TY_(DetachAttribute)( child, av2 );
1084
+ av1 = av2;
1085
+ av2 = av2->next;
1086
+ av1->next = NULL;
1087
+ TY_(InsertAttributeAtEnd)( node, av1 );
1088
+ }
1089
+
1090
+ return yes;
1091
+ }
1092
+
1093
+ /*
1094
+ Symptom <XX><XX>...</XX></XX>
1095
+ Action: merge the two XXs
1096
+
1097
+ For instance, this is useful after nested <dir>s used by Word
1098
+ for indenting have been converted to <div>s
1099
+
1100
+ If state is "no", no merging.
1101
+ If state is "yes", inner element is discarded. Only Style and Class
1102
+ attributes are merged using MergeStyles().
1103
+ If state is "auto", atttibutes are merged as described in CopyAttrs().
1104
+ Style and Class attributes are merged using MergeStyles().
1105
+ */
1106
+ static Bool MergeNestedElements( TidyDocImpl* doc,
1107
+ TidyTagId Id, TidyTriState state, Node *node,
1108
+ Node **ARG_UNUSED(pnode))
1109
+ {
1110
+ Node *child;
1111
+
1112
+ if ( state == TidyNoState
1113
+ || !TagIsId(node, Id) )
1114
+ return no;
1115
+
1116
+ child = node->content;
1117
+
1118
+ if ( child == NULL
1119
+ || child->next != NULL
1120
+ || !TagIsId(child, Id) )
1121
+ return no;
1122
+
1123
+ if ( state == TidyAutoState
1124
+ && CopyAttrs(doc, node, child) == no )
1125
+ return no;
1126
+
1127
+ MergeStyles( doc, node, child );
1128
+ StripOnlyChild( doc, node );
1129
+ return yes;
1130
+ }
1131
+
1132
+ /*
1133
+ Symptom: <ul><li><ul>...</ul></li></ul>
1134
+ Action: discard outer list
1135
+ */
1136
+
1137
+ static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1138
+ {
1139
+ Node *child, *list;
1140
+
1141
+ if ( nodeIsUL(node) || nodeIsOL(node) )
1142
+ {
1143
+ child = node->content;
1144
+
1145
+ if (child == NULL)
1146
+ return no;
1147
+
1148
+ /* check child has no peers */
1149
+
1150
+ if (child->next)
1151
+ return no;
1152
+
1153
+ list = child->content;
1154
+
1155
+ if (!list)
1156
+ return no;
1157
+
1158
+ if (list->tag != node->tag)
1159
+ return no;
1160
+
1161
+ /* check list has no peers */
1162
+ if (list->next)
1163
+ return no;
1164
+
1165
+ *pnode = list; /* Set node to resume iteration */
1166
+
1167
+ /* move inner list node into position of outer node */
1168
+ list->prev = node->prev;
1169
+ list->next = node->next;
1170
+ list->parent = node->parent;
1171
+ TY_(FixNodeLinks)(list);
1172
+
1173
+ /* get rid of outer ul and its li */
1174
+ child->content = NULL;
1175
+ TY_(FreeNode)( doc, child ); /* See test #427841. */
1176
+ child = NULL;
1177
+ node->content = NULL;
1178
+ node->next = NULL;
1179
+ TY_(FreeNode)( doc, node );
1180
+ node = NULL;
1181
+
1182
+ /*
1183
+ If prev node was a list the chances are this node
1184
+ should be appended to that list. Word has no way of
1185
+ recognizing nested lists and just uses indents
1186
+ */
1187
+
1188
+ if (list->prev)
1189
+ {
1190
+ if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1191
+ && list->prev->last )
1192
+ {
1193
+ node = list;
1194
+ list = node->prev;
1195
+
1196
+ child = list->last; /* <li> */
1197
+
1198
+ list->next = node->next;
1199
+ TY_(FixNodeLinks)(list);
1200
+
1201
+ node->parent = child;
1202
+ node->next = NULL;
1203
+ node->prev = child->last;
1204
+ TY_(FixNodeLinks)(node);
1205
+ CleanNode( doc, node );
1206
+ }
1207
+ }
1208
+
1209
+ return yes;
1210
+ }
1211
+
1212
+ return no;
1213
+ }
1214
+
1215
+ /* Find CSS equivalent in a SPAN element */
1216
+ static
1217
+ Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1218
+ {
1219
+ struct
1220
+ {
1221
+ TidyTagId id;
1222
+ ctmbstr CSSeq;
1223
+ Bool deprecated;
1224
+ }
1225
+ const CSS_SpanEq[] =
1226
+ {
1227
+ { TidyTag_B, "font-weight: bold", no },
1228
+ { TidyTag_I, "font-style: italic", no },
1229
+ { TidyTag_S, "text-decoration: line-through", yes},
1230
+ { TidyTag_STRIKE, "text-decoration: line-through", yes},
1231
+ { TidyTag_U, "text-decoration: underline", yes},
1232
+ { TidyTag_UNKNOWN, NULL, no }
1233
+ };
1234
+ uint i;
1235
+
1236
+ for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1237
+ if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1238
+ && TagIsId(node, CSS_SpanEq[i].id) )
1239
+ {
1240
+ *s = CSS_SpanEq[i].CSSeq;
1241
+ return yes;
1242
+ }
1243
+ return no;
1244
+ }
1245
+
1246
+ /* Necessary conditions to apply BlockStyle(). */
1247
+ static Bool CanApplyBlockStyle( Node *node )
1248
+ {
1249
+ if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1250
+ && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1251
+ {
1252
+ return yes;
1253
+ }
1254
+ return no;
1255
+ }
1256
+
1257
+ /*
1258
+ Symptom: the only child of a block-level element is a
1259
+ presentation element such as B, I or FONT
1260
+
1261
+ Action: add style "font-weight: bold" to the block and
1262
+ strip the <b> element, leaving its children.
1263
+
1264
+ example:
1265
+
1266
+ <p>
1267
+ <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1268
+ </p>
1269
+
1270
+ becomes:
1271
+
1272
+ <p style="font-weight: bold; font-family: Arial; font-size: 6">
1273
+ Draft Recommended Practice
1274
+ </p>
1275
+
1276
+ This code also replaces the align attribute by a style attribute.
1277
+ However, to avoid CSS problems with Navigator 4, this isn't done
1278
+ for the elements: caption, tr and table
1279
+ */
1280
+ static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1281
+ {
1282
+ Node *child;
1283
+ ctmbstr CSSeq;
1284
+
1285
+ /* check for bgcolor */
1286
+ if ( nodeIsTABLE(node)
1287
+ || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1288
+ TableBgColor( doc, node );
1289
+
1290
+ if (CanApplyBlockStyle(node))
1291
+ {
1292
+ /* check for align attribute */
1293
+ if ( !nodeIsCAPTION(node) )
1294
+ TextAlign( doc, node );
1295
+
1296
+ child = node->content;
1297
+ if (child == NULL)
1298
+ return no;
1299
+
1300
+ /* check child has no peers */
1301
+ if (child->next)
1302
+ return no;
1303
+
1304
+ if ( FindCSSSpanEq(child, &CSSeq, no) )
1305
+ {
1306
+ MergeStyles( doc, node, child );
1307
+ TY_(AddStyleProperty)( doc, node, CSSeq );
1308
+ StripOnlyChild( doc, node );
1309
+ return yes;
1310
+ }
1311
+ else if ( nodeIsFONT(child) )
1312
+ {
1313
+ MergeStyles( doc, node, child );
1314
+ AddFontStyles( doc, node, child->attributes );
1315
+ StripOnlyChild( doc, node );
1316
+ return yes;
1317
+ }
1318
+ }
1319
+
1320
+ return no;
1321
+ }
1322
+
1323
+ /* Necessary conditions to apply InlineStyle(). */
1324
+ static Bool CanApplyInlineStyle( Node *node )
1325
+ {
1326
+ return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1327
+ }
1328
+
1329
+ /* the only child of table cell or an inline element such as em */
1330
+ static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1331
+ {
1332
+ Node *child;
1333
+ ctmbstr CSSeq;
1334
+
1335
+ if ( CanApplyInlineStyle(node) )
1336
+ {
1337
+ child = node->content;
1338
+
1339
+ if (child == NULL)
1340
+ return no;
1341
+
1342
+ /* check child has no peers */
1343
+
1344
+ if (child->next)
1345
+ return no;
1346
+
1347
+ if ( FindCSSSpanEq(child, &CSSeq, no) )
1348
+ {
1349
+ MergeStyles( doc, node, child );
1350
+ TY_(AddStyleProperty)( doc, node, CSSeq );
1351
+ StripOnlyChild( doc, node );
1352
+ return yes;
1353
+ }
1354
+ else if ( nodeIsFONT(child) )
1355
+ {
1356
+ MergeStyles( doc, node, child );
1357
+ AddFontStyles( doc, node, child->attributes );
1358
+ StripOnlyChild( doc, node );
1359
+ return yes;
1360
+ }
1361
+ }
1362
+
1363
+ return no;
1364
+ }
1365
+
1366
+ /*
1367
+ Transform element to equivalent CSS
1368
+ */
1369
+ static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1370
+ Node **ARG_UNUSED(pnode) )
1371
+ {
1372
+ ctmbstr CSSeq;
1373
+
1374
+ /* if node is the only child of parent element then leave alone
1375
+ Do so only if BlockStyle may be succesful. */
1376
+ if ( node->parent->content == node && node->next == NULL &&
1377
+ (CanApplyBlockStyle(node->parent)
1378
+ || CanApplyInlineStyle(node->parent)) )
1379
+ return no;
1380
+
1381
+ if ( FindCSSSpanEq(node, &CSSeq, yes) )
1382
+ {
1383
+ RenameElem( doc, node, TidyTag_SPAN );
1384
+ TY_(AddStyleProperty)( doc, node, CSSeq );
1385
+ return yes;
1386
+ }
1387
+ return no;
1388
+ }
1389
+
1390
+ /*
1391
+ Replace font elements by span elements, deleting
1392
+ the font element's attributes and replacing them
1393
+ by a single style attribute.
1394
+ */
1395
+ static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1396
+ {
1397
+ AttVal *av, *style, *next;
1398
+
1399
+ if ( nodeIsFONT(node) )
1400
+ {
1401
+ if ( cfgBool(doc, TidyDropFontTags) )
1402
+ {
1403
+ DiscardContainer( doc, node, pnode );
1404
+ return yes;
1405
+ }
1406
+
1407
+ /* if node is the only child of parent element then leave alone
1408
+ Do so only if BlockStyle may be succesful. */
1409
+ if ( node->parent->content == node && node->next == NULL &&
1410
+ CanApplyBlockStyle(node->parent) )
1411
+ return no;
1412
+
1413
+ AddFontStyles( doc, node, node->attributes );
1414
+
1415
+ /* extract style attribute and free the rest */
1416
+ av = node->attributes;
1417
+ style = NULL;
1418
+
1419
+ while (av)
1420
+ {
1421
+ next = av->next;
1422
+
1423
+ if (attrIsSTYLE(av))
1424
+ {
1425
+ av->next = NULL;
1426
+ style = av;
1427
+ }
1428
+ else
1429
+ {
1430
+ TY_(FreeAttribute)( doc, av );
1431
+ }
1432
+ av = next;
1433
+ }
1434
+
1435
+ node->attributes = style;
1436
+ RenameElem( doc, node, TidyTag_SPAN );
1437
+ return yes;
1438
+ }
1439
+
1440
+ return no;
1441
+ }
1442
+
1443
+ /*
1444
+ Applies all matching rules to a node.
1445
+ */
1446
+ Node* CleanNode( TidyDocImpl* doc, Node *node )
1447
+ {
1448
+ Node *next = NULL;
1449
+ TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1450
+ TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1451
+
1452
+ for (next = node; TY_(nodeIsElement)(node); node = next)
1453
+ {
1454
+ if ( Dir2Div(doc, node, &next) )
1455
+ continue;
1456
+
1457
+ /* Special case: true result means
1458
+ ** that arg node and its parent no longer exist.
1459
+ ** So we must jump back up the CreateStyleProperties()
1460
+ ** call stack until we have a valid node reference.
1461
+ */
1462
+ if ( NestedList(doc, node, &next) )
1463
+ return next;
1464
+
1465
+ if ( Center2Div(doc, node, &next) )
1466
+ continue;
1467
+
1468
+ if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1469
+ continue;
1470
+
1471
+ if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1472
+ continue;
1473
+
1474
+ if ( BlockStyle(doc, node, &next) )
1475
+ continue;
1476
+
1477
+ if ( InlineStyle(doc, node, &next) )
1478
+ continue;
1479
+
1480
+ if ( InlineElementToCSS(doc, node, &next) )
1481
+ continue;
1482
+
1483
+ if ( Font2Span(doc, node, &next) )
1484
+ continue;
1485
+
1486
+ break;
1487
+ }
1488
+
1489
+ return next;
1490
+ }
1491
+
1492
+ /* Special case: if the current node is destroyed by
1493
+ ** CleanNode() lower in the tree, this node and its parent
1494
+ ** no longer exist. So we must jump back up the CleanTree()
1495
+ ** call stack until we have a valid node reference.
1496
+ */
1497
+
1498
+ static Node* CleanTree( TidyDocImpl* doc, Node *node )
1499
+ {
1500
+ if (node->content)
1501
+ {
1502
+ Node *child;
1503
+ for (child = node->content; child != NULL; child = child->next)
1504
+ {
1505
+ child = CleanTree( doc, child );
1506
+ if ( !child )
1507
+ break;
1508
+ }
1509
+ }
1510
+
1511
+ return CleanNode( doc, node );
1512
+ }
1513
+
1514
+ static void DefineStyleRules( TidyDocImpl* doc, Node *node )
1515
+ {
1516
+ Node *child;
1517
+
1518
+ if (node->content)
1519
+ {
1520
+ for (child = node->content;
1521
+ child != NULL; child = child->next)
1522
+ {
1523
+ DefineStyleRules( doc, child );
1524
+ }
1525
+ }
1526
+
1527
+ Style2Rule( doc, node );
1528
+ }
1529
+
1530
+ void TY_(CleanDocument)( TidyDocImpl* doc )
1531
+ {
1532
+ /* placeholder. CleanTree()/CleanNode() will not
1533
+ ** zap root element
1534
+ */
1535
+ CleanTree( doc, &doc->root );
1536
+
1537
+ if ( cfgBool(doc, TidyMakeClean) )
1538
+ {
1539
+ DefineStyleRules( doc, &doc->root );
1540
+ CreateStyleElement( doc );
1541
+ }
1542
+ }
1543
+
1544
+ /* simplifies <b><b> ... </b> ...</b> etc. */
1545
+ void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1546
+ {
1547
+ Node *next;
1548
+
1549
+ while (node)
1550
+ {
1551
+ next = node->next;
1552
+
1553
+ if ( (nodeIsB(node) || nodeIsI(node))
1554
+ && node->parent && node->parent->tag == node->tag)
1555
+ {
1556
+ /* strip redundant inner element */
1557
+ DiscardContainer( doc, node, &next );
1558
+ node = next;
1559
+ continue;
1560
+ }
1561
+
1562
+ if ( node->content )
1563
+ TY_(NestedEmphasis)( doc, node->content );
1564
+
1565
+ node = next;
1566
+ }
1567
+ }
1568
+
1569
+
1570
+
1571
+ /* replace i by em and b by strong */
1572
+ void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1573
+ {
1574
+ while (node)
1575
+ {
1576
+ if ( nodeIsI(node) )
1577
+ RenameElem( doc, node, TidyTag_EM );
1578
+ else if ( nodeIsB(node) )
1579
+ RenameElem( doc, node, TidyTag_STRONG );
1580
+
1581
+ if ( node->content )
1582
+ TY_(EmFromI)( doc, node->content );
1583
+
1584
+ node = node->next;
1585
+ }
1586
+ }
1587
+
1588
+ static Bool HasOneChild(Node *node)
1589
+ {
1590
+ return (node->content && node->content->next == NULL);
1591
+ }
1592
+
1593
+ /*
1594
+ Some people use dir or ul without an li
1595
+ to indent the content. The pattern to
1596
+ look for is a list with a single implicit
1597
+ li. This is recursively replaced by an
1598
+ implicit blockquote.
1599
+ */
1600
+ void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1601
+ {
1602
+ while (node)
1603
+ {
1604
+ if (node->content)
1605
+ TY_(List2BQ)( doc, node->content );
1606
+
1607
+ if ( node->tag && node->tag->parser == TY_(ParseList) &&
1608
+ HasOneChild(node) && node->content->implicit )
1609
+ {
1610
+ StripOnlyChild( doc, node );
1611
+ RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1612
+ node->implicit = yes;
1613
+ }
1614
+
1615
+ node = node->next;
1616
+ }
1617
+ }
1618
+
1619
+
1620
+ /*
1621
+ Replace implicit blockquote by div with an indent
1622
+ taking care to reduce nested blockquotes to a single
1623
+ div with the indent set to match the nesting depth
1624
+ */
1625
+ void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1626
+ {
1627
+ tmbchar indent_buf[ 32 ];
1628
+ uint indent;
1629
+
1630
+ while (node)
1631
+ {
1632
+ if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1633
+ {
1634
+ indent = 1;
1635
+
1636
+ while( HasOneChild(node) &&
1637
+ nodeIsBLOCKQUOTE(node->content) &&
1638
+ node->implicit)
1639
+ {
1640
+ ++indent;
1641
+ StripOnlyChild( doc, node );
1642
+ }
1643
+
1644
+ if (node->content)
1645
+ TY_(BQ2Div)( doc, node->content );
1646
+
1647
+ TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1648
+ 2*indent);
1649
+
1650
+ RenameElem( doc, node, TidyTag_DIV );
1651
+ TY_(AddStyleProperty)(doc, node, indent_buf );
1652
+ }
1653
+ else if (node->content)
1654
+ TY_(BQ2Div)( doc, node->content );
1655
+
1656
+ node = node->next;
1657
+ }
1658
+ }
1659
+
1660
+
1661
+ static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
1662
+ {
1663
+ Node *check;
1664
+
1665
+ for ( check=node; check; check = check->parent )
1666
+ {
1667
+ if ( nodeIsTD(check) )
1668
+ return check;
1669
+ }
1670
+ return NULL;
1671
+ }
1672
+
1673
+ /* node is <![if ...]> prune up to <![endif]> */
1674
+ static Node* PruneSection( TidyDocImpl* doc, Node *node )
1675
+ {
1676
+ Lexer* lexer = doc->lexer;
1677
+
1678
+ for (;;)
1679
+ {
1680
+ ctmbstr lexbuf = lexer->lexbuf + node->start;
1681
+ if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1682
+ {
1683
+ Node* cell = FindEnclosingCell( doc, node );
1684
+ if ( cell )
1685
+ {
1686
+ /* Need to put &nbsp; into cell so it doesn't look weird
1687
+ */
1688
+ Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1689
+ assert( (byte)'\240' == (byte)160 );
1690
+ TY_(InsertNodeBeforeElement)( node, nbsp );
1691
+ }
1692
+ }
1693
+
1694
+ /* discard node and returns next, unless it is a text node */
1695
+ if ( node->type == TextNode )
1696
+ node = node->next;
1697
+ else
1698
+ node = TY_(DiscardElement)( doc, node );
1699
+
1700
+ if (node == NULL)
1701
+ return NULL;
1702
+
1703
+ if (node->type == SectionTag)
1704
+ {
1705
+ if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1706
+ {
1707
+ node = PruneSection( doc, node );
1708
+ continue;
1709
+ }
1710
+
1711
+ if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1712
+ {
1713
+ node = TY_(DiscardElement)( doc, node );
1714
+ break;
1715
+ }
1716
+ }
1717
+ }
1718
+
1719
+ return node;
1720
+ }
1721
+
1722
+ void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1723
+ {
1724
+ Lexer* lexer = doc->lexer;
1725
+ while (node)
1726
+ {
1727
+ if (node->type == SectionTag)
1728
+ {
1729
+ /* prune up to matching endif */
1730
+ if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1731
+ (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1732
+ {
1733
+ node = PruneSection( doc, node );
1734
+ continue;
1735
+ }
1736
+
1737
+ /* discard others as well */
1738
+ node = TY_(DiscardElement)( doc, node );
1739
+ continue;
1740
+ }
1741
+
1742
+ if (node->content)
1743
+ TY_(DropSections)( doc, node->content );
1744
+
1745
+ node = node->next;
1746
+ }
1747
+ }
1748
+
1749
+ static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
1750
+ {
1751
+ AttVal *attr, *next, *prev = NULL;
1752
+
1753
+ for ( attr = node->attributes; attr; attr = next )
1754
+ {
1755
+ next = attr->next;
1756
+
1757
+ /* special check for class="Code" denoting pre text */
1758
+ /* Pass thru user defined styles as HTML class names */
1759
+ if (attrIsCLASS(attr))
1760
+ {
1761
+ if (AttrValueIs(attr, "Code") ||
1762
+ TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
1763
+ {
1764
+ prev = attr;
1765
+ continue;
1766
+ }
1767
+ }
1768
+
1769
+ if (attrIsCLASS(attr) ||
1770
+ attrIsSTYLE(attr) ||
1771
+ attrIsLANG(attr) ||
1772
+ ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
1773
+ (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
1774
+ (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
1775
+ {
1776
+ if (prev)
1777
+ prev->next = next;
1778
+ else
1779
+ node->attributes = next;
1780
+
1781
+ TY_(FreeAttribute)( doc, attr );
1782
+ }
1783
+ else
1784
+ prev = attr;
1785
+ }
1786
+ }
1787
+
1788
+ /* Word2000 uses span excessively, so we strip span out */
1789
+ static Node* StripSpan( TidyDocImpl* doc, Node* span )
1790
+ {
1791
+ Node *node, *prev = NULL, *content;
1792
+
1793
+ /*
1794
+ deal with span elements that have content
1795
+ by splicing the content in place of the span
1796
+ after having processed it
1797
+ */
1798
+
1799
+ TY_(CleanWord2000)( doc, span->content );
1800
+ content = span->content;
1801
+
1802
+ if (span->prev)
1803
+ prev = span->prev;
1804
+ else if (content)
1805
+ {
1806
+ node = content;
1807
+ content = content->next;
1808
+ TY_(RemoveNode)(node);
1809
+ TY_(InsertNodeBeforeElement)(span, node);
1810
+ prev = node;
1811
+ }
1812
+
1813
+ while (content)
1814
+ {
1815
+ node = content;
1816
+ content = content->next;
1817
+ TY_(RemoveNode)(node);
1818
+ TY_(InsertNodeAfterElement)(prev, node);
1819
+ prev = node;
1820
+ }
1821
+
1822
+ if (span->next == NULL)
1823
+ span->parent->last = prev;
1824
+
1825
+ node = span->next;
1826
+ span->content = NULL;
1827
+ TY_(DiscardElement)( doc, span );
1828
+ return node;
1829
+ }
1830
+
1831
+ /* map non-breaking spaces to regular spaces */
1832
+ void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1833
+ {
1834
+ while ( node )
1835
+ {
1836
+ if ( node->content )
1837
+ TY_(NormalizeSpaces)( lexer, node->content );
1838
+
1839
+ if (TY_(nodeIsText)(node))
1840
+ {
1841
+ uint i, c;
1842
+ tmbstr p = lexer->lexbuf + node->start;
1843
+
1844
+ for (i = node->start; i < node->end; ++i)
1845
+ {
1846
+ c = (byte) lexer->lexbuf[i];
1847
+
1848
+ /* look for UTF-8 multibyte character */
1849
+ if ( c > 0x7F )
1850
+ i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1851
+
1852
+ if ( c == 160 )
1853
+ c = ' ';
1854
+
1855
+ p = TY_(PutUTF8)(p, c);
1856
+ }
1857
+ node->end = p - lexer->lexbuf;
1858
+ }
1859
+
1860
+ node = node->next;
1861
+ }
1862
+ }
1863
+
1864
+ /* used to hunt for hidden preformatted sections */
1865
+ static Bool NoMargins(Node *node)
1866
+ {
1867
+ AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1868
+
1869
+ if ( !AttrHasValue(attval) )
1870
+ return no;
1871
+
1872
+ /* search for substring "margin-top: 0" */
1873
+ if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1874
+ return no;
1875
+
1876
+ /* search for substring "margin-bottom: 0" */
1877
+ if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1878
+ return no;
1879
+
1880
+ return yes;
1881
+ }
1882
+
1883
+ /* does element have a single space as its content? */
1884
+ static Bool SingleSpace( Lexer* lexer, Node* node )
1885
+ {
1886
+ if ( node->content )
1887
+ {
1888
+ node = node->content;
1889
+
1890
+ if ( node->next != NULL )
1891
+ return no;
1892
+
1893
+ if ( node->type != TextNode )
1894
+ return no;
1895
+
1896
+ if ( (node->end - node->start) == 1 &&
1897
+ lexer->lexbuf[node->start] == ' ' )
1898
+ return yes;
1899
+
1900
+ if ( (node->end - node->start) == 2 )
1901
+ {
1902
+ uint c = 0;
1903
+ TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1904
+ if ( c == 160 )
1905
+ return yes;
1906
+ }
1907
+ }
1908
+
1909
+ return no;
1910
+ }
1911
+
1912
+ /*
1913
+ This is a major clean up to strip out all the extra stuff you get
1914
+ when you save as web page from Word 2000. It doesn't yet know what
1915
+ to do with VML tags, but these will appear as errors unless you
1916
+ declare them as new tags, such as o:p which needs to be declared
1917
+ as inline.
1918
+ */
1919
+ void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1920
+ {
1921
+ /* used to a list from a sequence of bulletted p's */
1922
+ Lexer* lexer = doc->lexer;
1923
+ Node* list = NULL;
1924
+
1925
+ while ( node )
1926
+ {
1927
+ /* get rid of Word's xmlns attributes */
1928
+ if ( nodeIsHTML(node) )
1929
+ {
1930
+ /* check that it's a Word 2000 document */
1931
+ if ( !TY_(GetAttrByName)(node, "xmlns:o") &&
1932
+ !cfgBool(doc, TidyMakeBare) )
1933
+ return;
1934
+
1935
+ TY_(FreeAttrs)( doc, node );
1936
+ }
1937
+
1938
+ /* fix up preformatted sections by looking for a
1939
+ ** sequence of paragraphs with zero top/bottom margin
1940
+ */
1941
+ if ( nodeIsP(node) )
1942
+ {
1943
+ if (NoMargins(node))
1944
+ {
1945
+ Node *pre, *next;
1946
+ TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1947
+
1948
+ PurgeWord2000Attributes( doc, node );
1949
+
1950
+ if (node->content)
1951
+ TY_(CleanWord2000)( doc, node->content );
1952
+
1953
+ pre = node;
1954
+ node = node->next;
1955
+
1956
+ /* continue to strip p's */
1957
+
1958
+ while ( nodeIsP(node) && NoMargins(node) )
1959
+ {
1960
+ next = node->next;
1961
+ TY_(RemoveNode)(node);
1962
+ TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1963
+ TY_(InsertNodeAtEnd)(pre, node);
1964
+ StripSpan( doc, node );
1965
+ node = next;
1966
+ }
1967
+
1968
+ if (node == NULL)
1969
+ break;
1970
+ }
1971
+ }
1972
+
1973
+ if (node->tag && (node->tag->model & CM_BLOCK)
1974
+ && SingleSpace(lexer, node))
1975
+ {
1976
+ node = StripSpan( doc, node );
1977
+ continue;
1978
+ }
1979
+ /* discard Word's style verbiage */
1980
+ if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1981
+ node->type == CommentTag )
1982
+ {
1983
+ node = TY_(DiscardElement)( doc, node );
1984
+ continue;
1985
+ }
1986
+
1987
+ /* strip out all span and font tags Word scatters so liberally! */
1988
+ if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1989
+ {
1990
+ node = StripSpan( doc, node );
1991
+ continue;
1992
+ }
1993
+
1994
+ if ( nodeIsLINK(node) )
1995
+ {
1996
+ AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1997
+
1998
+ if (AttrValueIs(attr, "File-List"))
1999
+ {
2000
+ node = TY_(DiscardElement)( doc, node );
2001
+ continue;
2002
+ }
2003
+ }
2004
+
2005
+ /* discards <o:p> which encodes the paragraph mark */
2006
+ if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
2007
+ {
2008
+ Node* next;
2009
+ DiscardContainer( doc, node, &next );
2010
+ node = next;
2011
+ continue;
2012
+ }
2013
+
2014
+ /* discard empty paragraphs */
2015
+
2016
+ if ( node->content == NULL && nodeIsP(node) )
2017
+ {
2018
+ /* Use the existing function to ensure consistency */
2019
+ Node *next = TY_(TrimEmptyElement)( doc, node );
2020
+ node = next;
2021
+ continue;
2022
+ }
2023
+
2024
+ if ( nodeIsP(node) )
2025
+ {
2026
+ AttVal *attr, *atrStyle;
2027
+
2028
+ attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2029
+ atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2030
+ /*
2031
+ (JES) Sometimes Word marks a list item with the following hokie syntax
2032
+ <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2033
+ translate these into <li>
2034
+ */
2035
+ /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2036
+ /* map <p class="MsoListNumber"> to <ol>...</ol> */
2037
+ if ( AttrValueIs(attr, "MsoListBullet") ||
2038
+ AttrValueIs(attr, "MsoListNumber") ||
2039
+ AttrContains(atrStyle, "mso-list:") )
2040
+ {
2041
+ TidyTagId listType = TidyTag_UL;
2042
+ if (AttrValueIs(attr, "MsoListNumber"))
2043
+ listType = TidyTag_OL;
2044
+
2045
+ TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2046
+
2047
+ if ( !list || TagId(list) != listType )
2048
+ {
2049
+ const Dict* tag = TY_(LookupTagDef)( listType );
2050
+ list = TY_(InferredTag)(doc, tag->id);
2051
+ TY_(InsertNodeBeforeElement)(node, list);
2052
+ }
2053
+
2054
+ PurgeWord2000Attributes( doc, node );
2055
+
2056
+ if ( node->content )
2057
+ TY_(CleanWord2000)( doc, node->content );
2058
+
2059
+ /* remove node and append to contents of list */
2060
+ TY_(RemoveNode)(node);
2061
+ TY_(InsertNodeAtEnd)(list, node);
2062
+ node = list;
2063
+ }
2064
+ /* map sequence of <p class="Code"> to <pre>...</pre> */
2065
+ else if (AttrValueIs(attr, "Code"))
2066
+ {
2067
+ Node *br = TY_(NewLineNode)(lexer);
2068
+ TY_(NormalizeSpaces)(lexer, node->content);
2069
+
2070
+ if ( !list || TagId(list) != TidyTag_PRE )
2071
+ {
2072
+ list = TY_(InferredTag)(doc, TidyTag_PRE);
2073
+ TY_(InsertNodeBeforeElement)(node, list);
2074
+ }
2075
+
2076
+ /* remove node and append to contents of list */
2077
+ TY_(RemoveNode)(node);
2078
+ TY_(InsertNodeAtEnd)(list, node);
2079
+ StripSpan( doc, node );
2080
+ TY_(InsertNodeAtEnd)(list, br);
2081
+ node = list->next;
2082
+ }
2083
+ else
2084
+ list = NULL;
2085
+ }
2086
+ else
2087
+ list = NULL;
2088
+
2089
+ if (!node)
2090
+ return;
2091
+
2092
+ /* strip out style and class attributes */
2093
+ if (TY_(nodeIsElement)(node))
2094
+ PurgeWord2000Attributes( doc, node );
2095
+
2096
+ if (node->content)
2097
+ TY_(CleanWord2000)( doc, node->content );
2098
+
2099
+ node = node->next;
2100
+ }
2101
+ }
2102
+
2103
+ Bool TY_(IsWord2000)( TidyDocImpl* doc )
2104
+ {
2105
+ AttVal *attval;
2106
+ Node *node, *head;
2107
+ Node *html = TY_(FindHTML)( doc );
2108
+
2109
+ if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2110
+ return yes;
2111
+
2112
+ /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2113
+ head = TY_(FindHEAD)( doc );
2114
+
2115
+ if (head)
2116
+ {
2117
+ for (node = head->content; node; node = node->next)
2118
+ {
2119
+ if ( !nodeIsMETA(node) )
2120
+ continue;
2121
+
2122
+ attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2123
+
2124
+ if ( !AttrValueIs(attval, "generator") )
2125
+ continue;
2126
+
2127
+ attval = TY_(AttrGetById)( node, TidyAttr_CONTENT );
2128
+
2129
+ if ( AttrContains(attval, "Microsoft") )
2130
+ return yes;
2131
+ }
2132
+ }
2133
+
2134
+ return no;
2135
+ }
2136
+
2137
+ /* where appropriate move object elements from head to body */
2138
+ void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2139
+ {
2140
+ Node *node, *next, *head = NULL, *body = NULL;
2141
+
2142
+ if (!html)
2143
+ return;
2144
+
2145
+ for ( node = html->content; node != NULL; node = node->next )
2146
+ {
2147
+ if ( nodeIsHEAD(node) )
2148
+ head = node;
2149
+
2150
+ if ( nodeIsBODY(node) )
2151
+ body = node;
2152
+ }
2153
+
2154
+ if ( head != NULL && body != NULL )
2155
+ {
2156
+ for (node = head->content; node != NULL; node = next)
2157
+ {
2158
+ next = node->next;
2159
+
2160
+ if ( nodeIsOBJECT(node) )
2161
+ {
2162
+ Node *child;
2163
+ Bool bump = no;
2164
+
2165
+ for (child = node->content; child != NULL; child = child->next)
2166
+ {
2167
+ /* bump to body unless content is param */
2168
+ if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2169
+ || !nodeIsPARAM(child) )
2170
+ {
2171
+ bump = yes;
2172
+ break;
2173
+ }
2174
+ }
2175
+
2176
+ if ( bump )
2177
+ {
2178
+ TY_(RemoveNode)( node );
2179
+ TY_(InsertNodeAtStart)( body, node );
2180
+ }
2181
+ }
2182
+ }
2183
+ }
2184
+ }
2185
+
2186
/* This is disabled due to http://tidy.sf.net/bug/681116 */
#if 0
/* Processes the children of pParent first.  Then, for a block-level
   pParent, trailing <br> children are taken off its end: the first one
   without attributes is discarded, the others are moved to just after
   pParent.  Finally pParent is handed to TrimEmptyElement(). */
void FixBrakes( TidyDocImpl* pDoc, Node *pParent )
{
    Node *pNode, *pNext;
    Bool bBRDeleted = no;

    if ( pParent == NULL )
        return;

    /* First, check the status of All My Children */
    for ( pNode = pParent->content; pNode != NULL; pNode = pNext )
    {
        /* The node may get trimmed, so save the next pointer, if any */
        pNext = pNode->next;
        FixBrakes( pDoc, pNode );
    }

    if ( !nodeCMIsBlock( pParent ) )
        return;

    /* As long as my last child is a <br />, move it to my last peer */
    pNode = pParent->last;
    while ( pNode != NULL && nodeIsBR( pNode ) )
    {
        if ( pNode->attributes == NULL && bBRDeleted == no )
        {
            TY_(DiscardElement)( pDoc, pNode );
            bBRDeleted = yes;
        }
        else
        {
            TY_(RemoveNode)( pNode );
            TY_(InsertNodeAfterElement)( pParent, pNode );
        }
        pNode = pParent->last;
    }
    TY_(TrimEmptyElement)( pDoc, pParent );
}
#endif
2229
+
2230
+ void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2231
+ {
2232
+ Node *pNode;
2233
+ StyleProp *pFirstProp = NULL, *pLastProp = NULL, *prop = NULL;
2234
+ tmbstr s, pszBegin, pszEnd;
2235
+ ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2236
+
2237
+ if (!enc)
2238
+ return;
2239
+
2240
+ if (!nodeIsHEAD(head))
2241
+ head = TY_(FindHEAD)(doc);
2242
+
2243
+ if (!head)
2244
+ return;
2245
+
2246
+ /* Find any <meta http-equiv='Content-Type' content='...' /> */
2247
+ for (pNode = head->content; NULL != pNode; pNode = pNode->next)
2248
+ {
2249
+ AttVal* httpEquiv = TY_(AttrGetById)(pNode, TidyAttr_HTTP_EQUIV);
2250
+ AttVal* metaContent = TY_(AttrGetById)(pNode, TidyAttr_CONTENT);
2251
+
2252
+ if ( !nodeIsMETA(pNode) || !metaContent ||
2253
+ !AttrValueIs(httpEquiv, "Content-Type") )
2254
+ continue;
2255
+
2256
+ pszBegin = s = TY_(tmbstrdup)( doc->allocator, metaContent->value );
2257
+ while (pszBegin && *pszBegin)
2258
+ {
2259
+ while (isspace( *pszBegin ))
2260
+ pszBegin++;
2261
+ pszEnd = pszBegin;
2262
+ while ('\0' != *pszEnd && ';' != *pszEnd)
2263
+ pszEnd++;
2264
+ if (';' == *pszEnd )
2265
+ *(pszEnd++) = '\0';
2266
+ if (pszEnd > pszBegin)
2267
+ {
2268
+ prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
2269
+ prop->name = TY_(tmbstrdup)( doc->allocator, pszBegin );
2270
+ prop->value = NULL;
2271
+ prop->next = NULL;
2272
+
2273
+ if (NULL != pLastProp)
2274
+ pLastProp->next = prop;
2275
+ else
2276
+ pFirstProp = prop;
2277
+
2278
+ pLastProp = prop;
2279
+ pszBegin = pszEnd;
2280
+ }
2281
+ }
2282
+ TidyDocFree( doc, s );
2283
+
2284
+ /* find the charset property */
2285
+ for (prop = pFirstProp; NULL != prop; prop = prop->next)
2286
+ {
2287
+ if (0 != TY_(tmbstrncasecmp)( prop->name, "charset", 7 ))
2288
+ continue;
2289
+
2290
+ TidyDocFree( doc, prop->name );
2291
+ prop->name = (tmbstr)TidyDocAlloc( doc, 8 + TY_(tmbstrlen)(enc) + 1 );
2292
+ TY_(tmbstrcpy)(prop->name, "charset=");
2293
+ TY_(tmbstrcpy)(prop->name+8, enc);
2294
+ s = CreatePropString( doc, pFirstProp );
2295
+ TidyDocFree( doc, metaContent->value );
2296
+ metaContent->value = s;
2297
+ break;
2298
+ }
2299
+ /* #718127, prevent memory leakage */
2300
+ FreeStyleProps(doc, pFirstProp);
2301
+ pFirstProp = NULL;
2302
+ pLastProp = NULL;
2303
+ }
2304
+ }
2305
+
2306
+ void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2307
+ {
2308
+ Node* next;
2309
+
2310
+ while (node)
2311
+ {
2312
+ next = node->next;
2313
+
2314
+ if (node->type == CommentTag)
2315
+ {
2316
+ TY_(RemoveNode)(node);
2317
+ TY_(FreeNode)(doc, node);
2318
+ node = next;
2319
+ continue;
2320
+ }
2321
+
2322
+ if (node->content)
2323
+ TY_(DropComments)(doc, node->content);
2324
+
2325
+ node = next;
2326
+ }
2327
+ }
2328
+
2329
+ void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2330
+ {
2331
+ Node* next;
2332
+
2333
+ while (node)
2334
+ {
2335
+ next = node->next;
2336
+
2337
+ if (nodeIsFONT(node))
2338
+ {
2339
+ DiscardContainer(doc, node, &next);
2340
+ node = next;
2341
+ continue;
2342
+ }
2343
+
2344
+ if (node->content)
2345
+ TY_(DropFontElements)(doc, node->content, &next);
2346
+
2347
+ node = next;
2348
+ }
2349
+ }
2350
+
2351
+ void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2352
+ {
2353
+ Node* next;
2354
+
2355
+ while (node)
2356
+ {
2357
+ next = node->next;
2358
+
2359
+ if (nodeIsWBR(node))
2360
+ {
2361
+ Node* text;
2362
+ text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2363
+ TY_(InsertNodeAfterElement)(node, text);
2364
+ TY_(RemoveNode)(node);
2365
+ TY_(FreeNode)(doc, node);
2366
+ node = next;
2367
+ continue;
2368
+ }
2369
+
2370
+ if (node->content)
2371
+ TY_(WbrToSpace)(doc, node->content);
2372
+
2373
+ node = next;
2374
+ }
2375
+ }
2376
+
2377
+ /*
2378
+ Filters from Word and PowerPoint often use smart
2379
+ quotes resulting in character codes between 128
2380
+ and 159. Unfortunately, the corresponding HTML 4.0
2381
+ entities for these are not widely supported. The
2382
+ following converts dashes and quotation marks to
2383
+ the nearest ASCII equivalent. My thanks to
2384
+ Andrzej Novosiolov for his help with this code.
2385
+
2386
+ Note: The old code in the pretty printer applied
2387
+ this to all node types and attribute values while
2388
+ this routine applies it only to text nodes. First,
2389
+ Microsoft Office products rarely put the relevant
2390
+ characters into these tokens, second support for
2391
+ them is much better now and last but not least, it
2392
+ can be harmful to replace these characters since
2393
+ US-ASCII quote marks are often used as syntax
2394
+ characters, a simple
2395
+
2396
+ <a onmouseover="alert('&#x2018;')">...</a>
2397
+
2398
+ would be broken if the U+2018 is replaced by "'".
2399
+ The old code would neither take care whether the
2400
+ quote mark is already used as delimiter,
2401
+
2402
+ <p title='&#x2018;'>...</p>
2403
+
2404
+ got
2405
+
2406
+ <p title='''>...</p>
2407
+
2408
+ Since browser support is much better nowadays and
2409
+ high-quality typography is better than ASCII it'd
2410
+ be probably a good idea to drop the feature...
2411
+ */
2412
+ void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2413
+ {
2414
+ Node* next;
2415
+ Lexer* lexer = doc->lexer;
2416
+
2417
+ while (node)
2418
+ {
2419
+ next = node->next;
2420
+
2421
+ if (TY_(nodeIsText)(node))
2422
+ {
2423
+ uint i, c;
2424
+ tmbstr p = lexer->lexbuf + node->start;
2425
+
2426
+ for (i = node->start; i < node->end; ++i)
2427
+ {
2428
+ c = (unsigned char) lexer->lexbuf[i];
2429
+
2430
+ if (c > 0x7F)
2431
+ i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2432
+
2433
+ if (c >= 0x2013 && c <= 0x201E)
2434
+ {
2435
+ switch (c)
2436
+ {
2437
+ case 0x2013: /* en dash */
2438
+ case 0x2014: /* em dash */
2439
+ c = '-';
2440
+ break;
2441
+ case 0x2018: /* left single quotation mark */
2442
+ case 0x2019: /* right single quotation mark */
2443
+ case 0x201A: /* single low-9 quotation mark */
2444
+ c = '\'';
2445
+ break;
2446
+ case 0x201C: /* left double quotation mark */
2447
+ case 0x201D: /* right double quotation mark */
2448
+ case 0x201E: /* double low-9 quotation mark */
2449
+ c = '"';
2450
+ break;
2451
+ }
2452
+ }
2453
+
2454
+ p = TY_(PutUTF8)(p, c);
2455
+ }
2456
+
2457
+ node->end = p - lexer->lexbuf;
2458
+ }
2459
+
2460
+ if (node->content)
2461
+ TY_(DowngradeTypography)(doc, node->content);
2462
+
2463
+ node = next;
2464
+ }
2465
+ }
2466
+
2467
+ void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2468
+ {
2469
+ Node* next;
2470
+
2471
+ while (node)
2472
+ {
2473
+ next = node->next;
2474
+
2475
+ if (node->tag && node->tag->parser == TY_(ParsePre))
2476
+ {
2477
+ TY_(NormalizeSpaces)(doc->lexer, node->content);
2478
+ node = next;
2479
+ continue;
2480
+ }
2481
+
2482
+ if (node->content)
2483
+ TY_(ReplacePreformattedSpaces)(doc, node->content);
2484
+
2485
+ node = next;
2486
+ }
2487
+ }
2488
+
2489
+ void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2490
+ {
2491
+ Node* next;
2492
+
2493
+ while (node)
2494
+ {
2495
+ next = node->next;
2496
+
2497
+ if (node->type == CDATATag)
2498
+ node->type = TextNode;
2499
+
2500
+ if (node->content)
2501
+ TY_(ConvertCDATANodes)(doc, node->content);
2502
+
2503
+ node = next;
2504
+ }
2505
+ }
2506
+
2507
+ /*
2508
+ FixLanguageInformation ensures that the document contains (only)
2509
+ the attributes for language information desired by the output
2510
+ document type. For example, for XHTML 1.0 documents both
2511
+ 'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2512
+ is desired and for HTML 4.01 only 'lang' is desired.
2513
+ */
2514
+ void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2515
+ {
2516
+ Node* next;
2517
+
2518
+ while (node)
2519
+ {
2520
+ next = node->next;
2521
+
2522
+ /* todo: report modifications made here to the report system */
2523
+
2524
+ if (TY_(nodeIsElement)(node))
2525
+ {
2526
+ AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2527
+ AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2528
+
2529
+ if (lang && xmlLang)
2530
+ {
2531
+ /*
2532
+ todo: check whether both attributes are in sync,
2533
+ here or elsewhere, where elsewhere is probably
2534
+ preferable.
2535
+ AD - March 2005: not mandatory according the standards.
2536
+ */
2537
+ }
2538
+ else if (lang && wantXmlLang)
2539
+ {
2540
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2541
+ & doc->lexer->versionEmitted)
2542
+ TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2543
+ }
2544
+ else if (xmlLang && wantLang)
2545
+ {
2546
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2547
+ & doc->lexer->versionEmitted)
2548
+ TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2549
+ }
2550
+
2551
+ if (lang && !wantLang)
2552
+ TY_(RemoveAttribute)(doc, node, lang);
2553
+
2554
+ if (xmlLang && !wantXmlLang)
2555
+ TY_(RemoveAttribute)(doc, node, xmlLang);
2556
+ }
2557
+
2558
+ if (node->content)
2559
+ TY_(FixLanguageInformation)(doc, node->content, wantXmlLang, wantLang);
2560
+
2561
+ node = next;
2562
+ }
2563
+ }
2564
+
2565
+ /*
2566
+ Set/fix/remove <html xmlns='...'>
2567
+ */
2568
+ void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2569
+ {
2570
+ Node* html = TY_(FindHTML)(doc);
2571
+ AttVal* xmlns;
2572
+
2573
+ if (!html)
2574
+ return;
2575
+
2576
+ xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2577
+
2578
+ if (wantXmlns)
2579
+ {
2580
+ if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2581
+ TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2582
+ }
2583
+ else if (xmlns)
2584
+ {
2585
+ TY_(RemoveAttribute)(doc, html, xmlns);
2586
+ }
2587
+ }
2588
+
2589
+ /*
2590
+ ...
2591
+ */
2592
+ void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2593
+ {
2594
+ Node* next;
2595
+
2596
+ while (node)
2597
+ {
2598
+ next = node->next;
2599
+
2600
+ if (TY_(IsAnchorElement)(doc, node))
2601
+ {
2602
+ AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2603
+ AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2604
+ Bool hadName = name!=NULL;
2605
+ Bool hadId = id!=NULL;
2606
+ Bool IdEmitted = no;
2607
+ Bool NameEmitted = no;
2608
+
2609
+ /* todo: how are empty name/id attributes handled? */
2610
+
2611
+ if (name && id)
2612
+ {
2613
+ Bool NameHasValue = AttrHasValue(name);
2614
+ Bool IdHasValue = AttrHasValue(id);
2615
+ if ( (NameHasValue != IdHasValue) ||
2616
+ (NameHasValue && IdHasValue &&
2617
+ TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2618
+ TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2619
+ }
2620
+ else if (name && wantId)
2621
+ {
2622
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2623
+ & doc->lexer->versionEmitted)
2624
+ {
2625
+ if (TY_(IsValidHTMLID)(name->value))
2626
+ {
2627
+ TY_(RepairAttrValue)(doc, node, "id", name->value);
2628
+ IdEmitted = yes;
2629
+ }
2630
+ else
2631
+ TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2632
+ }
2633
+ }
2634
+ else if (id && wantName)
2635
+ {
2636
+ if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2637
+ & doc->lexer->versionEmitted)
2638
+ {
2639
+ /* todo: do not assume id is valid */
2640
+ TY_(RepairAttrValue)(doc, node, "name", id->value);
2641
+ NameEmitted = yes;
2642
+ }
2643
+ }
2644
+
2645
+ if (id && !wantId
2646
+ /* make sure that Name has been emitted if requested */
2647
+ && (hadName || !wantName || NameEmitted) )
2648
+ TY_(RemoveAttribute)(doc, node, id);
2649
+
2650
+ if (name && !wantName
2651
+ /* make sure that Id has been emitted if requested */
2652
+ && (hadId || !wantId || IdEmitted) )
2653
+ TY_(RemoveAttribute)(doc, node, name);
2654
+
2655
+ if (TY_(AttrGetById)(node, TidyAttr_NAME) == NULL &&
2656
+ TY_(AttrGetById)(node, TidyAttr_ID) == NULL)
2657
+ TY_(RemoveAnchorByNode)(doc, node);
2658
+ }
2659
+
2660
+ if (node->content)
2661
+ TY_(FixAnchors)(doc, node->content, wantName, wantId);
2662
+
2663
+ node = next;
2664
+ }
2665
+ }
2666
+
2667
+ /*
2668
+ * local variables:
2669
+ * mode: c
2670
+ * indent-tabs-mode: nil
2671
+ * c-basic-offset: 4
2672
+ * eval: (c-set-offset 'substatement-open 0)
2673
+ * end:
2674
+ */