tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/lexer.h ADDED
@@ -0,0 +1,617 @@
1
+ #ifndef __LEXER_H__
2
+ #define __LEXER_H__
3
+
4
+ /* lexer.h -- Lexer for html parser
5
+
6
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info:
10
+ $Author: arnaud02 $
11
+ $Date: 2008/03/22 21:06:11 $
12
+ $Revision: 1.41 $
13
+
14
+ */
15
+
16
+ /*
17
+ Given an input source, it returns a sequence of tokens.
18
+
19
+ GetToken(source) gets the next token
20
+ UngetToken(source) provides one level undo
21
+
22
+ The tags include an attribute list:
23
+
24
+ - linked list of attribute/value nodes
25
+ - each node has 2 NULL-terminated strings.
26
+ - entities are replaced in attribute values
27
+
28
+ white space is compacted if not in preformatted mode
29
+ If not in preformatted mode then leading white space
30
+ is discarded and subsequent white space sequences
31
+ compacted to single space characters.
32
+
33
+ If XmlTags is no then Tag names are folded to upper
34
+ case and attribute names to lower case.
35
+
36
+ Not yet done:
37
+ - Doctype subset and marked sections
38
+ */
39
+
40
+ #ifdef __cplusplus
41
+ extern "C" {
42
+ #endif
43
+
44
+ #include "forward.h"
45
+
46
+ /* lexer character types
47
+ */
48
+ #define digit 1u
49
+ #define letter 2u
50
+ #define namechar 4u
51
+ #define white 8u
52
+ #define newline 16u
53
+ #define lowercase 32u
54
+ #define uppercase 64u
55
+ #define digithex 128u
56
+
57
+
58
+ /* node->type is one of these values
59
+ */
60
+ typedef enum
61
+ {
62
+ RootNode,
63
+ DocTypeTag,
64
+ CommentTag,
65
+ ProcInsTag,
66
+ TextNode,
67
+ StartTag,
68
+ EndTag,
69
+ StartEndTag,
70
+ CDATATag,
71
+ SectionTag,
72
+ AspTag,
73
+ JsteTag,
74
+ PhpTag,
75
+ XmlDecl
76
+ } NodeType;
77
+
78
+
79
+
80
+ /* lexer GetToken states
81
+ */
82
+ typedef enum
83
+ {
84
+ LEX_CONTENT,
85
+ LEX_GT,
86
+ LEX_ENDTAG,
87
+ LEX_STARTTAG,
88
+ LEX_COMMENT,
89
+ LEX_DOCTYPE,
90
+ LEX_PROCINSTR,
91
+ LEX_CDATA,
92
+ LEX_SECTION,
93
+ LEX_ASP,
94
+ LEX_JSTE,
95
+ LEX_PHP,
96
+ LEX_XMLDECL
97
+ } LexerState;
98
+
99
+ /* ParseDocTypeDecl state constants */
100
+ typedef enum
101
+ {
102
+ DT_INTERMEDIATE,
103
+ DT_DOCTYPENAME,
104
+ DT_PUBLICSYSTEM,
105
+ DT_QUOTEDSTRING,
106
+ DT_INTSUBSET
107
+ } ParseDocTypeDeclState;
108
+
109
+ /* content model shortcut encoding
110
+
111
+ Descriptions are tentative.
112
+ */
113
+ #define CM_UNKNOWN 0
114
+ /* Elements with no content. Map to HTML specification. */
115
+ #define CM_EMPTY (1 << 0)
116
+ /* Elements that appear outside of "BODY". */
117
+ #define CM_HTML (1 << 1)
118
+ /* Elements that can appear within HEAD. */
119
+ #define CM_HEAD (1 << 2)
120
+ /* HTML "block" elements. */
121
+ #define CM_BLOCK (1 << 3)
122
+ /* HTML "inline" elements. */
123
+ #define CM_INLINE (1 << 4)
124
+ /* Elements that mark list item ("LI"). */
125
+ #define CM_LIST (1 << 5)
126
+ /* Elements that mark definition list item ("DD", "DT"). */
127
+ #define CM_DEFLIST (1 << 6)
128
+ /* Elements that can appear inside TABLE. */
129
+ #define CM_TABLE (1 << 7)
130
+ /* Used for "THEAD", "TFOOT" or "TBODY". */
131
+ #define CM_ROWGRP (1 << 8)
132
+ /* Used for "TD", "TH" */
133
+ #define CM_ROW (1 << 9)
134
+ /* Elements whose content must be protected against white space movement.
135
+ Includes some elements that can be found in forms. */
136
+ #define CM_FIELD (1 << 10)
137
+ /* Used to avoid propagating inline emphasis inside some elements
138
+ such as OBJECT or APPLET. */
139
+ #define CM_OBJECT (1 << 11)
140
+ /* Elements that allow "PARAM". */
141
+ #define CM_PARAM (1 << 12)
142
+ /* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
143
+ #define CM_FRAMES (1 << 13)
144
+ /* Heading elements (h1, h2, ...). */
145
+ #define CM_HEADING (1 << 14)
146
+ /* Elements with an optional end tag. */
147
+ #define CM_OPT (1 << 15)
148
+ /* Elements that use "align" attribute for vertical position. */
149
+ #define CM_IMG (1 << 16)
150
+ /* Elements with inline and block model. Used to avoid calling InlineDup. */
151
+ #define CM_MIXED (1 << 17)
152
+ /* Elements whose content needs to be indented only if containing one
153
+ CM_BLOCK element. */
154
+ #define CM_NO_INDENT (1 << 18)
155
+ /* Elements that are obsolete (such as "dir", "menu"). */
156
+ #define CM_OBSOLETE (1 << 19)
157
+ /* User defined elements. Used to determine how attributes without value
158
+ should be printed. */
159
+ #define CM_NEW (1 << 20)
160
+ /* Elements that cannot be omitted. */
161
+ #define CM_OMITST (1 << 21)
162
+
163
+ /* If the document uses just HTML 2.0 tags and attributes describe
164
+ ** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
165
+ ** If there are proprietary tags and attributes then describe it as
166
+ ** HTML Proprietary. If it includes the xml-lang or xmlns attributes
167
+ ** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
168
+ ** flavors of Voyager (strict, loose or frameset).
169
+ */
170
+
171
+ /* unknown */
172
+ #define xxxx 0u
173
+
174
+ /* W3C defined HTML/XHTML family document types */
175
+ #define HT20 1u
176
+ #define HT32 2u
177
+ #define H40S 4u
178
+ #define H40T 8u
179
+ #define H40F 16u
180
+ #define H41S 32u
181
+ #define H41T 64u
182
+ #define H41F 128u
183
+ #define X10S 256u
184
+ #define X10T 512u
185
+ #define X10F 1024u
186
+ #define XH11 2048u
187
+ #define XB10 4096u
188
+
189
+ /* proprietary stuff */
190
+ #define VERS_SUN 8192u
191
+ #define VERS_NETSCAPE 16384u
192
+ #define VERS_MICROSOFT 32768u
193
+
194
+ /* special flag */
195
+ #define VERS_XML 65536u
196
+
197
+ /* compatibility symbols */
198
+ #define VERS_UNKNOWN (xxxx)
199
+ #define VERS_HTML20 (HT20)
200
+ #define VERS_HTML32 (HT32)
201
+ #define VERS_HTML40_STRICT (H40S|H41S|X10S)
202
+ #define VERS_HTML40_LOOSE (H40T|H41T|X10T)
203
+ #define VERS_FRAMESET (H40F|H41F|X10F)
204
+ #define VERS_XHTML11 (XH11)
205
+ #define VERS_BASIC (XB10)
206
+
207
+ /* meta symbols */
208
+ #define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
209
+ #define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
210
+ #define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
211
+ #define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
212
+ #define VERS_FROM32 (VERS_HTML32|VERS_HTML40)
213
+ #define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
214
+ #define VERS_XHTML (X10S|X10T|X10F|XH11|XB10)
215
+
216
+ /* all W3C defined document types */
217
+ #define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40)
218
+
219
+ /* all proprietary types */
220
+ #define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
221
+
222
+ /* Linked list of class names and styles
223
+ */
224
+ struct _Style;
225
+ typedef struct _Style TagStyle;
226
+
227
+ struct _Style
228
+ {
229
+ tmbstr tag;
230
+ tmbstr tag_class;
231
+ tmbstr properties;
232
+ TagStyle *next;
233
+ };
234
+
235
+
236
+ /* Linked list of style properties
237
+ */
238
+ struct _StyleProp;
239
+ typedef struct _StyleProp StyleProp;
240
+
241
+ struct _StyleProp
242
+ {
243
+ tmbstr name;
244
+ tmbstr value;
245
+ StyleProp *next;
246
+ };
247
+
248
+
249
+
250
+
251
+ /* Attribute/Value linked list node
252
+ */
253
+
254
+ struct _AttVal
255
+ {
256
+ AttVal* next;
257
+ const Attribute* dict;
258
+ Node* asp;
259
+ Node* php;
260
+ int delim;
261
+ tmbstr attribute;
262
+ tmbstr value;
263
+ };
264
+
265
+
266
+
267
+ /*
268
+ Mosaic handles inlines via a separate stack from other elements
269
+ We duplicate this to recover from inline markup errors such as:
270
+
271
+ <i>italic text
272
+ <p>more italic text</b> normal text
273
+
274
+ which for compatibility with Mosaic is mapped to:
275
+
276
+ <i>italic text</i>
277
+ <p><i>more italic text</i> normal text
278
+
279
+ Note that any inline end tag pops the effect of the current
280
+ inline start tag, so that </b> pops <i> in the above example.
281
+ */
282
+ struct _IStack
283
+ {
284
+ IStack* next;
285
+ const Dict* tag; /* tag's dictionary definition */
286
+ tmbstr element; /* name (NULL for text nodes) */
287
+ AttVal* attributes;
288
+ };
289
+
290
+
291
+ /* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
292
+ ** etc. etc.
293
+ */
294
+
295
+ struct _Node
296
+ {
297
+ Node* parent; /* tree structure */
298
+ Node* prev;
299
+ Node* next;
300
+ Node* content;
301
+ Node* last;
302
+
303
+ AttVal* attributes;
304
+ const Dict* was; /* old tag when it was changed */
305
+ const Dict* tag; /* tag's dictionary definition */
306
+
307
+ tmbstr element; /* name (NULL for text nodes) */
308
+
309
+ uint start; /* start of span onto text array */
310
+ uint end; /* end of span onto text array */
311
+ NodeType type; /* TextNode, StartTag, EndTag etc. */
312
+
313
+ uint line; /* current line of document */
314
+ uint column; /* current column of document */
315
+
316
+ Bool closed; /* true if closed by explicit end tag */
317
+ Bool implicit; /* true if inferred */
318
+ Bool linebreak; /* true if followed by a line break */
319
+
320
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
321
+ tmbstr otext;
322
+ #endif
323
+ };
324
+
325
+
326
+ /*
327
+ The following are private to the lexer
328
+ Use NewLexer() to create a lexer, and
329
+ FreeLexer() to free it.
330
+ */
331
+
332
+ struct _Lexer
333
+ {
334
+ #if 0 /* Move to TidyDocImpl */
335
+ StreamIn* in; /* document content input */
336
+ StreamOut* errout; /* error output stream */
337
+
338
+ uint badAccess; /* for accessibility errors */
339
+ uint badLayout; /* for bad style errors */
340
+ uint badChars; /* for bad character encodings */
341
+ uint badForm; /* for mismatched/mispositioned form tags */
342
+ uint warnings; /* count of warnings in this document */
343
+ uint errors; /* count of errors */
344
+ #endif
345
+
346
+ uint lines; /* lines seen */
347
+ uint columns; /* at start of current token */
348
+ Bool waswhite; /* used to collapse contiguous white space */
349
+ Bool pushed; /* true after token has been pushed back */
350
+ Bool insertspace; /* when space is moved after end tag */
351
+ Bool excludeBlocks; /* Netscape compatibility */
352
+ Bool exiled; /* true if moved out of table */
353
+ Bool isvoyager; /* true if xmlns attribute on html element */
354
+ uint versions; /* bit vector of HTML versions */
355
+ uint doctype; /* version as given by doctype (if any) */
356
+ uint versionEmitted; /* version of doctype emitted */
357
+ Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
358
+ uint txtstart; /* start of current node */
359
+ uint txtend; /* end of current node */
360
+ LexerState state; /* state of lexer's finite state machine */
361
+
362
+ Node* token; /* last token returned by GetToken() */
363
+ Node* itoken; /* last duplicate inline returned by GetToken() */
364
+ Node* root; /* remember root node of the document */
365
+ Node* parent; /* remember parent node for CDATA elements */
366
+
367
+ Bool seenEndBody; /* true if a </body> tag has been encountered */
368
+ Bool seenEndHtml; /* true if a </html> tag has been encountered */
369
+
370
+ /*
371
+ Lexer character buffer
372
+
373
+ Parse tree nodes span onto this buffer
374
+ which contains the concatenated text
375
+ contents of all of the elements.
376
+
377
+ lexsize must be reset for each file.
378
+ */
379
+ tmbstr lexbuf; /* MB character buffer */
380
+ uint lexlength; /* allocated */
381
+ uint lexsize; /* used */
382
+
383
+ /* Inline stack for compatibility with Mosaic */
384
+ Node* inode; /* for deferring text node */
385
+ IStack* insert; /* for inferring inline tags */
386
+ IStack* istack;
387
+ uint istacklength; /* allocated */
388
+ uint istacksize; /* used */
389
+ uint istackbase; /* start of frame */
390
+
391
+ TagStyle *styles; /* used for cleaning up presentation markup */
392
+
393
+ TidyAllocator* allocator; /* allocator */
394
+
395
+ #if 0
396
+ TidyDocImpl* doc; /* Pointer back to doc for error reporting */
397
+ #endif
398
+ };
399
+
400
+
401
+ /* Lexer Functions
402
+ */
403
+
404
+ /* choose what version to use for new doctype */
405
+ int TY_(HTMLVersion)( TidyDocImpl* doc );
406
+
407
+ /* everything is allowed in proprietary version of HTML */
408
+ /* this is handled here rather than in the tag/attr dicts */
409
+
410
+ void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
411
+
412
+ Bool TY_(IsWhite)(uint c);
413
+ Bool TY_(IsDigit)(uint c);
414
+ Bool TY_(IsLetter)(uint c);
415
+ Bool TY_(IsNewline)(uint c);
416
+ Bool TY_(IsNamechar)(uint c);
417
+ Bool TY_(IsXMLLetter)(uint c);
418
+ Bool TY_(IsXMLNamechar)(uint c);
419
+
420
+ /* Bool IsLower(uint c); */
421
+ Bool TY_(IsUpper)(uint c);
422
+ uint TY_(ToLower)(uint c);
423
+ uint TY_(ToUpper)(uint c);
424
+
425
+ Lexer* TY_(NewLexer)( TidyDocImpl* doc );
426
+ void TY_(FreeLexer)( TidyDocImpl* doc );
427
+
428
+ /* store character c as UTF-8 encoded byte stream */
429
+ void TY_(AddCharToLexer)( Lexer *lexer, uint c );
430
+
431
+ /*
432
+ Used for elements and text nodes
433
+ element name is NULL for text nodes
434
+ start and end are offsets into lexbuf
435
+ which contains the textual content of
436
+ all elements in the parse tree.
437
+
438
+ parent and content allow traversal
439
+ of the parse tree in any direction.
440
+ attributes are represented as a linked
441
+ list of AttVal nodes which hold the
442
+ strings for attribute/value pairs.
443
+ */
444
+ Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
445
+
446
+
447
+ /* used to clone heading nodes when split by an <HR> */
448
+ Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
449
+
450
+ /* free node's attributes */
451
+ void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
452
+
453
+ /* doesn't repair attribute list linkage */
454
+ void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
455
+
456
+ /* detach attribute from node */
457
+ void TY_(DetachAttribute)( Node *node, AttVal *attr );
458
+
459
+ /* detach attribute from node then free it
460
+ */
461
+ void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
462
+
463
+ /*
464
+ Free document nodes by iterating through peers and recursing
465
+ through children. Set next to NULL before calling FreeNode()
466
+ to avoid freeing peer nodes. Doesn't patch up prev/next links.
467
+ */
468
+ void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
469
+
470
+ Node* TY_(TextToken)( Lexer *lexer );
471
+
472
+ /* used for creating preformatted text from Word2000 */
473
+ Node* TY_(NewLineNode)( Lexer *lexer );
474
+
475
+ /* used for adding a &nbsp; for Word2000 */
476
+ Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
477
+
478
+ void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
479
+ /* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
480
+
481
+ /* find element */
482
+ Node* TY_(FindDocType)( TidyDocImpl* doc );
483
+ Node* TY_(FindHTML)( TidyDocImpl* doc );
484
+ Node* TY_(FindHEAD)( TidyDocImpl* doc );
485
+ Node* TY_(FindTITLE)(TidyDocImpl* doc);
486
+ Node* TY_(FindBody)( TidyDocImpl* doc );
487
+ Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
488
+
489
+ /* Returns containing block element, if any */
490
+ Node* TY_(FindContainer)( Node* node );
491
+
492
+ /* add meta element for Tidy */
493
+ Bool TY_(AddGenerator)( TidyDocImpl* doc );
494
+
495
+ uint TY_(ApparentVersion)( TidyDocImpl* doc );
496
+
497
+ ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
498
+
499
+ Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
500
+
501
+ Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
502
+
503
+
504
+ /* fixup doctype if missing */
505
+ Bool TY_(FixDocType)( TidyDocImpl* doc );
506
+
507
+ /* ensure XML document starts with <?xml version="1.0"?> */
508
+ /* add encoding attribute if not using ASCII or UTF-8 output */
509
+ Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
510
+
511
+ Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
512
+
513
+ void TY_(UngetToken)( TidyDocImpl* doc );
514
+
515
+
516
+ /*
517
+ modes for GetToken()
518
+
519
+ MixedContent -- for elements which don't accept PCDATA
520
+ Preformatted -- white space preserved as is
521
+ IgnoreMarkup -- for CDATA elements such as script, style
522
+ */
523
+ typedef enum
524
+ {
525
+ IgnoreWhitespace,
526
+ MixedContent,
527
+ Preformatted,
528
+ IgnoreMarkup,
529
+ CdataContent
530
+ } GetTokenMode;
531
+
532
+ Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
533
+
534
+ void TY_(InitMap)(void);
535
+
536
+
537
+ /* create a new attribute */
538
+ AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
539
+
540
+ /* create a new attribute with given name and value */
541
+ AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
542
+ int delim );
543
+
544
+ /* insert attribute at the end of attribute list of a node */
545
+ void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
546
+
547
+ /* insert attribute at the start of attribute list of a node */
548
+ void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
549
+
550
+ /*************************************
551
+ In-line Stack functions
552
+ *************************************/
553
+
554
+
555
+ /* duplicate attributes */
556
+ AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
557
+
558
+ /*
559
+ push a copy of an inline node onto stack
560
+ but don't push if implicit or OBJECT or APPLET
561
+ (implicit tags are ones generated from the istack)
562
+
563
+ One issue arises with pushing inlines when
564
+ the tag is already pushed. For instance:
565
+
566
+ <p><em>text
567
+ <p><em>more text
568
+
569
+ Shouldn't be mapped to
570
+
571
+ <p><em>text</em></p>
572
+ <p><em><em>more text</em></em>
573
+ */
574
+ void TY_(PushInline)( TidyDocImpl* doc, Node* node );
575
+
576
+ /* pop inline stack */
577
+ void TY_(PopInline)( TidyDocImpl* doc, Node* node );
578
+
579
+ Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
580
+ Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
581
+
582
+ /*
583
+ This has the effect of inserting "missing" inline
584
+ elements around the contents of blocklevel elements
585
+ such as P, TD, TH, DIV, PRE etc. This procedure is
586
+ called at the start of ParseBlock when the inline
587
+ stack is not empty, as will be the case in:
588
+
589
+ <i><h1>italic heading</h1></i>
590
+
591
+ which is then treated as equivalent to
592
+
593
+ <h1><i>italic heading</i></h1>
594
+
595
+ This is implemented by setting the lexer into a mode
596
+ where it gets tokens from the inline stack rather than
597
+ from the input stream.
598
+ */
599
+ int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
600
+
601
+ /*
602
+ defer duplicates when entering a table or other
603
+ element where the inlines shouldn't be duplicated
604
+ */
605
+ void TY_(DeferDup)( TidyDocImpl* doc );
606
+ Node* TY_(InsertedToken)( TidyDocImpl* doc );
607
+
608
+ /* stack manipulation for inline elements */
609
+ Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
610
+ Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
611
+
612
+ #ifdef __cplusplus
613
+ }
614
+ #endif
615
+
616
+
617
+ #endif /* __LEXER_H__ */