tidy-ext 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/LICENSE +50 -0
- data/README +12 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/ext/tidy/access.c +3310 -0
- data/ext/tidy/access.h +279 -0
- data/ext/tidy/alloc.c +107 -0
- data/ext/tidy/attrask.c +209 -0
- data/ext/tidy/attrdict.c +2398 -0
- data/ext/tidy/attrdict.h +122 -0
- data/ext/tidy/attrget.c +213 -0
- data/ext/tidy/attrs.c +1911 -0
- data/ext/tidy/attrs.h +374 -0
- data/ext/tidy/buffio.c +232 -0
- data/ext/tidy/buffio.h +118 -0
- data/ext/tidy/charsets.c +1032 -0
- data/ext/tidy/charsets.h +14 -0
- data/ext/tidy/clean.c +2674 -0
- data/ext/tidy/clean.h +87 -0
- data/ext/tidy/config.c +1746 -0
- data/ext/tidy/config.h +153 -0
- data/ext/tidy/entities.c +419 -0
- data/ext/tidy/entities.h +24 -0
- data/ext/tidy/extconf.rb +5 -0
- data/ext/tidy/fileio.c +106 -0
- data/ext/tidy/fileio.h +46 -0
- data/ext/tidy/forward.h +69 -0
- data/ext/tidy/iconvtc.c +105 -0
- data/ext/tidy/iconvtc.h +15 -0
- data/ext/tidy/istack.c +373 -0
- data/ext/tidy/lexer.c +3825 -0
- data/ext/tidy/lexer.h +617 -0
- data/ext/tidy/localize.c +1882 -0
- data/ext/tidy/mappedio.c +329 -0
- data/ext/tidy/mappedio.h +16 -0
- data/ext/tidy/message.h +207 -0
- data/ext/tidy/parser.c +4408 -0
- data/ext/tidy/parser.h +76 -0
- data/ext/tidy/platform.h +636 -0
- data/ext/tidy/pprint.c +2276 -0
- data/ext/tidy/pprint.h +93 -0
- data/ext/tidy/ruby-tidy.c +195 -0
- data/ext/tidy/streamio.c +1407 -0
- data/ext/tidy/streamio.h +222 -0
- data/ext/tidy/tagask.c +286 -0
- data/ext/tidy/tags.c +955 -0
- data/ext/tidy/tags.h +235 -0
- data/ext/tidy/tidy-int.h +129 -0
- data/ext/tidy/tidy.h +1097 -0
- data/ext/tidy/tidyenum.h +622 -0
- data/ext/tidy/tidylib.c +1751 -0
- data/ext/tidy/tmbstr.c +306 -0
- data/ext/tidy/tmbstr.h +92 -0
- data/ext/tidy/utf8.c +539 -0
- data/ext/tidy/utf8.h +52 -0
- data/ext/tidy/version.h +14 -0
- data/ext/tidy/win32tc.c +795 -0
- data/ext/tidy/win32tc.h +19 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tidy/compat_spec.rb +44 -0
- data/spec/tidy/remote_uri_spec.rb +14 -0
- data/spec/tidy/test1.html +5 -0
- data/spec/tidy/tidy_spec.rb +34 -0
- metadata +125 -0
data/ext/tidy/lexer.h
ADDED
@@ -0,0 +1,617 @@
|
|
1
|
+
#ifndef __LEXER_H__
|
2
|
+
#define __LEXER_H__
|
3
|
+
|
4
|
+
/* lexer.h -- Lexer for html parser
|
5
|
+
|
6
|
+
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
7
|
+
See tidy.h for the copyright notice.
|
8
|
+
|
9
|
+
CVS Info:
|
10
|
+
$Author: arnaud02 $
|
11
|
+
$Date: 2008/03/22 21:06:11 $
|
12
|
+
$Revision: 1.41 $
|
13
|
+
|
14
|
+
*/
|
15
|
+
|
16
|
+
/*
|
17
|
+
Given an input source, it returns a sequence of tokens.
|
18
|
+
|
19
|
+
GetToken(source) gets the next token
|
20
|
+
UngetToken(source) provides one level undo
|
21
|
+
|
22
|
+
The tags include an attribute list:
|
23
|
+
|
24
|
+
- linked list of attribute/value nodes
|
25
|
+
- each node has 2 NULL-terminated strings.
|
26
|
+
- entities are replaced in attribute values
|
27
|
+
|
28
|
+
white space is compacted if not in preformatted mode
|
29
|
+
If not in preformatted mode then leading white space
|
30
|
+
is discarded and subsequent white space sequences
|
31
|
+
compacted to single space characters.
|
32
|
+
|
33
|
+
If XmlTags is no then Tag names are folded to upper
|
34
|
+
case and attribute names to lower case.
|
35
|
+
|
36
|
+
Not yet done:
|
37
|
+
- Doctype subset and marked sections
|
38
|
+
*/
|
39
|
+
|
40
|
+
#ifdef __cplusplus
|
41
|
+
extern "C" {
|
42
|
+
#endif
|
43
|
+
|
44
|
+
#include "forward.h"
|
45
|
+
|
46
|
+
/* lexer character types
|
47
|
+
*/
|
48
|
+
#define digit 1u
|
49
|
+
#define letter 2u
|
50
|
+
#define namechar 4u
|
51
|
+
#define white 8u
|
52
|
+
#define newline 16u
|
53
|
+
#define lowercase 32u
|
54
|
+
#define uppercase 64u
|
55
|
+
#define digithex 128u
|
56
|
+
|
57
|
+
|
58
|
+
/* node->type is one of these values
|
59
|
+
*/
|
60
|
+
typedef enum
|
61
|
+
{
|
62
|
+
RootNode,
|
63
|
+
DocTypeTag,
|
64
|
+
CommentTag,
|
65
|
+
ProcInsTag,
|
66
|
+
TextNode,
|
67
|
+
StartTag,
|
68
|
+
EndTag,
|
69
|
+
StartEndTag,
|
70
|
+
CDATATag,
|
71
|
+
SectionTag,
|
72
|
+
AspTag,
|
73
|
+
JsteTag,
|
74
|
+
PhpTag,
|
75
|
+
XmlDecl
|
76
|
+
} NodeType;
|
77
|
+
|
78
|
+
|
79
|
+
|
80
|
+
/* lexer GetToken states
|
81
|
+
*/
|
82
|
+
typedef enum
|
83
|
+
{
|
84
|
+
LEX_CONTENT,
|
85
|
+
LEX_GT,
|
86
|
+
LEX_ENDTAG,
|
87
|
+
LEX_STARTTAG,
|
88
|
+
LEX_COMMENT,
|
89
|
+
LEX_DOCTYPE,
|
90
|
+
LEX_PROCINSTR,
|
91
|
+
LEX_CDATA,
|
92
|
+
LEX_SECTION,
|
93
|
+
LEX_ASP,
|
94
|
+
LEX_JSTE,
|
95
|
+
LEX_PHP,
|
96
|
+
LEX_XMLDECL
|
97
|
+
} LexerState;
|
98
|
+
|
99
|
+
/* ParseDocTypeDecl state constants */
|
100
|
+
typedef enum
|
101
|
+
{
|
102
|
+
DT_INTERMEDIATE,
|
103
|
+
DT_DOCTYPENAME,
|
104
|
+
DT_PUBLICSYSTEM,
|
105
|
+
DT_QUOTEDSTRING,
|
106
|
+
DT_INTSUBSET
|
107
|
+
} ParseDocTypeDeclState;
|
108
|
+
|
109
|
+
/* content model shortcut encoding
|
110
|
+
|
111
|
+
Descriptions are tentative.
|
112
|
+
*/
|
113
|
+
#define CM_UNKNOWN 0
|
114
|
+
/* Elements with no content. Map to HTML specification. */
|
115
|
+
#define CM_EMPTY (1 << 0)
|
116
|
+
/* Elements that appear outside of "BODY". */
|
117
|
+
#define CM_HTML (1 << 1)
|
118
|
+
/* Elements that can appear within HEAD. */
|
119
|
+
#define CM_HEAD (1 << 2)
|
120
|
+
/* HTML "block" elements. */
|
121
|
+
#define CM_BLOCK (1 << 3)
|
122
|
+
/* HTML "inline" elements. */
|
123
|
+
#define CM_INLINE (1 << 4)
|
124
|
+
/* Elements that mark list item ("LI"). */
|
125
|
+
#define CM_LIST (1 << 5)
|
126
|
+
/* Elements that mark definition list item ("DD", "DT"). */
|
127
|
+
#define CM_DEFLIST (1 << 6)
|
128
|
+
/* Elements that can appear inside TABLE. */
|
129
|
+
#define CM_TABLE (1 << 7)
|
130
|
+
/* Used for "THEAD", "TFOOT" or "TBODY". */
|
131
|
+
#define CM_ROWGRP (1 << 8)
|
132
|
+
/* Used for "TD", "TH" */
|
133
|
+
#define CM_ROW (1 << 9)
|
134
|
+
/* Elements whose content must be protected against white space movement.
|
135
|
+
Includes some elements that can be found in forms. */
|
136
|
+
#define CM_FIELD (1 << 10)
|
137
|
+
/* Used to avoid propagating inline emphasis inside some elements
|
138
|
+
such as OBJECT or APPLET. */
|
139
|
+
#define CM_OBJECT (1 << 11)
|
140
|
+
/* Elements that allow "PARAM". */
|
141
|
+
#define CM_PARAM (1 << 12)
|
142
|
+
/* "FRAME", "FRAMESET", "NOFRAMES". Used in ParseFrameSet. */
|
143
|
+
#define CM_FRAMES (1 << 13)
|
144
|
+
/* Heading elements (h1, h2, ...). */
|
145
|
+
#define CM_HEADING (1 << 14)
|
146
|
+
/* Elements with an optional end tag. */
|
147
|
+
#define CM_OPT (1 << 15)
|
148
|
+
/* Elements that use "align" attribute for vertical position. */
|
149
|
+
#define CM_IMG (1 << 16)
|
150
|
+
/* Elements with inline and block model. Used to avoid calling InlineDup. */
|
151
|
+
#define CM_MIXED (1 << 17)
|
152
|
+
/* Elements whose content needs to be indented only if containing one
|
153
|
+
CM_BLOCK element. */
|
154
|
+
#define CM_NO_INDENT (1 << 18)
|
155
|
+
/* Elements that are obsolete (such as "dir", "menu"). */
|
156
|
+
#define CM_OBSOLETE (1 << 19)
|
157
|
+
/* User defined elements. Used to determine how attributes without value
|
158
|
+
should be printed. */
|
159
|
+
#define CM_NEW (1 << 20)
|
160
|
+
/* Elements that cannot be omitted. */
|
161
|
+
#define CM_OMITST (1 << 21)
|
162
|
+
|
163
|
+
/* If the document uses just HTML 2.0 tags and attributes, describe
|
164
|
+
** it as HTML 2.0. Similarly for HTML 3.2 and the 3 flavors of HTML 4.0.
|
165
|
+
** If there are proprietary tags and attributes then describe it as
|
166
|
+
** HTML Proprietary. If it includes the xml-lang or xmlns attributes
|
167
|
+
** but is otherwise HTML 2.0, 3.2 or 4.0 then describe it as one of the
|
168
|
+
** flavors of Voyager (strict, loose or frameset).
|
169
|
+
*/
|
170
|
+
|
171
|
+
/* unknown */
|
172
|
+
#define xxxx 0u
|
173
|
+
|
174
|
+
/* W3C defined HTML/XHTML family document types */
|
175
|
+
#define HT20 1u
|
176
|
+
#define HT32 2u
|
177
|
+
#define H40S 4u
|
178
|
+
#define H40T 8u
|
179
|
+
#define H40F 16u
|
180
|
+
#define H41S 32u
|
181
|
+
#define H41T 64u
|
182
|
+
#define H41F 128u
|
183
|
+
#define X10S 256u
|
184
|
+
#define X10T 512u
|
185
|
+
#define X10F 1024u
|
186
|
+
#define XH11 2048u
|
187
|
+
#define XB10 4096u
|
188
|
+
|
189
|
+
/* proprietary stuff */
|
190
|
+
#define VERS_SUN 8192u
|
191
|
+
#define VERS_NETSCAPE 16384u
|
192
|
+
#define VERS_MICROSOFT 32768u
|
193
|
+
|
194
|
+
/* special flag */
|
195
|
+
#define VERS_XML 65536u
|
196
|
+
|
197
|
+
/* compatibility symbols */
|
198
|
+
#define VERS_UNKNOWN (xxxx)
|
199
|
+
#define VERS_HTML20 (HT20)
|
200
|
+
#define VERS_HTML32 (HT32)
|
201
|
+
#define VERS_HTML40_STRICT (H40S|H41S|X10S)
|
202
|
+
#define VERS_HTML40_LOOSE (H40T|H41T|X10T)
|
203
|
+
#define VERS_FRAMESET (H40F|H41F|X10F)
|
204
|
+
#define VERS_XHTML11 (XH11)
|
205
|
+
#define VERS_BASIC (XB10)
|
206
|
+
|
207
|
+
/* meta symbols */
|
208
|
+
#define VERS_HTML40 (VERS_HTML40_STRICT|VERS_HTML40_LOOSE|VERS_FRAMESET)
|
209
|
+
#define VERS_IFRAME (VERS_HTML40_LOOSE|VERS_FRAMESET)
|
210
|
+
#define VERS_LOOSE (VERS_HTML20|VERS_HTML32|VERS_IFRAME)
|
211
|
+
#define VERS_EVENTS (VERS_HTML40|VERS_XHTML11)
|
212
|
+
#define VERS_FROM32 (VERS_HTML32|VERS_HTML40)
|
213
|
+
#define VERS_FROM40 (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
|
214
|
+
#define VERS_XHTML (X10S|X10T|X10F|XH11|XB10)
|
215
|
+
|
216
|
+
/* all W3C defined document types */
|
217
|
+
#define VERS_ALL (VERS_HTML20|VERS_HTML32|VERS_FROM40)
|
218
|
+
|
219
|
+
/* all proprietary types */
|
220
|
+
#define VERS_PROPRIETARY (VERS_NETSCAPE|VERS_MICROSOFT|VERS_SUN)
|
221
|
+
|
222
|
+
/* Linked list of class names and styles
|
223
|
+
*/
|
224
|
+
struct _Style;
|
225
|
+
typedef struct _Style TagStyle;
|
226
|
+
|
227
|
+
struct _Style /* one entry in the linked list of class names and styles (the Lexer keeps such a list in its 'styles' field) */
|
228
|
+
{
|
229
|
+
tmbstr tag; /* NOTE(review): presumably the element name this style entry belongs to -- confirm in clean.c */
|
230
|
+
tmbstr tag_class; /* class name for this entry */
|
231
|
+
tmbstr properties; /* NOTE(review): presumably the style property text associated with the class -- confirm */
|
232
|
+
TagStyle *next; /* next entry in the list */
|
233
|
+
};
|
234
|
+
|
235
|
+
|
236
|
+
/* Linked list of style properties
|
237
|
+
*/
|
238
|
+
struct _StyleProp;
|
239
|
+
typedef struct _StyleProp StyleProp;
|
240
|
+
|
241
|
+
struct _StyleProp /* one entry in a linked list of style properties */
|
242
|
+
{
|
243
|
+
tmbstr name; /* property name */
|
244
|
+
tmbstr value; /* property value */
|
245
|
+
StyleProp *next; /* next property in the list */
|
246
|
+
};
|
247
|
+
|
248
|
+
|
249
|
+
|
250
|
+
|
251
|
+
/* Attribute/Value linked list node
|
252
|
+
*/
|
253
|
+
|
254
|
+
struct _AttVal /* one attribute/value pair in a node's linked attribute list */
|
255
|
+
{
|
256
|
+
AttVal* next; /* next attribute in the list */
|
257
|
+
const Attribute* dict; /* NOTE(review): presumably the attribute's dictionary definition, by analogy with the 'tag' fields of type const Dict* -- confirm */
|
258
|
+
Node* asp; /* NOTE(review): presumably an ASP node attached to this attribute -- confirm in lexer.c */
|
259
|
+
Node* php; /* NOTE(review): presumably a PHP node attached to this attribute -- confirm in lexer.c */
|
260
|
+
int delim; /* NOTE(review): presumably the quote character delimiting the value; NewAttributeEx() takes the same 'delim' argument -- confirm */
|
261
|
+
tmbstr attribute; /* attribute name string */
|
262
|
+
tmbstr value; /* attribute value string; entities are replaced in attribute values */
|
263
|
+
};
|
264
|
+
|
265
|
+
|
266
|
+
|
267
|
+
/*
|
268
|
+
Mosaic handles inlines via a separate stack from other elements
|
269
|
+
We duplicate this to recover from inline markup errors such as:
|
270
|
+
|
271
|
+
<i>italic text
|
272
|
+
<p>more italic text</b> normal text
|
273
|
+
|
274
|
+
which for compatibility with Mosaic is mapped to:
|
275
|
+
|
276
|
+
<i>italic text</i>
|
277
|
+
<p><i>more italic text</i> normal text
|
278
|
+
|
279
|
+
Note that any inline end tag pops the effect of the current
|
280
|
+
inline start tag, so that </b> pops <i> in the above example.
|
281
|
+
*/
|
282
|
+
struct _IStack /* one entry of the inline stack used to recover from inline markup errors */
|
283
|
+
{
|
284
|
+
IStack* next; /* NOTE(review): link to another stack entry; the Lexer also tracks istacklength/istacksize, so confirm in istack.c how this link is used */
|
285
|
+
const Dict* tag; /* tag's dictionary definition */
|
286
|
+
tmbstr element; /* name (NULL for text nodes) */
|
287
|
+
AttVal* attributes; /* attribute list of the pushed inline; presumably a copy made with DupAttrs() -- confirm */
|
288
|
+
};
|
289
|
+
|
290
|
+
|
291
|
+
/* HTML/XHTML/XML Element, Comment, PI, DOCTYPE, XML Decl,
|
292
|
+
** etc. etc.
|
293
|
+
*/
|
294
|
+
|
295
|
+
struct _Node /* one node of the parse tree: element, text, comment, PI, DOCTYPE, XML declaration, ... */
|
296
|
+
{
|
297
|
+
Node* parent; /* tree structure */
|
298
|
+
Node* prev; /* previous peer node */
|
299
|
+
Node* next; /* next peer node; FreeNode() follows this link to free peers */
|
300
|
+
Node* content; /* child nodes (FreeNode() recurses through children); presumably the first child -- confirm */
|
301
|
+
Node* last; /* NOTE(review): presumably the last child, paired with 'content' -- confirm in parser.c */
|
302
|
+
|
303
|
+
AttVal* attributes; /* linked list of AttVal nodes holding the attribute/value pairs */
|
304
|
+
const Dict* was; /* old tag when it was changed */
|
305
|
+
const Dict* tag; /* tag's dictionary definition */
|
306
|
+
|
307
|
+
tmbstr element; /* name (NULL for text nodes) */
|
308
|
+
|
309
|
+
uint start; /* start of span onto text array */
|
310
|
+
uint end; /* end of span onto text array */
|
311
|
+
NodeType type; /* TextNode, StartTag, EndTag etc. */
|
312
|
+
|
313
|
+
uint line; /* current line of document */
|
314
|
+
uint column; /* current column of document */
|
315
|
+
|
316
|
+
Bool closed; /* true if closed by explicit end tag */
|
317
|
+
Bool implicit; /* true if inferred */
|
318
|
+
Bool linebreak; /* true if followed by a line break */
|
319
|
+
|
320
|
+
#ifdef TIDY_STORE_ORIGINAL_TEXT
|
321
|
+
tmbstr otext; /* NOTE(review): only present when TIDY_STORE_ORIGINAL_TEXT is defined; presumably the node's original source text -- confirm */
|
322
|
+
#endif
|
323
|
+
};
|
324
|
+
|
325
|
+
|
326
|
+
/*
|
327
|
+
The following are private to the lexer
|
328
|
+
Use NewLexer() to create a lexer, and
|
329
|
+
FreeLexer() to free it.
|
330
|
+
*/
|
331
|
+
|
332
|
+
struct _Lexer
|
333
|
+
{
|
334
|
+
#if 0 /* Move to TidyDocImpl */
|
335
|
+
StreamIn* in; /* document content input */
|
336
|
+
StreamOut* errout; /* error output stream */
|
337
|
+
|
338
|
+
uint badAccess; /* for accessibility errors */
|
339
|
+
uint badLayout; /* for bad style errors */
|
340
|
+
uint badChars; /* for bad character encodings */
|
341
|
+
uint badForm; /* for mismatched/mispositioned form tags */
|
342
|
+
uint warnings; /* count of warnings in this document */
|
343
|
+
uint errors; /* count of errors */
|
344
|
+
#endif
|
345
|
+
|
346
|
+
uint lines; /* lines seen */
|
347
|
+
uint columns; /* at start of current token */
|
348
|
+
Bool waswhite; /* used to collapse contiguous white space */
|
349
|
+
Bool pushed; /* true after token has been pushed back */
|
350
|
+
Bool insertspace; /* when space is moved after end tag */
|
351
|
+
Bool excludeBlocks; /* Netscape compatibility */
|
352
|
+
Bool exiled; /* true if moved out of table */
|
353
|
+
Bool isvoyager; /* true if xmlns attribute on html element */
|
354
|
+
uint versions; /* bit vector of HTML versions */
|
355
|
+
uint doctype; /* version as given by doctype (if any) */
|
356
|
+
uint versionEmitted; /* version of doctype emitted */
|
357
|
+
Bool bad_doctype; /* e.g. if html or PUBLIC is missing */
|
358
|
+
uint txtstart; /* start of current node */
|
359
|
+
uint txtend; /* end of current node */
|
360
|
+
LexerState state; /* state of lexer's finite state machine */
|
361
|
+
|
362
|
+
Node* token; /* last token returned by GetToken() */
|
363
|
+
Node* itoken; /* last duplicate inline returned by GetToken() */
|
364
|
+
Node* root; /* remember root node of the document */
|
365
|
+
Node* parent; /* remember parent node for CDATA elements */
|
366
|
+
|
367
|
+
Bool seenEndBody; /* true if a </body> tag has been encountered */
|
368
|
+
Bool seenEndHtml; /* true if a </html> tag has been encountered */
|
369
|
+
|
370
|
+
/*
|
371
|
+
Lexer character buffer
|
372
|
+
|
373
|
+
Parse tree nodes span onto this buffer
|
374
|
+
which contains the concatenated text
|
375
|
+
contents of all of the elements.
|
376
|
+
|
377
|
+
lexsize must be reset for each file.
|
378
|
+
*/
|
379
|
+
tmbstr lexbuf; /* MB character buffer */
|
380
|
+
uint lexlength; /* allocated */
|
381
|
+
uint lexsize; /* used */
|
382
|
+
|
383
|
+
/* Inline stack for compatibility with Mosaic */
|
384
|
+
Node* inode; /* for deferring text node */
|
385
|
+
IStack* insert; /* for inferring inline tags */
|
386
|
+
IStack* istack;
|
387
|
+
uint istacklength; /* allocated */
|
388
|
+
uint istacksize; /* used */
|
389
|
+
uint istackbase; /* start of frame */
|
390
|
+
|
391
|
+
TagStyle *styles; /* used for cleaning up presentation markup */
|
392
|
+
|
393
|
+
TidyAllocator* allocator; /* allocator */
|
394
|
+
|
395
|
+
#if 0
|
396
|
+
TidyDocImpl* doc; /* Pointer back to doc for error reporting */
|
397
|
+
#endif
|
398
|
+
};
|
399
|
+
|
400
|
+
|
401
|
+
/* Lexer Functions
|
402
|
+
*/
|
403
|
+
|
404
|
+
/* choose what version to use for new doctype */
|
405
|
+
int TY_(HTMLVersion)( TidyDocImpl* doc );
|
406
|
+
|
407
|
+
/* everything is allowed in proprietary version of HTML */
|
408
|
+
/* this is handled here rather than in the tag/attr dicts */
|
409
|
+
|
410
|
+
void TY_(ConstrainVersion)( TidyDocImpl* doc, uint vers );
|
411
|
+
|
412
|
+
Bool TY_(IsWhite)(uint c);
|
413
|
+
Bool TY_(IsDigit)(uint c);
|
414
|
+
Bool TY_(IsLetter)(uint c);
|
415
|
+
Bool TY_(IsNewline)(uint c);
|
416
|
+
Bool TY_(IsNamechar)(uint c);
|
417
|
+
Bool TY_(IsXMLLetter)(uint c);
|
418
|
+
Bool TY_(IsXMLNamechar)(uint c);
|
419
|
+
|
420
|
+
/* Bool IsLower(uint c); */
|
421
|
+
Bool TY_(IsUpper)(uint c);
|
422
|
+
uint TY_(ToLower)(uint c);
|
423
|
+
uint TY_(ToUpper)(uint c);
|
424
|
+
|
425
|
+
Lexer* TY_(NewLexer)( TidyDocImpl* doc );
|
426
|
+
void TY_(FreeLexer)( TidyDocImpl* doc );
|
427
|
+
|
428
|
+
/* store character c as UTF-8 encoded byte stream */
|
429
|
+
void TY_(AddCharToLexer)( Lexer *lexer, uint c );
|
430
|
+
|
431
|
+
/*
|
432
|
+
Used for elements and text nodes
|
433
|
+
element name is NULL for text nodes
|
434
|
+
start and end are offsets into lexbuf
|
435
|
+
which contains the textual content of
|
436
|
+
all elements in the parse tree.
|
437
|
+
|
438
|
+
parent and content allow traversal
|
439
|
+
of the parse tree in any direction.
|
440
|
+
attributes are represented as a linked
|
441
|
+
list of AttVal nodes which hold the
|
442
|
+
strings for attribute/value pairs.
|
443
|
+
*/
|
444
|
+
Node* TY_(NewNode)( TidyAllocator* allocator, Lexer* lexer );
|
445
|
+
|
446
|
+
|
447
|
+
/* used to clone heading nodes when split by an <HR> */
|
448
|
+
Node* TY_(CloneNode)( TidyDocImpl* doc, Node *element );
|
449
|
+
|
450
|
+
/* free node's attributes */
|
451
|
+
void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node );
|
452
|
+
|
453
|
+
/* doesn't repair attribute list linkage */
|
454
|
+
void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av );
|
455
|
+
|
456
|
+
/* detach attribute from node */
|
457
|
+
void TY_(DetachAttribute)( Node *node, AttVal *attr );
|
458
|
+
|
459
|
+
/* detach attribute from node then free it
|
460
|
+
*/
|
461
|
+
void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr );
|
462
|
+
|
463
|
+
/*
|
464
|
+
Free document nodes by iterating through peers and recursing
|
465
|
+
through children. Set next to NULL before calling FreeNode()
|
466
|
+
to avoid freeing peer nodes. Doesn't patch up prev/next links.
|
467
|
+
*/
|
468
|
+
void TY_(FreeNode)( TidyDocImpl* doc, Node *node );
|
469
|
+
|
470
|
+
Node* TY_(TextToken)( Lexer *lexer );
|
471
|
+
|
472
|
+
/* used for creating preformatted text from Word2000 */
|
473
|
+
Node* TY_(NewLineNode)( Lexer *lexer );
|
474
|
+
|
475
|
+
/* used for adding a &nbsp; for Word2000 */
|
476
|
+
Node* TY_(NewLiteralTextNode)(Lexer *lexer, ctmbstr txt );
|
477
|
+
|
478
|
+
void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str );
|
479
|
+
/* void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len ); */
|
480
|
+
|
481
|
+
/* find element */
|
482
|
+
Node* TY_(FindDocType)( TidyDocImpl* doc );
|
483
|
+
Node* TY_(FindHTML)( TidyDocImpl* doc );
|
484
|
+
Node* TY_(FindHEAD)( TidyDocImpl* doc );
|
485
|
+
Node* TY_(FindTITLE)(TidyDocImpl* doc);
|
486
|
+
Node* TY_(FindBody)( TidyDocImpl* doc );
|
487
|
+
Node* TY_(FindXmlDecl)(TidyDocImpl* doc);
|
488
|
+
|
489
|
+
/* Returns containing block element, if any */
|
490
|
+
Node* TY_(FindContainer)( Node* node );
|
491
|
+
|
492
|
+
/* add meta element for Tidy */
|
493
|
+
Bool TY_(AddGenerator)( TidyDocImpl* doc );
|
494
|
+
|
495
|
+
uint TY_(ApparentVersion)( TidyDocImpl* doc );
|
496
|
+
|
497
|
+
ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool isXhtml );
|
498
|
+
|
499
|
+
Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc );
|
500
|
+
|
501
|
+
Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc );
|
502
|
+
|
503
|
+
|
504
|
+
/* fixup doctype if missing */
|
505
|
+
Bool TY_(FixDocType)( TidyDocImpl* doc );
|
506
|
+
|
507
|
+
/* ensure XML document starts with <?xml version="1.0"?> */
|
508
|
+
/* add encoding attribute if not using ASCII or UTF-8 output */
|
509
|
+
Bool TY_(FixXmlDecl)( TidyDocImpl* doc );
|
510
|
+
|
511
|
+
Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id);
|
512
|
+
|
513
|
+
void TY_(UngetToken)( TidyDocImpl* doc );
|
514
|
+
|
515
|
+
|
516
|
+
/*
|
517
|
+
modes for GetToken()
|
518
|
+
|
519
|
+
MixedContent -- for elements which don't accept PCDATA
|
520
|
+
Preformatted -- white space preserved as is
|
521
|
+
IgnoreMarkup -- for CDATA elements such as script, style
|
522
|
+
*/
|
523
|
+
typedef enum /* tokenizing mode passed as the 'mode' argument of GetToken() */
|
524
|
+
{
|
525
|
+
IgnoreWhitespace, /* NOTE(review): not described in the "modes for GetToken()" comment; presumably the mode in which leading white space is discarded -- confirm in lexer.c */
|
526
|
+
MixedContent, /* for elements which don't accept PCDATA */
|
527
|
+
Preformatted, /* white space preserved as is */
|
528
|
+
IgnoreMarkup, /* for CDATA elements such as script, style */
|
529
|
+
CdataContent /* NOTE(review): not described in the "modes for GetToken()" comment; presumably related to CDATA element content -- confirm in lexer.c */
|
530
|
+
} GetTokenMode;
|
531
|
+
|
532
|
+
Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode );
|
533
|
+
|
534
|
+
void TY_(InitMap)(void);
|
535
|
+
|
536
|
+
|
537
|
+
/* create a new attribute */
|
538
|
+
AttVal* TY_(NewAttribute)( TidyDocImpl* doc );
|
539
|
+
|
540
|
+
/* create a new attribute with given name and value */
|
541
|
+
AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
|
542
|
+
int delim );
|
543
|
+
|
544
|
+
/* insert attribute at the end of attribute list of a node */
|
545
|
+
void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av );
|
546
|
+
|
547
|
+
/* insert attribute at the start of attribute list of a node */
|
548
|
+
void TY_(InsertAttributeAtStart)( Node *node, AttVal *av );
|
549
|
+
|
550
|
+
/*************************************
|
551
|
+
In-line Stack functions
|
552
|
+
*************************************/
|
553
|
+
|
554
|
+
|
555
|
+
/* duplicate attributes */
|
556
|
+
AttVal* TY_(DupAttrs)( TidyDocImpl* doc, AttVal* attrs );
|
557
|
+
|
558
|
+
/*
|
559
|
+
push a copy of an inline node onto stack
|
560
|
+
but don't push if implicit or OBJECT or APPLET
|
561
|
+
(implicit tags are ones generated from the istack)
|
562
|
+
|
563
|
+
One issue arises with pushing inlines when
|
564
|
+
the tag is already pushed. For instance:
|
565
|
+
|
566
|
+
<p><em>text
|
567
|
+
<p><em>more text
|
568
|
+
|
569
|
+
Shouldn't be mapped to
|
570
|
+
|
571
|
+
<p><em>text</em></p>
|
572
|
+
<p><em><em>more text</em></em>
|
573
|
+
*/
|
574
|
+
void TY_(PushInline)( TidyDocImpl* doc, Node* node );
|
575
|
+
|
576
|
+
/* pop inline stack */
|
577
|
+
void TY_(PopInline)( TidyDocImpl* doc, Node* node );
|
578
|
+
|
579
|
+
Bool TY_(IsPushed)( TidyDocImpl* doc, Node* node );
|
580
|
+
Bool TY_(IsPushedLast)( TidyDocImpl* doc, Node *element, Node *node );
|
581
|
+
|
582
|
+
/*
|
583
|
+
This has the effect of inserting "missing" inline
|
584
|
+
elements around the contents of blocklevel elements
|
585
|
+
such as P, TD, TH, DIV, PRE etc. This procedure is
|
586
|
+
called at the start of ParseBlock when the inline
|
587
|
+
stack is not empty, as will be the case in:
|
588
|
+
|
589
|
+
<i><h1>italic heading</h1></i>
|
590
|
+
|
591
|
+
which is then treated as equivalent to
|
592
|
+
|
593
|
+
<h1><i>italic heading</i></h1>
|
594
|
+
|
595
|
+
This is implemented by setting the lexer into a mode
|
596
|
+
where it gets tokens from the inline stack rather than
|
597
|
+
from the input stream.
|
598
|
+
*/
|
599
|
+
int TY_(InlineDup)( TidyDocImpl* doc, Node *node );
|
600
|
+
|
601
|
+
/*
|
602
|
+
defer duplicates when entering a table or other
|
603
|
+
element where the inlines shouldn't be duplicated
|
604
|
+
*/
|
605
|
+
void TY_(DeferDup)( TidyDocImpl* doc );
|
606
|
+
Node* TY_(InsertedToken)( TidyDocImpl* doc );
|
607
|
+
|
608
|
+
/* stack manipulation for inline elements */
|
609
|
+
Bool TY_(SwitchInline)( TidyDocImpl* doc, Node* element, Node* node );
|
610
|
+
Bool TY_(InlineDup1)( TidyDocImpl* doc, Node* node, Node* element );
|
611
|
+
|
612
|
+
#ifdef __cplusplus
|
613
|
+
}
|
614
|
+
#endif
|
615
|
+
|
616
|
+
|
617
|
+
#endif /* __LEXER_H__ */
|