tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/tags.h ADDED
@@ -0,0 +1,235 @@
1
+ #ifndef __TAGS_H__
2
+ #define __TAGS_H__
3
+
4
+ /* tags.h -- recognize HTML tags
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/12/15 10:17:55 $
13
+ $Revision: 1.20 $
14
+
15
+ The HTML tags are stored as 8 bit ASCII strings.
16
+ Use lookupw() to find a tag given a wide char string.
17
+
18
+ */
19
+
20
+ #include "forward.h"
21
+ #include "attrdict.h"
22
+
23
+ typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
24
+ typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
25
+
26
+ /*
27
+ Tag dictionary node
28
+ */
29
+
30
+ /* types of tags that the user can define; every non-null value is a distinct power of two */
31
+ typedef enum
32
+ {
33
+ tagtype_null = 0, /* no type; passing it to TY_(FreeDeclaredTags) means "free all" (see that declaration) */
34
+ tagtype_empty = 1,
35
+ tagtype_inline = 2,
36
+ tagtype_block = 4,
37
+ tagtype_pre = 8
38
+ } UserTagType;
39
+
40
+ struct _Dict /* tag dictionary node */
41
+ {
42
+ TidyTagId id; /* tag identifier (a TidyTagId value) */
43
+ tmbstr name; /* tag name string */
44
+ uint versions; /* NOTE(review): presumably a bit set of document versions allowing this tag -- confirm in tags.c */
45
+ AttrVersion const * attrvers; /* read-only AttrVersion data; NOTE(review): presumably per-attribute version info -- confirm in attrdict.h */
46
+ uint model; /* NOTE(review): presumably the content-model bits tested by TY_(nodeHasCM) -- confirm */
47
+ Parser* parser; /* pointer to a Parser function (typedef above) */
48
+ CheckAttribs* chkattrs; /* pointer to a CheckAttribs function (typedef above) */
49
+ Dict* next; /* pointer to another Dict; NOTE(review): presumably links the user-declared tag list -- confirm */
50
+ };
51
+
52
+ #if !defined(ELEMENT_HASH_LOOKUP)
53
+ #define ELEMENT_HASH_LOOKUP 1
54
+ #endif
55
+
56
+ #if ELEMENT_HASH_LOOKUP
57
+ enum /* anonymous enum used only to name an integer constant */
58
+ {
59
+ ELEMENT_HASH_SIZE=178u /* number of slots in the hashtab array of struct _TidyTagImpl */
60
+ };
61
+
62
+ struct _DictHash /* entry type stored in the hashtab array of struct _TidyTagImpl */
63
+ {
64
+ Dict const* tag; /* read-only pointer to a tag dictionary node */
65
+ struct _DictHash* next; /* pointer to another entry; NOTE(review): presumably the collision chain of one slot -- confirm in tags.c */
66
+ };
67
+
68
+ typedef struct _DictHash DictHash;
69
+ #endif
70
+
71
+ struct _TidyTagImpl /* tag state; embedded by value as the "tags" member of struct _TidyDocImpl */
72
+ {
73
+ Dict* xml_tags; /* placeholder for all xml tags */
74
+ Dict* declared_tag_list; /* User declared tags */
75
+ #if ELEMENT_HASH_LOOKUP
76
+ DictHash* hashtab[ELEMENT_HASH_SIZE]; /* ELEMENT_HASH_SIZE entry pointers; member exists only when ELEMENT_HASH_LOOKUP is non-zero */
77
+ #endif
78
+ };
79
+
80
+ typedef struct _TidyTagImpl TidyTagImpl;
81
+
82
+ /* interface for finding tag by name */
83
+ const Dict* TY_(LookupTagDef)( TidyTagId tid );
84
+ Bool TY_(FindTag)( TidyDocImpl* doc, Node *node );
85
+ Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node );
86
+ void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name );
87
+ void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType ); /* tagtype_null to free all */
88
+
89
+ TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc );
90
+ ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType,
91
+ TidyIterator* iter );
92
+
93
+ void TY_(InitTags)( TidyDocImpl* doc );
94
+ void TY_(FreeTags)( TidyDocImpl* doc );
95
+
96
+
97
+ /* Parser methods for tags */
98
+
99
+ Parser TY_(ParseHTML);
100
+ Parser TY_(ParseHead);
101
+ Parser TY_(ParseTitle);
102
+ Parser TY_(ParseScript);
103
+ Parser TY_(ParseFrameSet);
104
+ Parser TY_(ParseNoFrames);
105
+ Parser TY_(ParseBody);
106
+ Parser TY_(ParsePre);
107
+ Parser TY_(ParseList);
108
+ Parser TY_(ParseDefList);
109
+ Parser TY_(ParseBlock);
110
+ Parser TY_(ParseInline);
111
+ Parser TY_(ParseEmpty);
112
+ Parser TY_(ParseTableTag);
113
+ Parser TY_(ParseColGroup);
114
+ Parser TY_(ParseRowGroup);
115
+ Parser TY_(ParseRow);
116
+ Parser TY_(ParseSelect);
117
+ Parser TY_(ParseOptGroup);
118
+ Parser TY_(ParseText);
119
+
120
+ CheckAttribs TY_(CheckAttributes);
121
+
122
+ /* 0 == TidyTag_UNKNOWN. Both macros accept a NULL node or a NULL node->tag (TagId yields TidyTag_UNKNOWN, TagIsId yields false), but they expand node more than once, so pass an expression without side effects. */
123
+ #define TagId(node) ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
124
+ #define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == (tid))
125
+
126
+ Bool TY_(nodeIsText)( Node* node );
127
+ Bool TY_(nodeIsElement)( Node* node );
128
+
129
+ Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node );
130
+
131
+ #if 0
132
+ /* Compare & result to operand. If equal, then all bits
133
+ ** requested are set.
134
+ */
135
+ Bool nodeMatchCM( Node* node, uint contentModel );
136
+ #endif
137
+
138
+ /* True if any of the bits requested are set.
139
+ */
140
+ Bool TY_(nodeHasCM)( Node* node, uint contentModel );
141
+
142
+ Bool TY_(nodeCMIsBlock)( Node* node );
143
+ Bool TY_(nodeCMIsInline)( Node* node );
144
+ Bool TY_(nodeCMIsEmpty)( Node* node );
145
+
146
+
147
+ Bool TY_(nodeIsHeader)( Node* node ); /* H1, H2, ..., H6 */
148
+ uint TY_(nodeHeaderLevel)( Node* node ); /* 1, 2, ..., 6 */
149
+
150
+ #define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML )
151
+ #define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD )
152
+ #define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE )
153
+ #define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE )
154
+ #define nodeIsMETA( node ) TagIsId( node, TidyTag_META )
155
+ #define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY )
156
+ #define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET )
157
+ #define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
158
+ #define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME )
159
+ #define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES )
160
+ #define nodeIsHR( node ) TagIsId( node, TidyTag_HR )
161
+ #define nodeIsH1( node ) TagIsId( node, TidyTag_H1 )
162
+ #define nodeIsH2( node ) TagIsId( node, TidyTag_H2 )
163
+ #define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE )
164
+ #define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING )
165
+ #define nodeIsP( node ) TagIsId( node, TidyTag_P )
166
+ #define nodeIsUL( node ) TagIsId( node, TidyTag_UL )
167
+ #define nodeIsOL( node ) TagIsId( node, TidyTag_OL )
168
+ #define nodeIsDL( node ) TagIsId( node, TidyTag_DL )
169
+ #define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR )
170
+ #define nodeIsLI( node ) TagIsId( node, TidyTag_LI )
171
+ #define nodeIsDT( node ) TagIsId( node, TidyTag_DT )
172
+ #define nodeIsDD( node ) TagIsId( node, TidyTag_DD )
173
+ #define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE )
174
+ #define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION )
175
+ #define nodeIsTD( node ) TagIsId( node, TidyTag_TD )
176
+ #define nodeIsTH( node ) TagIsId( node, TidyTag_TH )
177
+ #define nodeIsTR( node ) TagIsId( node, TidyTag_TR )
178
+ #define nodeIsCOL( node ) TagIsId( node, TidyTag_COL )
179
+ #define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP )
180
+ #define nodeIsBR( node ) TagIsId( node, TidyTag_BR )
181
+ #define nodeIsA( node ) TagIsId( node, TidyTag_A )
182
+ #define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK )
183
+ #define nodeIsB( node ) TagIsId( node, TidyTag_B )
184
+ #define nodeIsI( node ) TagIsId( node, TidyTag_I )
185
+ #define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG )
186
+ #define nodeIsEM( node ) TagIsId( node, TidyTag_EM )
187
+ #define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG )
188
+ #define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL )
189
+ #define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM )
190
+ #define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION )
191
+ #define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP )
192
+ #define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG )
193
+ #define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP )
194
+ #define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA )
195
+ #define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR )
196
+ #define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR )
197
+ #define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT )
198
+ #define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER )
199
+ #define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER )
200
+ #define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER )
201
+ #define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE )
202
+ #define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT )
203
+ #define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT )
204
+ #define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM )
205
+ #define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA )
206
+ #define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
207
+ #define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET )
208
+ #define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT )
209
+ #define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV )
210
+ #define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN )
211
+ #define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT )
212
+ #define nodeIsQ( node ) TagIsId( node, TidyTag_Q )
213
+ #define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL )
214
+ #define nodeIsH3( node ) TagIsId( node, TidyTag_H3 )
215
+ #define nodeIsH4( node ) TagIsId( node, TidyTag_H4 )
216
+ #define nodeIsH5( node ) TagIsId( node, TidyTag_H5 )
217
+ #define nodeIsH6( node ) TagIsId( node, TidyTag_H6 )
218
+ #define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS )
219
+ #define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP )
220
+ #define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT )
221
+ #define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK )
222
+ #define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE )
223
+ #define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED )
224
+ #define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT )
225
+ #define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX )
226
+ #define nodeIsS( node ) TagIsId( node, TidyTag_S )
227
+ #define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE )
228
+ #define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB )
229
+ #define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP )
230
+ #define nodeIsU( node ) TagIsId( node, TidyTag_U )
231
+ #define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
232
+ #define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
233
+
234
+
235
+ #endif /* __TAGS_H__ */
data/ext/tidy/tidy-int.h ADDED
@@ -0,0 +1,129 @@
1
+ #ifndef __TIDY_INT_H__
2
+ #define __TIDY_INT_H__
3
+
4
+ /* tidy-int.h -- internal library declarations
5
+
6
+ (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2007/02/11 09:45:52 $
13
+ $Revision: 1.13 $
14
+
15
+ */
16
+
17
+ #include "tidy.h"
18
+ #include "config.h"
19
+ #include "lexer.h"
20
+ #include "tags.h"
21
+ #include "attrs.h"
22
+ #include "pprint.h"
23
+ #include "access.h"
24
+
25
+ #ifndef MAX /* keep a MAX that is already defined at this point */
26
+ #define MAX(a,b) (((a) > (b))?(a):(b)) /* the selected argument is evaluated twice: avoid side effects */
27
+ #endif
28
+ #ifndef MIN /* keep a MIN that is already defined at this point */
29
+ #define MIN(a,b) (((a) < (b))?(a):(b)) /* the selected argument is evaluated twice: avoid side effects */
30
+ #endif
31
+
32
+ struct _TidyDocImpl
33
+ {
34
+ /* The Document Tree (and backing store buffer) */
35
+ Node root; /* This MUST remain the first declared
36
+ variable in this structure */
37
+ Lexer* lexer;
38
+
39
+ /* Config + Markup Declarations */
40
+ TidyConfigImpl config;
41
+ TidyTagImpl tags;
42
+ TidyAttribImpl attribs;
43
+
44
+ #if SUPPORT_ACCESSIBILITY_CHECKS
45
+ /* Accessibility Checks state */
46
+ TidyAccessImpl access;
47
+ #endif
48
+
49
+ /* The Pretty Print buffer */
50
+ TidyPrintImpl pprint;
51
+
52
+ /* I/O */
53
+ StreamIn* docIn;
54
+ StreamOut* docOut;
55
+ StreamOut* errout;
56
+ TidyReportFilter mssgFilt;
57
+ TidyOptCallback pOptCallback;
58
+
59
+ /* Parse + Repair Results */
60
+ uint optionErrors;
61
+ uint errors;
62
+ uint warnings;
63
+ uint accessErrors;
64
+ uint infoMessages;
65
+ uint docErrors;
66
+ int parseStatus;
67
+
68
+ uint badAccess; /* for accessibility errors */
69
+ uint badLayout; /* for bad style errors */
70
+ uint badChars; /* for bad char encodings */
71
+ uint badForm; /* for badly placed form tags */
72
+
73
+ /* Memory allocator */
74
+ TidyAllocator* allocator;
75
+
76
+ /* Miscellaneous */
77
+ void* appData;
78
+ uint nClassId;
79
+ Bool inputHadBOM;
80
+
81
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
82
+ Bool storeText;
83
+ #endif
84
+
85
+ #if PRESERVE_FILE_TIMES
86
+ struct utimbuf filetimes;
87
+ #endif
88
+ tmbstr givenDoctype;
89
+ };
90
+
91
+
92
+ /* Twizzle internal/external types */
93
+ #ifdef NEVER
94
+ TidyDocImpl* tidyDocToImpl( TidyDoc tdoc );
95
+ TidyDoc tidyImplToDoc( TidyDocImpl* impl );
96
+
97
+ Node* tidyNodeToImpl( TidyNode tnod );
98
+ TidyNode tidyImplToNode( Node* node );
99
+
100
+ AttVal* tidyAttrToImpl( TidyAttr tattr );
101
+ TidyAttr tidyImplToAttr( AttVal* attval );
102
+
103
+ const TidyOptionImpl* tidyOptionToImpl( TidyOption topt );
104
+ TidyOption tidyImplToOption( const TidyOptionImpl* option );
105
+ #else
106
+
107
+ #define tidyDocToImpl( tdoc ) ((TidyDocImpl*)(tdoc))
108
+ #define tidyImplToDoc( doc ) ((TidyDoc)(doc))
109
+
110
+ #define tidyNodeToImpl( tnod ) ((Node*)(tnod))
111
+ #define tidyImplToNode( node ) ((TidyNode)(node))
112
+
113
+ #define tidyAttrToImpl( tattr ) ((AttVal*)(tattr))
114
+ #define tidyImplToAttr( attval ) ((TidyAttr)(attval))
115
+
116
+ #define tidyOptionToImpl( topt ) ((const TidyOptionImpl*)(topt))
117
+ #define tidyImplToOption( option ) ((TidyOption)(option))
118
+
119
+ #endif
120
+
121
+ /** Wrappers for easy memory allocation using the document's allocator */
122
+ #define TidyDocAlloc(doc, size) TidyAlloc((doc)->allocator, size)
123
+ #define TidyDocRealloc(doc, block, size) TidyRealloc((doc)->allocator, block, size)
124
+ #define TidyDocFree(doc, block) TidyFree((doc)->allocator, block)
125
+ #define TidyDocPanic(doc, msg) TidyPanic((doc)->allocator, msg)
126
+
127
+ int TY_(DocParseStream)( TidyDocImpl* impl, StreamIn* in );
128
+
129
+ #endif /* __TIDY_INT_H__ */