tidy-ext 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/tags.h ADDED
@@ -0,0 +1,235 @@
1
+ #ifndef __TAGS_H__
2
+ #define __TAGS_H__
3
+
4
+ /* tags.h -- recognize HTML tags
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/12/15 10:17:55 $
13
+ $Revision: 1.20 $
14
+
15
+ The HTML tags are stored as 8 bit ASCII strings.
16
+ Use lookupw() to find a tag given a wide char string.
17
+
18
+ */
19
+
20
+ #include "forward.h"
21
+ #include "attrdict.h"
22
+
23
+ typedef void (Parser)( TidyDocImpl* doc, Node *node, GetTokenMode mode );
24
+ typedef void (CheckAttribs)( TidyDocImpl* doc, Node *node );
25
+
26
+ /*
27
+ Tag dictionary node
28
+ */
29
+
30
+ /* types of tags that the user can define; the non-zero values are distinct bits (1, 2, 4, 8) */
31
+ typedef enum
32
+ {
33
+ tagtype_null = 0, /* no type; per the FreeDeclaredTags note below, passing it frees all */
34
+ tagtype_empty = 1,
35
+ tagtype_inline = 2,
36
+ tagtype_block = 4,
37
+ tagtype_pre = 8
38
+ } UserTagType;
39
+
40
+ struct _Dict /* one tag dictionary node */
41
+ {
42
+ TidyTagId id; /* TidyTagId of this entry */
43
+ tmbstr name; /* tag name (the file header says tags are stored as 8 bit ASCII strings) */
44
+ uint versions; /* NOTE(review): presumably a bitmask of document versions allowing the tag -- confirm */
45
+ AttrVersion const * attrvers; /* read-only AttrVersion data; type presumably comes from attrdict.h -- confirm */
46
+ uint model; /* NOTE(review): presumably the content-model bits tested by TY_(nodeHasCM) -- confirm */
47
+ Parser* parser; /* pointer to a function with the Parser signature typedef'd above */
48
+ CheckAttribs* chkattrs; /* pointer to a function with the CheckAttribs signature typedef'd above */
49
+ Dict* next; /* link to another Dict entry */
50
+ };
51
+
52
+ #if !defined(ELEMENT_HASH_LOOKUP) /* hash lookup defaults to enabled unless defined beforehand */
53
+ #define ELEMENT_HASH_LOOKUP 1
54
+ #endif
55
+
56
+ #if ELEMENT_HASH_LOOKUP /* the hash table types below exist only when the feature is enabled */
57
+ enum
58
+ {
59
+ ELEMENT_HASH_SIZE=178u /* number of slots in TidyTagImpl.hashtab */
60
+ };
61
+
62
+ struct _DictHash /* one hash table entry referring to a tag dictionary node */
63
+ {
64
+ Dict const* tag; /* the dictionary entry; not modifiable through this pointer */
65
+ struct _DictHash* next; /* link to another entry; presumably the per-slot collision chain -- confirm */
66
+ };
67
+
68
+ typedef struct _DictHash DictHash;
69
+ #endif
70
+
71
+ struct _TidyTagImpl /* tag state; embedded by value as the `tags` member of struct _TidyDocImpl */
72
+ {
73
+ Dict* xml_tags; /* placeholder for all xml tags */
74
+ Dict* declared_tag_list; /* User declared tags */
75
+ #if ELEMENT_HASH_LOOKUP
76
+ DictHash* hashtab[ELEMENT_HASH_SIZE]; /* ELEMENT_HASH_SIZE slots of DictHash pointers; present only with hash lookup */
77
+ #endif
78
+ };
79
+
80
+ typedef struct _TidyTagImpl TidyTagImpl;
81
+
82
+ /* interface for finding tags (by id, or for a node) and for managing user-declared tags */
83
+ const Dict* TY_(LookupTagDef)( TidyTagId tid );
84
+ Bool TY_(FindTag)( TidyDocImpl* doc, Node *node );
85
+ Parser* TY_(FindParser)( TidyDocImpl* doc, Node *node );
86
+ void TY_(DefineTag)( TidyDocImpl* doc, UserTagType tagType, ctmbstr name );
87
+ void TY_(FreeDeclaredTags)( TidyDocImpl* doc, UserTagType tagType ); /* tagtype_null to free all */
88
+
89
+ TidyIterator TY_(GetDeclaredTagList)( TidyDocImpl* doc );
90
+ ctmbstr TY_(GetNextDeclaredTag)( TidyDocImpl* doc, UserTagType tagType,
91
+ TidyIterator* iter );
92
+
93
+ void TY_(InitTags)( TidyDocImpl* doc );
94
+ void TY_(FreeTags)( TidyDocImpl* doc );
95
+
96
+
97
+ /* Parser methods for tags */
98
+
99
+ Parser TY_(ParseHTML);
100
+ Parser TY_(ParseHead);
101
+ Parser TY_(ParseTitle);
102
+ Parser TY_(ParseScript);
103
+ Parser TY_(ParseFrameSet);
104
+ Parser TY_(ParseNoFrames);
105
+ Parser TY_(ParseBody);
106
+ Parser TY_(ParsePre);
107
+ Parser TY_(ParseList);
108
+ Parser TY_(ParseDefList);
109
+ Parser TY_(ParseBlock);
110
+ Parser TY_(ParseInline);
111
+ Parser TY_(ParseEmpty);
112
+ Parser TY_(ParseTableTag);
113
+ Parser TY_(ParseColGroup);
114
+ Parser TY_(ParseRowGroup);
115
+ Parser TY_(ParseRow);
116
+ Parser TY_(ParseSelect);
117
+ Parser TY_(ParseOptGroup);
118
+ Parser TY_(ParseText);
119
+
120
+ CheckAttribs TY_(CheckAttributes);
121
+
122
+ /* 0 == TidyTag_UNKNOWN. Both macros accept a NULL node or a node with no tag; every parameter is parenthesized so any expression may be passed, but `node` is evaluated more than once, so avoid side effects. */
123
+ #define TagId(node) ((node) && (node)->tag ? (node)->tag->id : TidyTag_UNKNOWN)
124
+ #define TagIsId(node, tid) ((node) && (node)->tag && (node)->tag->id == (tid))
125
+
126
+ Bool TY_(nodeIsText)( Node* node );
127
+ Bool TY_(nodeIsElement)( Node* node );
128
+
129
+ Bool TY_(nodeHasText)( TidyDocImpl* doc, Node* node );
130
+
131
+ #if 0
132
+ /* Compare & result to operand. If equal, then all bits
133
+ ** requested are set.
134
+ */
135
+ Bool nodeMatchCM( Node* node, uint contentModel );
136
+ #endif
137
+
138
+ /* True if any of the bits requested are set.
139
+ */
140
+ Bool TY_(nodeHasCM)( Node* node, uint contentModel );
141
+
142
+ Bool TY_(nodeCMIsBlock)( Node* node );
143
+ Bool TY_(nodeCMIsInline)( Node* node );
144
+ Bool TY_(nodeCMIsEmpty)( Node* node );
145
+
146
+
147
+ Bool TY_(nodeIsHeader)( Node* node ); /* H1, H2, ..., H6 */
148
+ uint TY_(nodeHeaderLevel)( Node* node ); /* 1, 2, ..., 6 */
149
+
150
+ #define nodeIsHTML( node ) TagIsId( node, TidyTag_HTML )
151
+ #define nodeIsHEAD( node ) TagIsId( node, TidyTag_HEAD )
152
+ #define nodeIsTITLE( node ) TagIsId( node, TidyTag_TITLE )
153
+ #define nodeIsBASE( node ) TagIsId( node, TidyTag_BASE )
154
+ #define nodeIsMETA( node ) TagIsId( node, TidyTag_META )
155
+ #define nodeIsBODY( node ) TagIsId( node, TidyTag_BODY )
156
+ #define nodeIsFRAMESET( node ) TagIsId( node, TidyTag_FRAMESET )
157
+ #define nodeIsFRAME( node ) TagIsId( node, TidyTag_FRAME )
158
+ #define nodeIsIFRAME( node ) TagIsId( node, TidyTag_IFRAME )
159
+ #define nodeIsNOFRAMES( node ) TagIsId( node, TidyTag_NOFRAMES )
160
+ #define nodeIsHR( node ) TagIsId( node, TidyTag_HR )
161
+ #define nodeIsH1( node ) TagIsId( node, TidyTag_H1 )
162
+ #define nodeIsH2( node ) TagIsId( node, TidyTag_H2 )
163
+ #define nodeIsPRE( node ) TagIsId( node, TidyTag_PRE )
164
+ #define nodeIsLISTING( node ) TagIsId( node, TidyTag_LISTING )
165
+ #define nodeIsP( node ) TagIsId( node, TidyTag_P )
166
+ #define nodeIsUL( node ) TagIsId( node, TidyTag_UL )
167
+ #define nodeIsOL( node ) TagIsId( node, TidyTag_OL )
168
+ #define nodeIsDL( node ) TagIsId( node, TidyTag_DL )
169
+ #define nodeIsDIR( node ) TagIsId( node, TidyTag_DIR )
170
+ #define nodeIsLI( node ) TagIsId( node, TidyTag_LI )
171
+ #define nodeIsDT( node ) TagIsId( node, TidyTag_DT )
172
+ #define nodeIsDD( node ) TagIsId( node, TidyTag_DD )
173
+ #define nodeIsTABLE( node ) TagIsId( node, TidyTag_TABLE )
174
+ #define nodeIsCAPTION( node ) TagIsId( node, TidyTag_CAPTION )
175
+ #define nodeIsTD( node ) TagIsId( node, TidyTag_TD )
176
+ #define nodeIsTH( node ) TagIsId( node, TidyTag_TH )
177
+ #define nodeIsTR( node ) TagIsId( node, TidyTag_TR )
178
+ #define nodeIsCOL( node ) TagIsId( node, TidyTag_COL )
179
+ #define nodeIsCOLGROUP( node ) TagIsId( node, TidyTag_COLGROUP )
180
+ #define nodeIsBR( node ) TagIsId( node, TidyTag_BR )
181
+ #define nodeIsA( node ) TagIsId( node, TidyTag_A )
182
+ #define nodeIsLINK( node ) TagIsId( node, TidyTag_LINK )
183
+ #define nodeIsB( node ) TagIsId( node, TidyTag_B )
184
+ #define nodeIsI( node ) TagIsId( node, TidyTag_I )
185
+ #define nodeIsSTRONG( node ) TagIsId( node, TidyTag_STRONG )
186
+ #define nodeIsEM( node ) TagIsId( node, TidyTag_EM )
187
+ #define nodeIsBIG( node ) TagIsId( node, TidyTag_BIG )
188
+ #define nodeIsSMALL( node ) TagIsId( node, TidyTag_SMALL )
189
+ #define nodeIsPARAM( node ) TagIsId( node, TidyTag_PARAM )
190
+ #define nodeIsOPTION( node ) TagIsId( node, TidyTag_OPTION )
191
+ #define nodeIsOPTGROUP( node ) TagIsId( node, TidyTag_OPTGROUP )
192
+ #define nodeIsIMG( node ) TagIsId( node, TidyTag_IMG )
193
+ #define nodeIsMAP( node ) TagIsId( node, TidyTag_MAP )
194
+ #define nodeIsAREA( node ) TagIsId( node, TidyTag_AREA )
195
+ #define nodeIsNOBR( node ) TagIsId( node, TidyTag_NOBR )
196
+ #define nodeIsWBR( node ) TagIsId( node, TidyTag_WBR )
197
+ #define nodeIsFONT( node ) TagIsId( node, TidyTag_FONT )
198
+ #define nodeIsLAYER( node ) TagIsId( node, TidyTag_LAYER )
199
+ #define nodeIsSPACER( node ) TagIsId( node, TidyTag_SPACER )
200
+ #define nodeIsCENTER( node ) TagIsId( node, TidyTag_CENTER )
201
+ #define nodeIsSTYLE( node ) TagIsId( node, TidyTag_STYLE )
202
+ #define nodeIsSCRIPT( node ) TagIsId( node, TidyTag_SCRIPT )
203
+ #define nodeIsNOSCRIPT( node ) TagIsId( node, TidyTag_NOSCRIPT )
204
+ #define nodeIsFORM( node ) TagIsId( node, TidyTag_FORM )
205
+ #define nodeIsTEXTAREA( node ) TagIsId( node, TidyTag_TEXTAREA )
206
+ #define nodeIsBLOCKQUOTE( node ) TagIsId( node, TidyTag_BLOCKQUOTE )
207
+ #define nodeIsAPPLET( node ) TagIsId( node, TidyTag_APPLET )
208
+ #define nodeIsOBJECT( node ) TagIsId( node, TidyTag_OBJECT )
209
+ #define nodeIsDIV( node ) TagIsId( node, TidyTag_DIV )
210
+ #define nodeIsSPAN( node ) TagIsId( node, TidyTag_SPAN )
211
+ #define nodeIsINPUT( node ) TagIsId( node, TidyTag_INPUT )
212
+ #define nodeIsQ( node ) TagIsId( node, TidyTag_Q )
213
+ #define nodeIsLABEL( node ) TagIsId( node, TidyTag_LABEL )
214
+ #define nodeIsH3( node ) TagIsId( node, TidyTag_H3 )
215
+ #define nodeIsH4( node ) TagIsId( node, TidyTag_H4 )
216
+ #define nodeIsH5( node ) TagIsId( node, TidyTag_H5 )
217
+ #define nodeIsH6( node ) TagIsId( node, TidyTag_H6 )
218
+ #define nodeIsADDRESS( node ) TagIsId( node, TidyTag_ADDRESS )
219
+ #define nodeIsXMP( node ) TagIsId( node, TidyTag_XMP )
220
+ #define nodeIsSELECT( node ) TagIsId( node, TidyTag_SELECT )
221
+ #define nodeIsBLINK( node ) TagIsId( node, TidyTag_BLINK )
222
+ #define nodeIsMARQUEE( node ) TagIsId( node, TidyTag_MARQUEE )
223
+ #define nodeIsEMBED( node ) TagIsId( node, TidyTag_EMBED )
224
+ #define nodeIsBASEFONT( node ) TagIsId( node, TidyTag_BASEFONT )
225
+ #define nodeIsISINDEX( node ) TagIsId( node, TidyTag_ISINDEX )
226
+ #define nodeIsS( node ) TagIsId( node, TidyTag_S )
227
+ #define nodeIsSTRIKE( node ) TagIsId( node, TidyTag_STRIKE )
228
+ #define nodeIsSUB( node ) TagIsId( node, TidyTag_SUB )
229
+ #define nodeIsSUP( node ) TagIsId( node, TidyTag_SUP )
230
+ #define nodeIsU( node ) TagIsId( node, TidyTag_U )
231
+ #define nodeIsMENU( node ) TagIsId( node, TidyTag_MENU )
232
+ #define nodeIsBUTTON( node ) TagIsId( node, TidyTag_BUTTON )
233
+
234
+
235
+ #endif /* __TAGS_H__ */
data/ext/tidy/tidy-int.h ADDED
@@ -0,0 +1,129 @@
1
+ #ifndef __TIDY_INT_H__
2
+ #define __TIDY_INT_H__
3
+
4
+ /* tidy-int.h -- internal library declarations
5
+
6
+ (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2007/02/11 09:45:52 $
13
+ $Revision: 1.13 $
14
+
15
+ */
16
+
17
+ #include "tidy.h"
18
+ #include "config.h"
19
+ #include "lexer.h"
20
+ #include "tags.h"
21
+ #include "attrs.h"
22
+ #include "pprint.h"
23
+ #include "access.h"
24
+
25
+ #ifndef MAX
26
+ #define MAX(a,b) (((a) > (b))?(a):(b))
27
+ #endif
28
+ #ifndef MIN
29
+ #define MIN(a,b) (((a) < (b))?(a):(b))
30
+ #endif
31
+
32
+ struct _TidyDocImpl /* internal per-document state: tree, configuration, I/O streams and result counters; presumably typedef'd as TidyDocImpl elsewhere -- confirm */
33
+ {
34
+ /* The Document Tree (and backing store buffer) */
35
+ Node root; /* This MUST remain the first declared
36
+ variable in this structure */
37
+ Lexer* lexer;
38
+
39
+ /* Config + Markup Declarations */
40
+ TidyConfigImpl config;
41
+ TidyTagImpl tags;
42
+ TidyAttribImpl attribs;
43
+
44
+ #if SUPPORT_ACCESSIBILITY_CHECKS
45
+ /* Accessibility Checks state */
46
+ TidyAccessImpl access;
47
+ #endif
48
+
49
+ /* The Pretty Print buffer */
50
+ TidyPrintImpl pprint;
51
+
52
+ /* I/O */
53
+ StreamIn* docIn;
54
+ StreamOut* docOut;
55
+ StreamOut* errout;
56
+ TidyReportFilter mssgFilt;
57
+ TidyOptCallback pOptCallback;
58
+
59
+ /* Parse + Repair Results */
60
+ uint optionErrors;
61
+ uint errors;
62
+ uint warnings;
63
+ uint accessErrors;
64
+ uint infoMessages;
65
+ uint docErrors;
66
+ int parseStatus;
67
+
68
+ uint badAccess; /* for accessibility errors */
69
+ uint badLayout; /* for bad style errors */
70
+ uint badChars; /* for bad char encodings */
71
+ uint badForm; /* for badly placed form tags */
72
+
73
+ /* Memory allocator */
74
+ TidyAllocator* allocator; /* the allocator the TidyDocAlloc/Realloc/Free/Panic macros below pass on */
75
+
76
+ /* Miscellaneous */
77
+ void* appData;
78
+ uint nClassId;
79
+ Bool inputHadBOM;
80
+
81
+ #ifdef TIDY_STORE_ORIGINAL_TEXT
82
+ Bool storeText;
83
+ #endif
84
+
85
+ #if PRESERVE_FILE_TIMES
86
+ struct utimbuf filetimes;
87
+ #endif
88
+ tmbstr givenDoctype;
89
+ };
90
+
91
+
92
+ /* Twizzle internal/external types: convert between the public handle types and the internal implementation types */
93
+ #ifdef NEVER /* prototype form of the conversions; compiled only when NEVER is defined */
94
+ TidyDocImpl* tidyDocToImpl( TidyDoc tdoc );
95
+ TidyDoc tidyImplToDoc( TidyDocImpl* impl );
96
+
97
+ Node* tidyNodeToImpl( TidyNode tnod );
98
+ TidyNode tidyImplToNode( Node* node );
99
+
100
+ AttVal* tidyAttrToImpl( TidyAttr tattr );
101
+ TidyAttr tidyImplToAttr( AttVal* attval );
102
+
103
+ const TidyOptionImpl* tidyOptionToImpl( TidyOption topt );
104
+ TidyOption tidyImplToOption( const TidyOptionImpl* option );
105
+ #else /* otherwise each conversion is a plain cast with no checking */
106
+
107
+ #define tidyDocToImpl( tdoc ) ((TidyDocImpl*)(tdoc))
108
+ #define tidyImplToDoc( doc ) ((TidyDoc)(doc))
109
+
110
+ #define tidyNodeToImpl( tnod ) ((Node*)(tnod))
111
+ #define tidyImplToNode( node ) ((TidyNode)(node))
112
+
113
+ #define tidyAttrToImpl( tattr ) ((AttVal*)(tattr))
114
+ #define tidyImplToAttr( attval ) ((TidyAttr)(attval))
115
+
116
+ #define tidyOptionToImpl( topt ) ((const TidyOptionImpl*)(topt))
117
+ #define tidyImplToOption( option ) ((TidyOption)(option))
118
+
119
+ #endif
120
+
121
+ /** Wrappers for easy memory allocation using the document's allocator */
122
+ #define TidyDocAlloc(doc, size) TidyAlloc((doc)->allocator, size)
123
+ #define TidyDocRealloc(doc, block, size) TidyRealloc((doc)->allocator, block, size)
124
+ #define TidyDocFree(doc, block) TidyFree((doc)->allocator, block)
125
+ #define TidyDocPanic(doc, msg) TidyPanic((doc)->allocator, msg)
126
+
127
+ int TY_(DocParseStream)( TidyDocImpl* impl, StreamIn* in );
128
+
129
+ #endif /* __TIDY_INT_H__ */