tidy-ext 0.1.7

Sign up to get free protection for your applications and to get access to all the features.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/config.h ADDED
@@ -0,0 +1,153 @@
1
+ #ifndef __CONFIG_H__
2
+ #define __CONFIG_H__
3
+
4
+ /* config.h -- read config file and manage config properties
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/12/29 16:31:08 $
13
+ $Revision: 1.14 $
14
+
15
+ config files associate a property name with a value.
16
+
17
+ // comments can start at the beginning of a line
18
+ # comments can start at the beginning of a line
19
+ name: short values fit onto one line
20
+ name: a really long value that
21
+ continues on the next line
22
+
23
+ property names are case insensitive and should be less than
24
+ 60 characters in length and must start at the beginning of
25
+ the line, as whitespace at the start of a line signifies a
26
+ line continuation.
27
+
28
+ */
29
+
30
+ #include "forward.h"
31
+ #include "tidy.h"
32
+ #include "streamio.h"
33
+
34
+ struct _tidy_option;
35
+ typedef struct _tidy_option TidyOptionImpl;
36
+
37
+ typedef Bool (ParseProperty)( TidyDocImpl* doc, const TidyOptionImpl* opt );
38
+
39
+ struct _tidy_option /* static descriptor of one configuration option */
40
+ {
41
+ TidyOptionId id; /* identifier of this option */
42
+ TidyConfigCategory category; /* category used to group related options */
43
+ ctmbstr name; /* property name as written in a config file */
44
+ TidyOptionType type; /* string, int or bool */
45
+ ulong dflt; /* default for TidyInteger and TidyBoolean */
46
+ ParseProperty* parser; /* parsing method; the option is read-only if NULL */
47
+ const ctmbstr* pickList; /* pick list (presumably the accepted value names; confirm in config.c) */
48
+ ctmbstr pdflt; /* default for TidyString */
49
+ };
50
+
51
+ typedef union /* storage for one option value; which member is valid depends on the option type */
52
+ {
53
+ ulong v; /* Value for TidyInteger and TidyBoolean */
54
+ char *p; /* Value for TidyString */
55
+ } TidyOptionValue;
56
+
57
+ typedef struct _tidy_config /* configuration state; the cfg() macros read it via (doc)->config */
58
+ {
59
+ TidyOptionValue value[ N_TIDY_OPTIONS + 1 ]; /* current config values, indexed by option id (see cfg() macros) */
60
+ TidyOptionValue snapshot[ N_TIDY_OPTIONS + 1 ]; /* Snapshot of values to be restored later; same size as value[] */
61
+
62
+ /* track what tags user has defined to eliminate unnecessary searches */
63
+ uint defined_tags;
64
+
65
+ uint c; /* current char in input stream (used while reading a config source) */
66
+ StreamIn* cfgIn; /* current input source */
67
+
68
+ } TidyConfigImpl;
69
+
70
+
71
+ typedef struct { /* documentation record for one option, as returned by TY_(OptGetDocDesc) */
72
+ TidyOptionId opt; /**< Identifier of the documented option. */
73
+ ctmbstr doc; /**< HTML text */
74
+ TidyOptionId const *links; /**< Cross references to other options.
75
+ Last element must be 'TidyUnknownOption' (array terminator). */
76
+ } TidyOptionDoc;
77
+
78
+
79
+ const TidyOptionImpl* TY_(lookupOption)( ctmbstr optnam );
80
+ const TidyOptionImpl* TY_(getOption)( TidyOptionId optId );
81
+
82
+ TidyIterator TY_(getOptionList)( TidyDocImpl* doc );
83
+ const TidyOptionImpl* TY_(getNextOption)( TidyDocImpl* doc, TidyIterator* iter );
84
+
85
+ TidyIterator TY_(getOptionPickList)( const TidyOptionImpl* option );
86
+ ctmbstr TY_(getNextOptionPick)( const TidyOptionImpl* option, TidyIterator* iter );
87
+
88
+ const TidyOptionDoc* TY_(OptGetDocDesc)( TidyOptionId optId );
89
+
90
+ void TY_(InitConfig)( TidyDocImpl* doc );
91
+ void TY_(FreeConfig)( TidyDocImpl* doc );
92
+
93
+ /* Bool SetOptionValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr val ); */
94
+ Bool TY_(SetOptionInt)( TidyDocImpl* doc, TidyOptionId optId, ulong val );
95
+ Bool TY_(SetOptionBool)( TidyDocImpl* doc, TidyOptionId optId, Bool val );
96
+
97
+ Bool TY_(ResetOptionToDefault)( TidyDocImpl* doc, TidyOptionId optId );
98
+ void TY_(ResetConfigToDefault)( TidyDocImpl* doc );
99
+ void TY_(TakeConfigSnapshot)( TidyDocImpl* doc );
100
+ void TY_(ResetConfigToSnapshot)( TidyDocImpl* doc );
101
+
102
+ void TY_(CopyConfig)( TidyDocImpl* docTo, TidyDocImpl* docFrom );
103
+
104
+ int TY_(ParseConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
105
+ int TY_(ParseConfigFileEnc)( TidyDocImpl* doc,
106
+ ctmbstr cfgfil, ctmbstr charenc );
107
+
108
+ int TY_(SaveConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
109
+ int TY_(SaveConfigSink)( TidyDocImpl* doc, TidyOutputSink* sink );
110
+
111
+ /* returns false if unknown option, missing parameter, or
112
+ option doesn't use parameter
113
+ */
114
+ Bool TY_(ParseConfigOption)( TidyDocImpl* doc, ctmbstr optnam, ctmbstr optVal );
115
+ Bool TY_(ParseConfigValue)( TidyDocImpl* doc, TidyOptionId optId, ctmbstr optVal );
116
+
117
+ /* ensure that char encodings are self consistent */
118
+ Bool TY_(AdjustCharEncoding)( TidyDocImpl* doc, int encoding );
119
+
120
+ Bool TY_(ConfigDiffThanDefault)( TidyDocImpl* doc );
121
+ Bool TY_(ConfigDiffThanSnapshot)( TidyDocImpl* doc );
122
+
123
+ int TY_(CharEncodingId)( TidyDocImpl* doc, ctmbstr charenc );
124
+ ctmbstr TY_(CharEncodingName)( int encoding );
125
+ ctmbstr TY_(CharEncodingOptName)( int encoding );
126
+
127
+ /* void SetEmacsFilename( TidyDocImpl* doc, ctmbstr filename ); */
128
+
129
+
130
+ #ifdef _DEBUG
131
+
132
+ /* Debug lookup functions will be type-safe and assert option type match.
   In debug builds every cfg*() accessor is routed through a real function. */
133
+ ulong TY_(_cfgGet)( TidyDocImpl* doc, TidyOptionId optId );
134
+ Bool TY_(_cfgGetBool)( TidyDocImpl* doc, TidyOptionId optId );
135
+ TidyTriState TY_(_cfgGetAutoBool)( TidyDocImpl* doc, TidyOptionId optId );
136
+ ctmbstr TY_(_cfgGetString)( TidyDocImpl* doc, TidyOptionId optId );
137
+
138
+ #define cfg(doc, id) TY_(_cfgGet)( (doc), (id) )
139
+ #define cfgBool(doc, id) TY_(_cfgGetBool)( (doc), (id) )
140
+ #define cfgAutoBool(doc, id) TY_(_cfgGetAutoBool)( (doc), (id) )
141
+ #define cfgStr(doc, id) TY_(_cfgGetString)( (doc), (id) )
142
+
143
+ #else
144
+
145
+ /* Release build macros for speed: they index config.value[] directly with
   the option id and perform no type checking. cfg/cfgBool/cfgAutoBool read
   the integer member 'v' (with a cast); cfgStr reads the pointer member 'p'. */
146
+ #define cfg(doc, id) ((doc)->config.value[ (id) ].v)
147
+ #define cfgBool(doc, id) ((Bool) cfg(doc, id))
148
+ #define cfgAutoBool(doc, id) ((TidyTriState) cfg(doc, id))
149
+ #define cfgStr(doc, id) ((ctmbstr) (doc)->config.value[ (id) ].p)
150
+
151
+ #endif /* _DEBUG */
152
+
153
+ #endif /* __CONFIG_H__ */
@@ -0,0 +1,419 @@
1
+ /* entities.c -- recognize HTML ISO entities
2
+
3
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: hoehrmann $
9
+ $Date: 2008/08/09 11:55:27 $
10
+ $Revision: 1.19 $
11
+
12
+ Entity handling can be static because there are no config or
13
+ document-specific values. Lookup table is 100% defined at
14
+ compile time.
15
+
16
+ */
17
+
18
+ #include <stdio.h>
19
+ #include "entities.h"
20
+ #include "tidy-int.h"
21
+ #include "tmbstr.h"
22
+
23
+ struct _entity;
24
+ typedef struct _entity entity;
25
+
26
+ struct _entity /* one row of the static entities[] lookup table */
27
+ {
28
+ ctmbstr name; /* entity name without the leading '&' (e.g. "quot"); NULL marks the end of the table */
29
+ uint versions; /* bit mask of VERS_* flags for which the entity is defined */
30
+ uint code; /* character code the entity stands for (e.g. 34 for "quot") */
31
+ };
32
+
33
+
34
+ static const entity entities[] =
35
+ {
36
+ /*
37
+ ** Markup pre-defined character entities
38
+ */
39
+ { "quot", VERS_ALL|VERS_XML, 34 },
40
+ { "amp", VERS_ALL|VERS_XML, 38 },
41
+ { "apos", VERS_FROM40|VERS_XML, 39 },
42
+ { "lt", VERS_ALL|VERS_XML, 60 },
43
+ { "gt", VERS_ALL|VERS_XML, 62 },
44
+
45
+ /*
46
+ ** Latin-1 character entities
47
+ */
48
+ { "nbsp", VERS_ALL, 160 },
49
+ { "iexcl", VERS_ALL, 161 },
50
+ { "cent", VERS_ALL, 162 },
51
+ { "pound", VERS_ALL, 163 },
52
+ { "curren", VERS_ALL, 164 },
53
+ { "yen", VERS_ALL, 165 },
54
+ { "brvbar", VERS_ALL, 166 },
55
+ { "sect", VERS_ALL, 167 },
56
+ { "uml", VERS_ALL, 168 },
57
+ { "copy", VERS_ALL, 169 },
58
+ { "ordf", VERS_ALL, 170 },
59
+ { "laquo", VERS_ALL, 171 },
60
+ { "not", VERS_ALL, 172 },
61
+ { "shy", VERS_ALL, 173 },
62
+ { "reg", VERS_ALL, 174 },
63
+ { "macr", VERS_ALL, 175 },
64
+ { "deg", VERS_ALL, 176 },
65
+ { "plusmn", VERS_ALL, 177 },
66
+ { "sup2", VERS_ALL, 178 },
67
+ { "sup3", VERS_ALL, 179 },
68
+ { "acute", VERS_ALL, 180 },
69
+ { "micro", VERS_ALL, 181 },
70
+ { "para", VERS_ALL, 182 },
71
+ { "middot", VERS_ALL, 183 },
72
+ { "cedil", VERS_ALL, 184 },
73
+ { "sup1", VERS_ALL, 185 },
74
+ { "ordm", VERS_ALL, 186 },
75
+ { "raquo", VERS_ALL, 187 },
76
+ { "frac14", VERS_ALL, 188 },
77
+ { "frac12", VERS_ALL, 189 },
78
+ { "frac34", VERS_ALL, 190 },
79
+ { "iquest", VERS_ALL, 191 },
80
+ { "Agrave", VERS_ALL, 192 },
81
+ { "Aacute", VERS_ALL, 193 },
82
+ { "Acirc", VERS_ALL, 194 },
83
+ { "Atilde", VERS_ALL, 195 },
84
+ { "Auml", VERS_ALL, 196 },
85
+ { "Aring", VERS_ALL, 197 },
86
+ { "AElig", VERS_ALL, 198 },
87
+ { "Ccedil", VERS_ALL, 199 },
88
+ { "Egrave", VERS_ALL, 200 },
89
+ { "Eacute", VERS_ALL, 201 },
90
+ { "Ecirc", VERS_ALL, 202 },
91
+ { "Euml", VERS_ALL, 203 },
92
+ { "Igrave", VERS_ALL, 204 },
93
+ { "Iacute", VERS_ALL, 205 },
94
+ { "Icirc", VERS_ALL, 206 },
95
+ { "Iuml", VERS_ALL, 207 },
96
+ { "ETH", VERS_ALL, 208 },
97
+ { "Ntilde", VERS_ALL, 209 },
98
+ { "Ograve", VERS_ALL, 210 },
99
+ { "Oacute", VERS_ALL, 211 },
100
+ { "Ocirc", VERS_ALL, 212 },
101
+ { "Otilde", VERS_ALL, 213 },
102
+ { "Ouml", VERS_ALL, 214 },
103
+ { "times", VERS_ALL, 215 },
104
+ { "Oslash", VERS_ALL, 216 },
105
+ { "Ugrave", VERS_ALL, 217 },
106
+ { "Uacute", VERS_ALL, 218 },
107
+ { "Ucirc", VERS_ALL, 219 },
108
+ { "Uuml", VERS_ALL, 220 },
109
+ { "Yacute", VERS_ALL, 221 },
110
+ { "THORN", VERS_ALL, 222 },
111
+ { "szlig", VERS_ALL, 223 },
112
+ { "agrave", VERS_ALL, 224 },
113
+ { "aacute", VERS_ALL, 225 },
114
+ { "acirc", VERS_ALL, 226 },
115
+ { "atilde", VERS_ALL, 227 },
116
+ { "auml", VERS_ALL, 228 },
117
+ { "aring", VERS_ALL, 229 },
118
+ { "aelig", VERS_ALL, 230 },
119
+ { "ccedil", VERS_ALL, 231 },
120
+ { "egrave", VERS_ALL, 232 },
121
+ { "eacute", VERS_ALL, 233 },
122
+ { "ecirc", VERS_ALL, 234 },
123
+ { "euml", VERS_ALL, 235 },
124
+ { "igrave", VERS_ALL, 236 },
125
+ { "iacute", VERS_ALL, 237 },
126
+ { "icirc", VERS_ALL, 238 },
127
+ { "iuml", VERS_ALL, 239 },
128
+ { "eth", VERS_ALL, 240 },
129
+ { "ntilde", VERS_ALL, 241 },
130
+ { "ograve", VERS_ALL, 242 },
131
+ { "oacute", VERS_ALL, 243 },
132
+ { "ocirc", VERS_ALL, 244 },
133
+ { "otilde", VERS_ALL, 245 },
134
+ { "ouml", VERS_ALL, 246 },
135
+ { "divide", VERS_ALL, 247 },
136
+ { "oslash", VERS_ALL, 248 },
137
+ { "ugrave", VERS_ALL, 249 },
138
+ { "uacute", VERS_ALL, 250 },
139
+ { "ucirc", VERS_ALL, 251 },
140
+ { "uuml", VERS_ALL, 252 },
141
+ { "yacute", VERS_ALL, 253 },
142
+ { "thorn", VERS_ALL, 254 },
143
+ { "yuml", VERS_ALL, 255 },
144
+
145
+ /*
146
+ ** Extended Entities defined in HTML 4: Symbols
147
+ */
148
+ { "fnof", VERS_FROM40, 402 },
149
+ { "Alpha", VERS_FROM40, 913 },
150
+ { "Beta", VERS_FROM40, 914 },
151
+ { "Gamma", VERS_FROM40, 915 },
152
+ { "Delta", VERS_FROM40, 916 },
153
+ { "Epsilon", VERS_FROM40, 917 },
154
+ { "Zeta", VERS_FROM40, 918 },
155
+ { "Eta", VERS_FROM40, 919 },
156
+ { "Theta", VERS_FROM40, 920 },
157
+ { "Iota", VERS_FROM40, 921 },
158
+ { "Kappa", VERS_FROM40, 922 },
159
+ { "Lambda", VERS_FROM40, 923 },
160
+ { "Mu", VERS_FROM40, 924 },
161
+ { "Nu", VERS_FROM40, 925 },
162
+ { "Xi", VERS_FROM40, 926 },
163
+ { "Omicron", VERS_FROM40, 927 },
164
+ { "Pi", VERS_FROM40, 928 },
165
+ { "Rho", VERS_FROM40, 929 },
166
+ { "Sigma", VERS_FROM40, 931 },
167
+ { "Tau", VERS_FROM40, 932 },
168
+ { "Upsilon", VERS_FROM40, 933 },
169
+ { "Phi", VERS_FROM40, 934 },
170
+ { "Chi", VERS_FROM40, 935 },
171
+ { "Psi", VERS_FROM40, 936 },
172
+ { "Omega", VERS_FROM40, 937 },
173
+ { "alpha", VERS_FROM40, 945 },
174
+ { "beta", VERS_FROM40, 946 },
175
+ { "gamma", VERS_FROM40, 947 },
176
+ { "delta", VERS_FROM40, 948 },
177
+ { "epsilon", VERS_FROM40, 949 },
178
+ { "zeta", VERS_FROM40, 950 },
179
+ { "eta", VERS_FROM40, 951 },
180
+ { "theta", VERS_FROM40, 952 },
181
+ { "iota", VERS_FROM40, 953 },
182
+ { "kappa", VERS_FROM40, 954 },
183
+ { "lambda", VERS_FROM40, 955 },
184
+ { "mu", VERS_FROM40, 956 },
185
+ { "nu", VERS_FROM40, 957 },
186
+ { "xi", VERS_FROM40, 958 },
187
+ { "omicron", VERS_FROM40, 959 },
188
+ { "pi", VERS_FROM40, 960 },
189
+ { "rho", VERS_FROM40, 961 },
190
+ { "sigmaf", VERS_FROM40, 962 },
191
+ { "sigma", VERS_FROM40, 963 },
192
+ { "tau", VERS_FROM40, 964 },
193
+ { "upsilon", VERS_FROM40, 965 },
194
+ { "phi", VERS_FROM40, 966 },
195
+ { "chi", VERS_FROM40, 967 },
196
+ { "psi", VERS_FROM40, 968 },
197
+ { "omega", VERS_FROM40, 969 },
198
+ { "thetasym", VERS_FROM40, 977 },
199
+ { "upsih", VERS_FROM40, 978 },
200
+ { "piv", VERS_FROM40, 982 },
201
+ { "bull", VERS_FROM40, 8226 },
202
+ { "hellip", VERS_FROM40, 8230 },
203
+ { "prime", VERS_FROM40, 8242 },
204
+ { "Prime", VERS_FROM40, 8243 },
205
+ { "oline", VERS_FROM40, 8254 },
206
+ { "frasl", VERS_FROM40, 8260 },
207
+ { "weierp", VERS_FROM40, 8472 },
208
+ { "image", VERS_FROM40, 8465 },
209
+ { "real", VERS_FROM40, 8476 },
210
+ { "trade", VERS_FROM40, 8482 },
211
+ { "alefsym", VERS_FROM40, 8501 },
212
+ { "larr", VERS_FROM40, 8592 },
213
+ { "uarr", VERS_FROM40, 8593 },
214
+ { "rarr", VERS_FROM40, 8594 },
215
+ { "darr", VERS_FROM40, 8595 },
216
+ { "harr", VERS_FROM40, 8596 },
217
+ { "crarr", VERS_FROM40, 8629 },
218
+ { "lArr", VERS_FROM40, 8656 },
219
+ { "uArr", VERS_FROM40, 8657 },
220
+ { "rArr", VERS_FROM40, 8658 },
221
+ { "dArr", VERS_FROM40, 8659 },
222
+ { "hArr", VERS_FROM40, 8660 },
223
+ { "forall", VERS_FROM40, 8704 },
224
+ { "part", VERS_FROM40, 8706 },
225
+ { "exist", VERS_FROM40, 8707 },
226
+ { "empty", VERS_FROM40, 8709 },
227
+ { "nabla", VERS_FROM40, 8711 },
228
+ { "isin", VERS_FROM40, 8712 },
229
+ { "notin", VERS_FROM40, 8713 },
230
+ { "ni", VERS_FROM40, 8715 },
231
+ { "prod", VERS_FROM40, 8719 },
232
+ { "sum", VERS_FROM40, 8721 },
233
+ { "minus", VERS_FROM40, 8722 },
234
+ { "lowast", VERS_FROM40, 8727 },
235
+ { "radic", VERS_FROM40, 8730 },
236
+ { "prop", VERS_FROM40, 8733 },
237
+ { "infin", VERS_FROM40, 8734 },
238
+ { "ang", VERS_FROM40, 8736 },
239
+ { "and", VERS_FROM40, 8743 },
240
+ { "or", VERS_FROM40, 8744 },
241
+ { "cap", VERS_FROM40, 8745 },
242
+ { "cup", VERS_FROM40, 8746 },
243
+ { "int", VERS_FROM40, 8747 },
244
+ { "there4", VERS_FROM40, 8756 },
245
+ { "sim", VERS_FROM40, 8764 },
246
+ { "cong", VERS_FROM40, 8773 },
247
+ { "asymp", VERS_FROM40, 8776 },
248
+ { "ne", VERS_FROM40, 8800 },
249
+ { "equiv", VERS_FROM40, 8801 },
250
+ { "le", VERS_FROM40, 8804 },
251
+ { "ge", VERS_FROM40, 8805 },
252
+ { "sub", VERS_FROM40, 8834 },
253
+ { "sup", VERS_FROM40, 8835 },
254
+ { "nsub", VERS_FROM40, 8836 },
255
+ { "sube", VERS_FROM40, 8838 },
256
+ { "supe", VERS_FROM40, 8839 },
257
+ { "oplus", VERS_FROM40, 8853 },
258
+ { "otimes", VERS_FROM40, 8855 },
259
+ { "perp", VERS_FROM40, 8869 },
260
+ { "sdot", VERS_FROM40, 8901 },
261
+ { "lceil", VERS_FROM40, 8968 },
262
+ { "rceil", VERS_FROM40, 8969 },
263
+ { "lfloor", VERS_FROM40, 8970 },
264
+ { "rfloor", VERS_FROM40, 8971 },
265
+ { "lang", VERS_FROM40, 9001 },
266
+ { "rang", VERS_FROM40, 9002 },
267
+ { "loz", VERS_FROM40, 9674 },
268
+ { "spades", VERS_FROM40, 9824 },
269
+ { "clubs", VERS_FROM40, 9827 },
270
+ { "hearts", VERS_FROM40, 9829 },
271
+ { "diams", VERS_FROM40, 9830 },
272
+
273
+ /*
274
+ ** Extended Entities defined in HTML 4: Special (less Markup at top)
275
+ */
276
+ { "OElig", VERS_FROM40, 338 },
277
+ { "oelig", VERS_FROM40, 339 },
278
+ { "Scaron", VERS_FROM40, 352 },
279
+ { "scaron", VERS_FROM40, 353 },
280
+ { "Yuml", VERS_FROM40, 376 },
281
+ { "circ", VERS_FROM40, 710 },
282
+ { "tilde", VERS_FROM40, 732 },
283
+ { "ensp", VERS_FROM40, 8194 },
284
+ { "emsp", VERS_FROM40, 8195 },
285
+ { "thinsp", VERS_FROM40, 8201 },
286
+ { "zwnj", VERS_FROM40, 8204 },
287
+ { "zwj", VERS_FROM40, 8205 },
288
+ { "lrm", VERS_FROM40, 8206 },
289
+ { "rlm", VERS_FROM40, 8207 },
290
+ { "ndash", VERS_FROM40, 8211 },
291
+ { "mdash", VERS_FROM40, 8212 },
292
+ { "lsquo", VERS_FROM40, 8216 },
293
+ { "rsquo", VERS_FROM40, 8217 },
294
+ { "sbquo", VERS_FROM40, 8218 },
295
+ { "ldquo", VERS_FROM40, 8220 },
296
+ { "rdquo", VERS_FROM40, 8221 },
297
+ { "bdquo", VERS_FROM40, 8222 },
298
+ { "dagger", VERS_FROM40, 8224 },
299
+ { "Dagger", VERS_FROM40, 8225 },
300
+ { "permil", VERS_FROM40, 8240 },
301
+ { "lsaquo", VERS_FROM40, 8249 },
302
+ { "rsaquo", VERS_FROM40, 8250 },
303
+ { "euro", VERS_FROM40, 8364 },
304
+ { NULL, VERS_UNKNOWN, 0 }
305
+ };
306
+
307
+
308
+ /* Pure static implementation. Trades off lookup speed
309
+ ** for faster setup time (well, none actually).
310
+ ** Optimization of comparing 1st character buys enough
311
+ ** speed that hash doesn't improve things without > 500
312
+ ** items in list.
313
+ */
314
+ static const entity* entitiesLookup( ctmbstr s )
315
+ {
316
+ tmbchar ch = (tmbchar)( s ? *s : 0 );
317
+ const entity *np;
318
+ for ( np = entities; ch && np && np->name; ++np )
319
+ if ( ch == *np->name && TY_(tmbstrcmp)(s, np->name) == 0 )
320
+ return np;
321
+ return NULL;
322
+ }
323
+
324
+ #if 0 /* compiled out; its prototype in entities.h is commented out as well */
325
+ /* entity starting with "&" returns zero on error */
326
+ uint EntityCode( ctmbstr name, uint versions )
327
+ {
328
+ const entity* np;
329
+ assert( name && name[0] == '&' );
330
+
331
+ /* numeric entity: name = "&#" followed by number */
332
+ if ( name[1] == '#' )
333
+ {
334
+ uint c = 0; /* zero on missing/bad number */
335
+ Bool isXml = ( (versions & VERS_XML) == VERS_XML );
336
+
337
+ /* 'x' prefix denotes hexadecimal number format; upper-case 'X' is accepted only when not XML */
338
+ if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
339
+ sscanf( name+3, "%x", &c );
340
+ else
341
+ sscanf( name+2, "%u", &c );
342
+
343
+ return (uint) c;
344
+ }
345
+
346
+ /* Named entity: name ="&" followed by a name */
347
+ if ( NULL != (np = entitiesLookup(name+1)) )
348
+ {
349
+ /* Only recognize entity name if version supports it. */
350
+ if ( np->versions & versions )
351
+ return np->code;
352
+ }
353
+
354
+ return 0; /* zero signifies unknown entity name */
355
+ }
356
+ #endif
357
+
358
+ Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions )
359
+ {
360
+ const entity* np;
361
+ assert( name && name[0] == '&' );
362
+ assert( code != NULL );
363
+ assert( versions != NULL );
364
+
365
+ /* numeric entitity: name = "&#" followed by number */
366
+ if ( name[1] == '#' )
367
+ {
368
+ uint c = 0; /* zero on missing/bad number */
369
+
370
+ /* 'x' prefix denotes hexadecimal number format */
371
+ if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
372
+ sscanf( name+3, "%x", &c );
373
+ else
374
+ sscanf( name+2, "%u", &c );
375
+
376
+ *code = c;
377
+ *versions = VERS_ALL;
378
+ return yes;
379
+ }
380
+
381
+ /* Named entity: name ="&" followed by a name */
382
+ if ( NULL != (np = entitiesLookup(name+1)) )
383
+ {
384
+ *code = np->code;
385
+ *versions = np->versions;
386
+ return yes;
387
+ }
388
+
389
+ *code = 0;
390
+ *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY );
391
+ return no;
392
+ }
393
+
394
+
395
+ ctmbstr TY_(EntityName)( uint ch, uint versions )
396
+ {
397
+ ctmbstr entnam = NULL;
398
+ const entity *ep;
399
+
400
+ for ( ep = entities; ep->name != NULL; ++ep )
401
+ {
402
+ if ( ep->code == ch )
403
+ {
404
+ if ( ep->versions & versions )
405
+ entnam = ep->name;
406
+ break; /* Found code. Stop search. */
407
+ }
408
+ }
409
+ return entnam;
410
+ }
411
+
412
+ /*
413
+ * local variables:
414
+ * mode: c
415
+ * indent-tabs-mode: nil
416
+ * c-basic-offset: 4
417
+ * eval: (c-set-offset 'substatement-open 0)
418
+ * end:
419
+ */
@@ -0,0 +1,24 @@
1
+ #ifndef __ENTITIES_H__
2
+ #define __ENTITIES_H__
3
+
4
+ /* entities.h -- recognize character entities
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/09/12 15:14:44 $
13
+ $Revision: 1.8 $
14
+
15
+ */
16
+
17
+ #include "forward.h"
18
+
19
+ /* entity starting with "&" returns zero on error */
20
+ /* uint EntityCode( ctmbstr name, uint versions ); */
21
+ ctmbstr TY_(EntityName)( uint charCode, uint versions );
22
+ Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions );
23
+
24
+ #endif /* __ENTITIES_H__ */
@@ -0,0 +1,5 @@
1
+ require 'mkmf' # Ruby's standard Makefile generator for C extensions
2
+
3
+ dir_config("tidy") # register the --with-tidy-dir / -include / -lib configure options
4
+ create_makefile("tidy") # write the Makefile that builds the "tidy" extension
5
+