tidy-ext 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. data/.gitignore +4 -0
  2. data/LICENSE +50 -0
  3. data/README +12 -0
  4. data/Rakefile +60 -0
  5. data/VERSION +1 -0
  6. data/ext/tidy/access.c +3310 -0
  7. data/ext/tidy/access.h +279 -0
  8. data/ext/tidy/alloc.c +107 -0
  9. data/ext/tidy/attrask.c +209 -0
  10. data/ext/tidy/attrdict.c +2398 -0
  11. data/ext/tidy/attrdict.h +122 -0
  12. data/ext/tidy/attrget.c +213 -0
  13. data/ext/tidy/attrs.c +1911 -0
  14. data/ext/tidy/attrs.h +374 -0
  15. data/ext/tidy/buffio.c +232 -0
  16. data/ext/tidy/buffio.h +118 -0
  17. data/ext/tidy/charsets.c +1032 -0
  18. data/ext/tidy/charsets.h +14 -0
  19. data/ext/tidy/clean.c +2674 -0
  20. data/ext/tidy/clean.h +87 -0
  21. data/ext/tidy/config.c +1746 -0
  22. data/ext/tidy/config.h +153 -0
  23. data/ext/tidy/entities.c +419 -0
  24. data/ext/tidy/entities.h +24 -0
  25. data/ext/tidy/extconf.rb +5 -0
  26. data/ext/tidy/fileio.c +106 -0
  27. data/ext/tidy/fileio.h +46 -0
  28. data/ext/tidy/forward.h +69 -0
  29. data/ext/tidy/iconvtc.c +105 -0
  30. data/ext/tidy/iconvtc.h +15 -0
  31. data/ext/tidy/istack.c +373 -0
  32. data/ext/tidy/lexer.c +3825 -0
  33. data/ext/tidy/lexer.h +617 -0
  34. data/ext/tidy/localize.c +1882 -0
  35. data/ext/tidy/mappedio.c +329 -0
  36. data/ext/tidy/mappedio.h +16 -0
  37. data/ext/tidy/message.h +207 -0
  38. data/ext/tidy/parser.c +4408 -0
  39. data/ext/tidy/parser.h +76 -0
  40. data/ext/tidy/platform.h +636 -0
  41. data/ext/tidy/pprint.c +2276 -0
  42. data/ext/tidy/pprint.h +93 -0
  43. data/ext/tidy/ruby-tidy.c +195 -0
  44. data/ext/tidy/streamio.c +1407 -0
  45. data/ext/tidy/streamio.h +222 -0
  46. data/ext/tidy/tagask.c +286 -0
  47. data/ext/tidy/tags.c +955 -0
  48. data/ext/tidy/tags.h +235 -0
  49. data/ext/tidy/tidy-int.h +129 -0
  50. data/ext/tidy/tidy.h +1097 -0
  51. data/ext/tidy/tidyenum.h +622 -0
  52. data/ext/tidy/tidylib.c +1751 -0
  53. data/ext/tidy/tmbstr.c +306 -0
  54. data/ext/tidy/tmbstr.h +92 -0
  55. data/ext/tidy/utf8.c +539 -0
  56. data/ext/tidy/utf8.h +52 -0
  57. data/ext/tidy/version.h +14 -0
  58. data/ext/tidy/win32tc.c +795 -0
  59. data/ext/tidy/win32tc.h +19 -0
  60. data/spec/spec_helper.rb +5 -0
  61. data/spec/tidy/compat_spec.rb +44 -0
  62. data/spec/tidy/remote_uri_spec.rb +14 -0
  63. data/spec/tidy/test1.html +5 -0
  64. data/spec/tidy/tidy_spec.rb +34 -0
  65. metadata +125 -0
data/ext/tidy/config.h ADDED
@@ -0,0 +1,153 @@
1
+ #ifndef __CONFIG_H__
2
+ #define __CONFIG_H__
3
+
4
+ /* config.h -- read config file and manage config properties
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/12/29 16:31:08 $
13
+ $Revision: 1.14 $
14
+
15
+ config files associate a property name with a value.
16
+
17
+ // comments can start at the beginning of a line
18
+ # comments can start at the beginning of a line
19
+ name: short values fit onto one line
20
+ name: a really long value that
21
+ continues on the next line
22
+
23
+ property names are case insensitive and should be less than
24
+ 60 characters in length and must start at the beginning of
25
+ the line, as whitespace at the start of a line signifies a
26
+ line continuation.
27
+
28
+ */
29
+
30
+ #include "forward.h"
31
+ #include "tidy.h"
32
+ #include "streamio.h"
33
+
34
+ struct _tidy_option; /* forward declaration so ParseProperty can take a pointer to the option type */
35
+ typedef struct _tidy_option TidyOptionImpl;
36
+
37
+ typedef Bool (ParseProperty)( TidyDocImpl* doc, const TidyOptionImpl* opt ); /* function type of the per-option parsing method stored in the 'parser' member */
38
+
39
+ struct _tidy_option /* description of a single configuration option */
40
+ {
41
+ TidyOptionId id; /* identifier of this option */
42
+ TidyConfigCategory category; /* category used to put options in groups */
43
+ ctmbstr name; /* property name */
44
+ TidyOptionType type; /* string, int or bool */
45
+ ulong dflt; /* default for TidyInteger and TidyBoolean */
46
+ ParseProperty* parser; /* parsing method, read-only if NULL */
47
+ const ctmbstr* pickList; /* pick list: array of strings (termination convention not visible here - confirm in config.c) */
48
+ ctmbstr pdflt; /* default for TidyString */
49
+ };
50
+
51
+ typedef union /* storage for one option value; the member in use depends on the option's type */
52
+ {
53
+ ulong v; /* Value for TidyInteger and TidyBoolean */
54
+ char *p; /* Value for TidyString */
55
+ } TidyOptionValue;
56
+
57
+ typedef struct _tidy_config /* configuration state: current option values, a restorable snapshot, and config-parsing state */
58
+ {
59
+ TidyOptionValue value[ N_TIDY_OPTIONS + 1 ]; /* current config values, indexed by option id (see the cfg macros) */
60
+ TidyOptionValue snapshot[ N_TIDY_OPTIONS + 1 ]; /* Snapshot of values to be restored later */
61
+
62
+ /* track what tags user has defined to eliminate unnecessary searches */
63
+ uint defined_tags;
64
+
65
+ uint c; /* current char in input stream */
66
+ StreamIn* cfgIn; /* current input source */
67
+
68
+ } TidyConfigImpl;
69
+
70
+
71
+ typedef struct { /* documentation record attached to one option */
72
+ TidyOptionId opt; /**< Identifier of the documented option. */
73
+ ctmbstr doc; /**< Description as HTML text. */
74
+ TidyOptionId const *links; /**< Cross references to other options.
75
+ Last element must be 'TidyUnknownOption'. */
76
+ } TidyOptionDoc;
77
+
78
+
79
+ const TidyOptionImpl* TY_(lookupOption)( ctmbstr optnam );
80
+ const TidyOptionImpl* TY_(getOption)( TidyOptionId optId );
81
+
82
+ TidyIterator TY_(getOptionList)( TidyDocImpl* doc );
83
+ const TidyOptionImpl* TY_(getNextOption)( TidyDocImpl* doc, TidyIterator* iter );
84
+
85
+ TidyIterator TY_(getOptionPickList)( const TidyOptionImpl* option );
86
+ ctmbstr TY_(getNextOptionPick)( const TidyOptionImpl* option, TidyIterator* iter );
87
+
88
+ const TidyOptionDoc* TY_(OptGetDocDesc)( TidyOptionId optId );
89
+
90
+ void TY_(InitConfig)( TidyDocImpl* doc );
91
+ void TY_(FreeConfig)( TidyDocImpl* doc );
92
+
93
+ /* Bool SetOptionValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr val ); */
94
+ Bool TY_(SetOptionInt)( TidyDocImpl* doc, TidyOptionId optId, ulong val );
95
+ Bool TY_(SetOptionBool)( TidyDocImpl* doc, TidyOptionId optId, Bool val );
96
+
97
+ Bool TY_(ResetOptionToDefault)( TidyDocImpl* doc, TidyOptionId optId );
98
+ void TY_(ResetConfigToDefault)( TidyDocImpl* doc );
99
+ void TY_(TakeConfigSnapshot)( TidyDocImpl* doc );
100
+ void TY_(ResetConfigToSnapshot)( TidyDocImpl* doc );
101
+
102
+ void TY_(CopyConfig)( TidyDocImpl* docTo, TidyDocImpl* docFrom );
103
+
104
+ int TY_(ParseConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
105
+ int TY_(ParseConfigFileEnc)( TidyDocImpl* doc,
106
+ ctmbstr cfgfil, ctmbstr charenc );
107
+
108
+ int TY_(SaveConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
109
+ int TY_(SaveConfigSink)( TidyDocImpl* doc, TidyOutputSink* sink );
110
+
111
+ /* returns false if unknown option, missing parameter, or
112
+ option doesn't use parameter
113
+ */
114
+ Bool TY_(ParseConfigOption)( TidyDocImpl* doc, ctmbstr optnam, ctmbstr optVal );
115
+ Bool TY_(ParseConfigValue)( TidyDocImpl* doc, TidyOptionId optId, ctmbstr optVal );
116
+
117
+ /* ensure that char encodings are self consistent */
118
+ Bool TY_(AdjustCharEncoding)( TidyDocImpl* doc, int encoding );
119
+
120
+ Bool TY_(ConfigDiffThanDefault)( TidyDocImpl* doc );
121
+ Bool TY_(ConfigDiffThanSnapshot)( TidyDocImpl* doc );
122
+
123
+ int TY_(CharEncodingId)( TidyDocImpl* doc, ctmbstr charenc );
124
+ ctmbstr TY_(CharEncodingName)( int encoding );
125
+ ctmbstr TY_(CharEncodingOptName)( int encoding );
126
+
127
+ /* void SetEmacsFilename( TidyDocImpl* doc, ctmbstr filename ); */
128
+
129
+
130
+ #ifdef _DEBUG
131
+
132
+ /* Debug lookup functions will be type-safe and assert option type match; in debug builds the cfg* macros forward to them */
133
+ ulong TY_(_cfgGet)( TidyDocImpl* doc, TidyOptionId optId );
134
+ Bool TY_(_cfgGetBool)( TidyDocImpl* doc, TidyOptionId optId );
135
+ TidyTriState TY_(_cfgGetAutoBool)( TidyDocImpl* doc, TidyOptionId optId );
136
+ ctmbstr TY_(_cfgGetString)( TidyDocImpl* doc, TidyOptionId optId );
137
+
138
+ #define cfg(doc, id) TY_(_cfgGet)( (doc), (id) )
139
+ #define cfgBool(doc, id) TY_(_cfgGetBool)( (doc), (id) )
140
+ #define cfgAutoBool(doc, id) TY_(_cfgGetAutoBool)( (doc), (id) )
141
+ #define cfgStr(doc, id) TY_(_cfgGetString)( (doc), (id) )
142
+
143
+ #else
144
+
145
+ /* Release build macros for speed: they read doc->config.value[id] directly, with no type checking */
146
+ #define cfg(doc, id) ((doc)->config.value[ (id) ].v) /* raw integer member of the value union */
147
+ #define cfgBool(doc, id) ((Bool) cfg(doc, id)) /* same integer member, cast to Bool */
148
+ #define cfgAutoBool(doc, id) ((TidyTriState) cfg(doc, id)) /* same integer member, cast to TidyTriState */
149
+ #define cfgStr(doc, id) ((ctmbstr) (doc)->config.value[ (id) ].p) /* string member of the value union, returned as a const string */
150
+
151
+ #endif /* _DEBUG */
152
+
153
+ #endif /* __CONFIG_H__ */
@@ -0,0 +1,419 @@
1
+ /* entities.c -- recognize HTML ISO entities
2
+
3
+ (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
+ See tidy.h for the copyright notice.
5
+
6
+ CVS Info :
7
+
8
+ $Author: hoehrmann $
9
+ $Date: 2008/08/09 11:55:27 $
10
+ $Revision: 1.19 $
11
+
12
+ Entity handling can be static because there are no config or
13
+ document-specific values. Lookup table is 100% defined at
14
+ compile time.
15
+
16
+ */
17
+
18
+ #include <stdio.h>
19
+ #include "entities.h"
20
+ #include "tidy-int.h"
21
+ #include "tmbstr.h"
22
+
23
+ struct _entity;
24
+ typedef struct _entity entity;
25
+
26
+ struct _entity /* one row of the static entity lookup table */
27
+ {
28
+ ctmbstr name; /* entity name as stored in the table, e.g. "amp" (no leading '&'); NULL marks the end of the table */
29
+ uint versions; /* bit mask of VERS_* flags, tested against the caller's version mask */
30
+ uint code; /* character code the entity stands for, e.g. 38 for "amp" */
31
+ };
32
+
33
+
34
+ static const entity entities[] =
35
+ {
36
+ /*
37
+ ** Markup pre-defined character entities
38
+ */
39
+ { "quot", VERS_ALL|VERS_XML, 34 },
40
+ { "amp", VERS_ALL|VERS_XML, 38 },
41
+ { "apos", VERS_FROM40|VERS_XML, 39 },
42
+ { "lt", VERS_ALL|VERS_XML, 60 },
43
+ { "gt", VERS_ALL|VERS_XML, 62 },
44
+
45
+ /*
46
+ ** Latin-1 character entities
47
+ */
48
+ { "nbsp", VERS_ALL, 160 },
49
+ { "iexcl", VERS_ALL, 161 },
50
+ { "cent", VERS_ALL, 162 },
51
+ { "pound", VERS_ALL, 163 },
52
+ { "curren", VERS_ALL, 164 },
53
+ { "yen", VERS_ALL, 165 },
54
+ { "brvbar", VERS_ALL, 166 },
55
+ { "sect", VERS_ALL, 167 },
56
+ { "uml", VERS_ALL, 168 },
57
+ { "copy", VERS_ALL, 169 },
58
+ { "ordf", VERS_ALL, 170 },
59
+ { "laquo", VERS_ALL, 171 },
60
+ { "not", VERS_ALL, 172 },
61
+ { "shy", VERS_ALL, 173 },
62
+ { "reg", VERS_ALL, 174 },
63
+ { "macr", VERS_ALL, 175 },
64
+ { "deg", VERS_ALL, 176 },
65
+ { "plusmn", VERS_ALL, 177 },
66
+ { "sup2", VERS_ALL, 178 },
67
+ { "sup3", VERS_ALL, 179 },
68
+ { "acute", VERS_ALL, 180 },
69
+ { "micro", VERS_ALL, 181 },
70
+ { "para", VERS_ALL, 182 },
71
+ { "middot", VERS_ALL, 183 },
72
+ { "cedil", VERS_ALL, 184 },
73
+ { "sup1", VERS_ALL, 185 },
74
+ { "ordm", VERS_ALL, 186 },
75
+ { "raquo", VERS_ALL, 187 },
76
+ { "frac14", VERS_ALL, 188 },
77
+ { "frac12", VERS_ALL, 189 },
78
+ { "frac34", VERS_ALL, 190 },
79
+ { "iquest", VERS_ALL, 191 },
80
+ { "Agrave", VERS_ALL, 192 },
81
+ { "Aacute", VERS_ALL, 193 },
82
+ { "Acirc", VERS_ALL, 194 },
83
+ { "Atilde", VERS_ALL, 195 },
84
+ { "Auml", VERS_ALL, 196 },
85
+ { "Aring", VERS_ALL, 197 },
86
+ { "AElig", VERS_ALL, 198 },
87
+ { "Ccedil", VERS_ALL, 199 },
88
+ { "Egrave", VERS_ALL, 200 },
89
+ { "Eacute", VERS_ALL, 201 },
90
+ { "Ecirc", VERS_ALL, 202 },
91
+ { "Euml", VERS_ALL, 203 },
92
+ { "Igrave", VERS_ALL, 204 },
93
+ { "Iacute", VERS_ALL, 205 },
94
+ { "Icirc", VERS_ALL, 206 },
95
+ { "Iuml", VERS_ALL, 207 },
96
+ { "ETH", VERS_ALL, 208 },
97
+ { "Ntilde", VERS_ALL, 209 },
98
+ { "Ograve", VERS_ALL, 210 },
99
+ { "Oacute", VERS_ALL, 211 },
100
+ { "Ocirc", VERS_ALL, 212 },
101
+ { "Otilde", VERS_ALL, 213 },
102
+ { "Ouml", VERS_ALL, 214 },
103
+ { "times", VERS_ALL, 215 },
104
+ { "Oslash", VERS_ALL, 216 },
105
+ { "Ugrave", VERS_ALL, 217 },
106
+ { "Uacute", VERS_ALL, 218 },
107
+ { "Ucirc", VERS_ALL, 219 },
108
+ { "Uuml", VERS_ALL, 220 },
109
+ { "Yacute", VERS_ALL, 221 },
110
+ { "THORN", VERS_ALL, 222 },
111
+ { "szlig", VERS_ALL, 223 },
112
+ { "agrave", VERS_ALL, 224 },
113
+ { "aacute", VERS_ALL, 225 },
114
+ { "acirc", VERS_ALL, 226 },
115
+ { "atilde", VERS_ALL, 227 },
116
+ { "auml", VERS_ALL, 228 },
117
+ { "aring", VERS_ALL, 229 },
118
+ { "aelig", VERS_ALL, 230 },
119
+ { "ccedil", VERS_ALL, 231 },
120
+ { "egrave", VERS_ALL, 232 },
121
+ { "eacute", VERS_ALL, 233 },
122
+ { "ecirc", VERS_ALL, 234 },
123
+ { "euml", VERS_ALL, 235 },
124
+ { "igrave", VERS_ALL, 236 },
125
+ { "iacute", VERS_ALL, 237 },
126
+ { "icirc", VERS_ALL, 238 },
127
+ { "iuml", VERS_ALL, 239 },
128
+ { "eth", VERS_ALL, 240 },
129
+ { "ntilde", VERS_ALL, 241 },
130
+ { "ograve", VERS_ALL, 242 },
131
+ { "oacute", VERS_ALL, 243 },
132
+ { "ocirc", VERS_ALL, 244 },
133
+ { "otilde", VERS_ALL, 245 },
134
+ { "ouml", VERS_ALL, 246 },
135
+ { "divide", VERS_ALL, 247 },
136
+ { "oslash", VERS_ALL, 248 },
137
+ { "ugrave", VERS_ALL, 249 },
138
+ { "uacute", VERS_ALL, 250 },
139
+ { "ucirc", VERS_ALL, 251 },
140
+ { "uuml", VERS_ALL, 252 },
141
+ { "yacute", VERS_ALL, 253 },
142
+ { "thorn", VERS_ALL, 254 },
143
+ { "yuml", VERS_ALL, 255 },
144
+
145
+ /*
146
+ ** Extended Entities defined in HTML 4: Symbols
147
+ */
148
+ { "fnof", VERS_FROM40, 402 },
149
+ { "Alpha", VERS_FROM40, 913 },
150
+ { "Beta", VERS_FROM40, 914 },
151
+ { "Gamma", VERS_FROM40, 915 },
152
+ { "Delta", VERS_FROM40, 916 },
153
+ { "Epsilon", VERS_FROM40, 917 },
154
+ { "Zeta", VERS_FROM40, 918 },
155
+ { "Eta", VERS_FROM40, 919 },
156
+ { "Theta", VERS_FROM40, 920 },
157
+ { "Iota", VERS_FROM40, 921 },
158
+ { "Kappa", VERS_FROM40, 922 },
159
+ { "Lambda", VERS_FROM40, 923 },
160
+ { "Mu", VERS_FROM40, 924 },
161
+ { "Nu", VERS_FROM40, 925 },
162
+ { "Xi", VERS_FROM40, 926 },
163
+ { "Omicron", VERS_FROM40, 927 },
164
+ { "Pi", VERS_FROM40, 928 },
165
+ { "Rho", VERS_FROM40, 929 },
166
+ { "Sigma", VERS_FROM40, 931 },
167
+ { "Tau", VERS_FROM40, 932 },
168
+ { "Upsilon", VERS_FROM40, 933 },
169
+ { "Phi", VERS_FROM40, 934 },
170
+ { "Chi", VERS_FROM40, 935 },
171
+ { "Psi", VERS_FROM40, 936 },
172
+ { "Omega", VERS_FROM40, 937 },
173
+ { "alpha", VERS_FROM40, 945 },
174
+ { "beta", VERS_FROM40, 946 },
175
+ { "gamma", VERS_FROM40, 947 },
176
+ { "delta", VERS_FROM40, 948 },
177
+ { "epsilon", VERS_FROM40, 949 },
178
+ { "zeta", VERS_FROM40, 950 },
179
+ { "eta", VERS_FROM40, 951 },
180
+ { "theta", VERS_FROM40, 952 },
181
+ { "iota", VERS_FROM40, 953 },
182
+ { "kappa", VERS_FROM40, 954 },
183
+ { "lambda", VERS_FROM40, 955 },
184
+ { "mu", VERS_FROM40, 956 },
185
+ { "nu", VERS_FROM40, 957 },
186
+ { "xi", VERS_FROM40, 958 },
187
+ { "omicron", VERS_FROM40, 959 },
188
+ { "pi", VERS_FROM40, 960 },
189
+ { "rho", VERS_FROM40, 961 },
190
+ { "sigmaf", VERS_FROM40, 962 },
191
+ { "sigma", VERS_FROM40, 963 },
192
+ { "tau", VERS_FROM40, 964 },
193
+ { "upsilon", VERS_FROM40, 965 },
194
+ { "phi", VERS_FROM40, 966 },
195
+ { "chi", VERS_FROM40, 967 },
196
+ { "psi", VERS_FROM40, 968 },
197
+ { "omega", VERS_FROM40, 969 },
198
+ { "thetasym", VERS_FROM40, 977 },
199
+ { "upsih", VERS_FROM40, 978 },
200
+ { "piv", VERS_FROM40, 982 },
201
+ { "bull", VERS_FROM40, 8226 },
202
+ { "hellip", VERS_FROM40, 8230 },
203
+ { "prime", VERS_FROM40, 8242 },
204
+ { "Prime", VERS_FROM40, 8243 },
205
+ { "oline", VERS_FROM40, 8254 },
206
+ { "frasl", VERS_FROM40, 8260 },
207
+ { "weierp", VERS_FROM40, 8472 },
208
+ { "image", VERS_FROM40, 8465 },
209
+ { "real", VERS_FROM40, 8476 },
210
+ { "trade", VERS_FROM40, 8482 },
211
+ { "alefsym", VERS_FROM40, 8501 },
212
+ { "larr", VERS_FROM40, 8592 },
213
+ { "uarr", VERS_FROM40, 8593 },
214
+ { "rarr", VERS_FROM40, 8594 },
215
+ { "darr", VERS_FROM40, 8595 },
216
+ { "harr", VERS_FROM40, 8596 },
217
+ { "crarr", VERS_FROM40, 8629 },
218
+ { "lArr", VERS_FROM40, 8656 },
219
+ { "uArr", VERS_FROM40, 8657 },
220
+ { "rArr", VERS_FROM40, 8658 },
221
+ { "dArr", VERS_FROM40, 8659 },
222
+ { "hArr", VERS_FROM40, 8660 },
223
+ { "forall", VERS_FROM40, 8704 },
224
+ { "part", VERS_FROM40, 8706 },
225
+ { "exist", VERS_FROM40, 8707 },
226
+ { "empty", VERS_FROM40, 8709 },
227
+ { "nabla", VERS_FROM40, 8711 },
228
+ { "isin", VERS_FROM40, 8712 },
229
+ { "notin", VERS_FROM40, 8713 },
230
+ { "ni", VERS_FROM40, 8715 },
231
+ { "prod", VERS_FROM40, 8719 },
232
+ { "sum", VERS_FROM40, 8721 },
233
+ { "minus", VERS_FROM40, 8722 },
234
+ { "lowast", VERS_FROM40, 8727 },
235
+ { "radic", VERS_FROM40, 8730 },
236
+ { "prop", VERS_FROM40, 8733 },
237
+ { "infin", VERS_FROM40, 8734 },
238
+ { "ang", VERS_FROM40, 8736 },
239
+ { "and", VERS_FROM40, 8743 },
240
+ { "or", VERS_FROM40, 8744 },
241
+ { "cap", VERS_FROM40, 8745 },
242
+ { "cup", VERS_FROM40, 8746 },
243
+ { "int", VERS_FROM40, 8747 },
244
+ { "there4", VERS_FROM40, 8756 },
245
+ { "sim", VERS_FROM40, 8764 },
246
+ { "cong", VERS_FROM40, 8773 },
247
+ { "asymp", VERS_FROM40, 8776 },
248
+ { "ne", VERS_FROM40, 8800 },
249
+ { "equiv", VERS_FROM40, 8801 },
250
+ { "le", VERS_FROM40, 8804 },
251
+ { "ge", VERS_FROM40, 8805 },
252
+ { "sub", VERS_FROM40, 8834 },
253
+ { "sup", VERS_FROM40, 8835 },
254
+ { "nsub", VERS_FROM40, 8836 },
255
+ { "sube", VERS_FROM40, 8838 },
256
+ { "supe", VERS_FROM40, 8839 },
257
+ { "oplus", VERS_FROM40, 8853 },
258
+ { "otimes", VERS_FROM40, 8855 },
259
+ { "perp", VERS_FROM40, 8869 },
260
+ { "sdot", VERS_FROM40, 8901 },
261
+ { "lceil", VERS_FROM40, 8968 },
262
+ { "rceil", VERS_FROM40, 8969 },
263
+ { "lfloor", VERS_FROM40, 8970 },
264
+ { "rfloor", VERS_FROM40, 8971 },
265
+ { "lang", VERS_FROM40, 9001 },
266
+ { "rang", VERS_FROM40, 9002 },
267
+ { "loz", VERS_FROM40, 9674 },
268
+ { "spades", VERS_FROM40, 9824 },
269
+ { "clubs", VERS_FROM40, 9827 },
270
+ { "hearts", VERS_FROM40, 9829 },
271
+ { "diams", VERS_FROM40, 9830 },
272
+
273
+ /*
274
+ ** Extended Entities defined in HTML 4: Special (less Markup at top)
275
+ */
276
+ { "OElig", VERS_FROM40, 338 },
277
+ { "oelig", VERS_FROM40, 339 },
278
+ { "Scaron", VERS_FROM40, 352 },
279
+ { "scaron", VERS_FROM40, 353 },
280
+ { "Yuml", VERS_FROM40, 376 },
281
+ { "circ", VERS_FROM40, 710 },
282
+ { "tilde", VERS_FROM40, 732 },
283
+ { "ensp", VERS_FROM40, 8194 },
284
+ { "emsp", VERS_FROM40, 8195 },
285
+ { "thinsp", VERS_FROM40, 8201 },
286
+ { "zwnj", VERS_FROM40, 8204 },
287
+ { "zwj", VERS_FROM40, 8205 },
288
+ { "lrm", VERS_FROM40, 8206 },
289
+ { "rlm", VERS_FROM40, 8207 },
290
+ { "ndash", VERS_FROM40, 8211 },
291
+ { "mdash", VERS_FROM40, 8212 },
292
+ { "lsquo", VERS_FROM40, 8216 },
293
+ { "rsquo", VERS_FROM40, 8217 },
294
+ { "sbquo", VERS_FROM40, 8218 },
295
+ { "ldquo", VERS_FROM40, 8220 },
296
+ { "rdquo", VERS_FROM40, 8221 },
297
+ { "bdquo", VERS_FROM40, 8222 },
298
+ { "dagger", VERS_FROM40, 8224 },
299
+ { "Dagger", VERS_FROM40, 8225 },
300
+ { "permil", VERS_FROM40, 8240 },
301
+ { "lsaquo", VERS_FROM40, 8249 },
302
+ { "rsaquo", VERS_FROM40, 8250 },
303
+ { "euro", VERS_FROM40, 8364 },
304
+ { NULL, VERS_UNKNOWN, 0 }
305
+ };
306
+
307
+
308
+ /* Finds the table entry whose name equals s (the entity name without its
309
+ ** leading '&'). Pure static implementation: a linear scan, which trades off
310
+ ** lookup speed for faster setup time (well, none actually). The optimization of
311
+ ** comparing the 1st character buys enough speed that a hash doesn't improve things
312
+ ** without > 500 items in list. Returns NULL when s is NULL, empty, or not in the table.
313
+ */
314
+ static const entity* entitiesLookup( ctmbstr s )
315
+ {
316
+ tmbchar ch = (tmbchar)( s ? *s : 0 ); /* first character of the key; 0 for NULL or empty input */
317
+ const entity *np;
318
+ for ( np = entities; ch && np && np->name; ++np ) /* never runs when ch is 0; stops at the NULL-name sentinel row */
319
+ if ( ch == *np->name && TY_(tmbstrcmp)(s, np->name) == 0 ) /* first-character test short-circuits the full string compare */
320
+ return np;
321
+ return NULL;
322
+ }
323
+
324
+ #if 0 /* compiled out; TY_(EntityInfo) below performs the same parsing and also reports the version mask */
325
+ /* entity starting with "&" returns zero on error */
326
+ uint EntityCode( ctmbstr name, uint versions )
327
+ {
328
+ const entity* np;
329
+ assert( name && name[0] == '&' );
330
+
331
+ /* numeric entity: name = "&#" followed by number */
332
+ if ( name[1] == '#' )
333
+ {
334
+ uint c = 0; /* zero on missing/bad number */
335
+ Bool isXml = ( (versions & VERS_XML) == VERS_XML );
336
+
337
+ /* 'x' prefix denotes hexadecimal number format; uppercase 'X' is accepted only when not XML */
338
+ if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
339
+ sscanf( name+3, "%x", &c );
340
+ else
341
+ sscanf( name+2, "%u", &c );
342
+
343
+ return (uint) c;
344
+ }
345
+
346
+ /* Named entity: name ="&" followed by a name */
347
+ if ( NULL != (np = entitiesLookup(name+1)) )
348
+ {
349
+ /* Only recognize entity name if version supports it. */
350
+ if ( np->versions & versions )
351
+ return np->code;
352
+ }
353
+
354
+ return 0; /* zero signifies unknown entity name */
355
+ }
356
+ #endif
357
+
358
+ Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions ) /* Resolves entity text starting with '&' into *code and a VERS_* mask in *versions; returns yes for any numeric reference or a name found in the table, otherwise no with *code set to 0 */
359
+ {
360
+ const entity* np;
361
+ assert( name && name[0] == '&' ); /* caller must pass the text including the leading '&' */
362
+ assert( code != NULL );
363
+ assert( versions != NULL );
364
+
365
+ /* numeric entity: name = "&#" followed by number */
366
+ if ( name[1] == '#' )
367
+ {
368
+ uint c = 0; /* zero on missing/bad number */
369
+
370
+ /* 'x' prefix denotes hexadecimal number format; uppercase 'X' is accepted only when not XML */
371
+ if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
372
+ sscanf( name+3, "%x", &c ); /* result is not checked: c keeps its 0 if nothing parses */
373
+ else
374
+ sscanf( name+2, "%u", &c ); /* decimal form; same unchecked fallback to 0 */
375
+
376
+ *code = c;
377
+ *versions = VERS_ALL;
378
+ return yes; /* numeric references always report success, even when *code ends up 0 */
379
+ }
380
+
381
+ /* Named entity: name ="&" followed by a name */
382
+ if ( NULL != (np = entitiesLookup(name+1)) ) /* skip the '&' before the table lookup */
383
+ {
384
+ *code = np->code;
385
+ *versions = np->versions; /* the entry's own mask is returned as-is; no filtering against a requested version happens here */
386
+ return yes;
387
+ }
388
+
389
+ *code = 0; /* unknown name */
390
+ *versions = ( isXml ? VERS_XML : VERS_PROPRIETARY ); /* mask reported for unknown names depends on XML mode */
391
+ return no;
392
+ }
393
+
394
+
395
+ ctmbstr TY_(EntityName)( uint ch, uint versions ) /* Reverse lookup: returns the table name for character code ch, or NULL when no entry has that code or the first entry with that code shares no bit with versions */
396
+ {
397
+ ctmbstr entnam = NULL; /* stays NULL unless a matching entry passes the version test */
398
+ const entity *ep;
399
+
400
+ for ( ep = entities; ep->name != NULL; ++ep ) /* linear scan up to the NULL-name sentinel row */
401
+ {
402
+ if ( ep->code == ch )
403
+ {
404
+ if ( ep->versions & versions ) /* accept only if the entry's mask overlaps the requested mask */
405
+ entnam = ep->name;
406
+ break; /* Found code. Stop search, even if the version test failed. */
407
+ }
408
+ }
409
+ return entnam;
410
+ }
411
+
412
+ /*
413
+ * local variables:
414
+ * mode: c
415
+ * indent-tabs-mode: nil
416
+ * c-basic-offset: 4
417
+ * eval: (c-set-offset 'substatement-open 0)
418
+ * end:
419
+ */
@@ -0,0 +1,24 @@
1
+ #ifndef __ENTITIES_H__
2
+ #define __ENTITIES_H__
3
+
4
+ /* entities.h -- recognize character entities
5
+
6
+ (c) 1998-2006 (W3C) MIT, ERCIM, Keio University
7
+ See tidy.h for the copyright notice.
8
+
9
+ CVS Info :
10
+
11
+ $Author: arnaud02 $
12
+ $Date: 2006/09/12 15:14:44 $
13
+ $Revision: 1.8 $
14
+
15
+ */
16
+
17
+ #include "forward.h"
18
+
19
+ /* entity starting with "&" returns zero on error (describes the retired EntityCode below, kept for reference) */
20
+ /* uint EntityCode( ctmbstr name, uint versions ); */
21
+ ctmbstr TY_(EntityName)( uint charCode, uint versions ); /* entity name for a character code, or NULL if there is none usable for the given version mask */
22
+ Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions ); /* name must start with '&'; fills *code and *versions and returns whether the entity was recognized */
23
+
24
+ #endif /* __ENTITIES_H__ */
@@ -0,0 +1,5 @@
1
+ require 'mkmf' # Ruby's standard Makefile generator for C extensions
2
+
3
+ dir_config("tidy") # presumably enables --with-tidy-dir style options for header/library paths - confirm against the mkmf docs
4
+ create_makefile("tidy") # generates the Makefile for the extension named "tidy"
5
+