tidy-ext 0.1.7
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/LICENSE +50 -0
- data/README +12 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/ext/tidy/access.c +3310 -0
- data/ext/tidy/access.h +279 -0
- data/ext/tidy/alloc.c +107 -0
- data/ext/tidy/attrask.c +209 -0
- data/ext/tidy/attrdict.c +2398 -0
- data/ext/tidy/attrdict.h +122 -0
- data/ext/tidy/attrget.c +213 -0
- data/ext/tidy/attrs.c +1911 -0
- data/ext/tidy/attrs.h +374 -0
- data/ext/tidy/buffio.c +232 -0
- data/ext/tidy/buffio.h +118 -0
- data/ext/tidy/charsets.c +1032 -0
- data/ext/tidy/charsets.h +14 -0
- data/ext/tidy/clean.c +2674 -0
- data/ext/tidy/clean.h +87 -0
- data/ext/tidy/config.c +1746 -0
- data/ext/tidy/config.h +153 -0
- data/ext/tidy/entities.c +419 -0
- data/ext/tidy/entities.h +24 -0
- data/ext/tidy/extconf.rb +5 -0
- data/ext/tidy/fileio.c +106 -0
- data/ext/tidy/fileio.h +46 -0
- data/ext/tidy/forward.h +69 -0
- data/ext/tidy/iconvtc.c +105 -0
- data/ext/tidy/iconvtc.h +15 -0
- data/ext/tidy/istack.c +373 -0
- data/ext/tidy/lexer.c +3825 -0
- data/ext/tidy/lexer.h +617 -0
- data/ext/tidy/localize.c +1882 -0
- data/ext/tidy/mappedio.c +329 -0
- data/ext/tidy/mappedio.h +16 -0
- data/ext/tidy/message.h +207 -0
- data/ext/tidy/parser.c +4408 -0
- data/ext/tidy/parser.h +76 -0
- data/ext/tidy/platform.h +636 -0
- data/ext/tidy/pprint.c +2276 -0
- data/ext/tidy/pprint.h +93 -0
- data/ext/tidy/ruby-tidy.c +195 -0
- data/ext/tidy/streamio.c +1407 -0
- data/ext/tidy/streamio.h +222 -0
- data/ext/tidy/tagask.c +286 -0
- data/ext/tidy/tags.c +955 -0
- data/ext/tidy/tags.h +235 -0
- data/ext/tidy/tidy-int.h +129 -0
- data/ext/tidy/tidy.h +1097 -0
- data/ext/tidy/tidyenum.h +622 -0
- data/ext/tidy/tidylib.c +1751 -0
- data/ext/tidy/tmbstr.c +306 -0
- data/ext/tidy/tmbstr.h +92 -0
- data/ext/tidy/utf8.c +539 -0
- data/ext/tidy/utf8.h +52 -0
- data/ext/tidy/version.h +14 -0
- data/ext/tidy/win32tc.c +795 -0
- data/ext/tidy/win32tc.h +19 -0
- data/spec/spec_helper.rb +5 -0
- data/spec/tidy/compat_spec.rb +44 -0
- data/spec/tidy/remote_uri_spec.rb +14 -0
- data/spec/tidy/test1.html +5 -0
- data/spec/tidy/tidy_spec.rb +34 -0
- metadata +125 -0
data/ext/tidy/config.h
ADDED
@@ -0,0 +1,153 @@
|
|
1
|
+
#ifndef __CONFIG_H__
|
2
|
+
#define __CONFIG_H__
|
3
|
+
|
4
|
+
/* config.h -- read config file and manage config properties
|
5
|
+
|
6
|
+
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
|
7
|
+
See tidy.h for the copyright notice.
|
8
|
+
|
9
|
+
CVS Info :
|
10
|
+
|
11
|
+
$Author: arnaud02 $
|
12
|
+
$Date: 2006/12/29 16:31:08 $
|
13
|
+
$Revision: 1.14 $
|
14
|
+
|
15
|
+
config files associate a property name with a value.
|
16
|
+
|
17
|
+
// comments can start at the beginning of a line
|
18
|
+
# comments can start at the beginning of a line
|
19
|
+
name: short values fit onto one line
|
20
|
+
name: a really long value that
|
21
|
+
continues on the next line
|
22
|
+
|
23
|
+
property names are case insensitive and should be less than
|
24
|
+
60 characters in length and must start at the beginning of
|
25
|
+
the line, as whitespace at the start of a line signifies a
|
26
|
+
line continuation.
|
27
|
+
|
28
|
+
*/
|
29
|
+
|
30
|
+
#include "forward.h"
|
31
|
+
#include "tidy.h"
|
32
|
+
#include "streamio.h"
|
33
|
+
|
34
|
+
struct _tidy_option;
|
35
|
+
typedef struct _tidy_option TidyOptionImpl;
|
36
|
+
|
37
|
+
typedef Bool (ParseProperty)( TidyDocImpl* doc, const TidyOptionImpl* opt );
|
38
|
+
|
39
|
+
struct _tidy_option /* describes a single configuration option */
|
40
|
+
{
|
41
|
+
TidyOptionId id; /* identifier of this option */
|
42
|
+
TidyConfigCategory category; /* put 'em in groups */
|
43
|
+
ctmbstr name; /* property name */
|
44
|
+
TidyOptionType type; /* string, int or bool */
|
45
|
+
ulong dflt; /* default for TidyInteger and TidyBoolean */
|
46
|
+
ParseProperty* parser; /* parsing method, read-only if NULL */
|
47
|
+
const ctmbstr* pickList; /* pick list */
|
48
|
+
ctmbstr pdflt; /* default for TidyString */
|
49
|
+
};
|
50
|
+
|
51
|
+
typedef union /* value of one option; which member applies depends on the option type */
|
52
|
+
{
|
53
|
+
ulong v; /* Value for TidyInteger and TidyBoolean */
|
54
|
+
char *p; /* Value for TidyString */
|
55
|
+
} TidyOptionValue;
|
56
|
+
|
57
|
+
typedef struct _tidy_config /* option values plus the state used while reading a config source */
|
58
|
+
{
|
59
|
+
TidyOptionValue value[ N_TIDY_OPTIONS + 1 ]; /* current config values */
|
60
|
+
TidyOptionValue snapshot[ N_TIDY_OPTIONS + 1 ]; /* Snapshot of values to be restored later */
|
61
|
+
|
62
|
+
/* track what tags user has defined to eliminate unnecessary searches */
|
63
|
+
uint defined_tags; /* NOTE(review): encoding of this value is not visible in this header - confirm in config.c */
|
64
|
+
|
65
|
+
uint c; /* current char in input stream */
|
66
|
+
StreamIn* cfgIn; /* current input source */
|
67
|
+
|
68
|
+
} TidyConfigImpl;
|
69
|
+
|
70
|
+
|
71
|
+
typedef struct { /* documentation record for one option */
|
72
|
+
TidyOptionId opt; /**< Identifier. */
|
73
|
+
ctmbstr doc; /**< HTML text */
|
74
|
+
TidyOptionId const *links; /**< Cross references.
|
75
|
+
Last element must be 'TidyUnknownOption'. */
|
76
|
+
} TidyOptionDoc;
|
77
|
+
|
78
|
+
|
79
|
+
const TidyOptionImpl* TY_(lookupOption)( ctmbstr optnam );
|
80
|
+
const TidyOptionImpl* TY_(getOption)( TidyOptionId optId );
|
81
|
+
|
82
|
+
TidyIterator TY_(getOptionList)( TidyDocImpl* doc );
|
83
|
+
const TidyOptionImpl* TY_(getNextOption)( TidyDocImpl* doc, TidyIterator* iter );
|
84
|
+
|
85
|
+
TidyIterator TY_(getOptionPickList)( const TidyOptionImpl* option );
|
86
|
+
ctmbstr TY_(getNextOptionPick)( const TidyOptionImpl* option, TidyIterator* iter );
|
87
|
+
|
88
|
+
const TidyOptionDoc* TY_(OptGetDocDesc)( TidyOptionId optId );
|
89
|
+
|
90
|
+
void TY_(InitConfig)( TidyDocImpl* doc );
|
91
|
+
void TY_(FreeConfig)( TidyDocImpl* doc );
|
92
|
+
|
93
|
+
/* Bool SetOptionValue( TidyDocImpl* doc, TidyOptionId optId, ctmbstr val ); */
|
94
|
+
Bool TY_(SetOptionInt)( TidyDocImpl* doc, TidyOptionId optId, ulong val );
|
95
|
+
Bool TY_(SetOptionBool)( TidyDocImpl* doc, TidyOptionId optId, Bool val );
|
96
|
+
|
97
|
+
Bool TY_(ResetOptionToDefault)( TidyDocImpl* doc, TidyOptionId optId );
|
98
|
+
void TY_(ResetConfigToDefault)( TidyDocImpl* doc );
|
99
|
+
void TY_(TakeConfigSnapshot)( TidyDocImpl* doc );
|
100
|
+
void TY_(ResetConfigToSnapshot)( TidyDocImpl* doc );
|
101
|
+
|
102
|
+
void TY_(CopyConfig)( TidyDocImpl* docTo, TidyDocImpl* docFrom );
|
103
|
+
|
104
|
+
int TY_(ParseConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
|
105
|
+
int TY_(ParseConfigFileEnc)( TidyDocImpl* doc,
|
106
|
+
ctmbstr cfgfil, ctmbstr charenc );
|
107
|
+
|
108
|
+
int TY_(SaveConfigFile)( TidyDocImpl* doc, ctmbstr cfgfil );
|
109
|
+
int TY_(SaveConfigSink)( TidyDocImpl* doc, TidyOutputSink* sink );
|
110
|
+
|
111
|
+
/* returns false if unknown option, missing parameter, or
|
112
|
+
option doesn't use parameter
|
113
|
+
*/
|
114
|
+
Bool TY_(ParseConfigOption)( TidyDocImpl* doc, ctmbstr optnam, ctmbstr optVal );
|
115
|
+
Bool TY_(ParseConfigValue)( TidyDocImpl* doc, TidyOptionId optId, ctmbstr optVal );
|
116
|
+
|
117
|
+
/* ensure that char encodings are self consistent */
|
118
|
+
Bool TY_(AdjustCharEncoding)( TidyDocImpl* doc, int encoding );
|
119
|
+
|
120
|
+
Bool TY_(ConfigDiffThanDefault)( TidyDocImpl* doc );
|
121
|
+
Bool TY_(ConfigDiffThanSnapshot)( TidyDocImpl* doc );
|
122
|
+
|
123
|
+
int TY_(CharEncodingId)( TidyDocImpl* doc, ctmbstr charenc );
|
124
|
+
ctmbstr TY_(CharEncodingName)( int encoding );
|
125
|
+
ctmbstr TY_(CharEncodingOptName)( int encoding );
|
126
|
+
|
127
|
+
/* void SetEmacsFilename( TidyDocImpl* doc, ctmbstr filename ); */
|
128
|
+
|
129
|
+
|
130
|
+
#ifdef _DEBUG
|
131
|
+
|
132
|
+
/* Debug lookup functions will be type-safe and assert option type match */
|
133
|
+
ulong TY_(_cfgGet)( TidyDocImpl* doc, TidyOptionId optId );
|
134
|
+
Bool TY_(_cfgGetBool)( TidyDocImpl* doc, TidyOptionId optId );
|
135
|
+
TidyTriState TY_(_cfgGetAutoBool)( TidyDocImpl* doc, TidyOptionId optId );
|
136
|
+
ctmbstr TY_(_cfgGetString)( TidyDocImpl* doc, TidyOptionId optId );
|
137
|
+
|
138
|
+
#define cfg(doc, id) TY_(_cfgGet)( (doc), (id) )
|
139
|
+
#define cfgBool(doc, id) TY_(_cfgGetBool)( (doc), (id) )
|
140
|
+
#define cfgAutoBool(doc, id) TY_(_cfgGetAutoBool)( (doc), (id) )
|
141
|
+
#define cfgStr(doc, id) TY_(_cfgGetString)( (doc), (id) )
|
142
|
+
|
143
|
+
#else
|
144
|
+
|
145
|
+
/* Release build macros for speed */
|
146
|
+
#define cfg(doc, id) ((doc)->config.value[ (id) ].v)
|
147
|
+
#define cfgBool(doc, id) ((Bool) cfg(doc, id))
|
148
|
+
#define cfgAutoBool(doc, id) ((TidyTriState) cfg(doc, id))
|
149
|
+
#define cfgStr(doc, id) ((ctmbstr) (doc)->config.value[ (id) ].p)
|
150
|
+
|
151
|
+
#endif /* _DEBUG */
|
152
|
+
|
153
|
+
#endif /* __CONFIG_H__ */
|
data/ext/tidy/entities.c
ADDED
@@ -0,0 +1,419 @@
|
|
1
|
+
/* entities.c -- recognize HTML ISO entities
|
2
|
+
|
3
|
+
(c) 1998-2008 (W3C) MIT, ERCIM, Keio University
|
4
|
+
See tidy.h for the copyright notice.
|
5
|
+
|
6
|
+
CVS Info :
|
7
|
+
|
8
|
+
$Author: hoehrmann $
|
9
|
+
$Date: 2008/08/09 11:55:27 $
|
10
|
+
$Revision: 1.19 $
|
11
|
+
|
12
|
+
Entity handling can be static because there are no config or
|
13
|
+
document-specific values. Lookup table is 100% defined at
|
14
|
+
compile time.
|
15
|
+
|
16
|
+
*/
|
17
|
+
|
18
|
+
#include <stdio.h>
|
19
|
+
#include "entities.h"
|
20
|
+
#include "tidy-int.h"
|
21
|
+
#include "tmbstr.h"
|
22
|
+
|
23
|
+
struct _entity;
|
24
|
+
typedef struct _entity entity;
|
25
|
+
|
26
|
+
struct _entity
|
27
|
+
{
|
28
|
+
ctmbstr name;
|
29
|
+
uint versions;
|
30
|
+
uint code;
|
31
|
+
};
|
32
|
+
|
33
|
+
|
34
|
+
static const entity entities[] =
|
35
|
+
{
|
36
|
+
/*
|
37
|
+
** Markup pre-defined character entities
|
38
|
+
*/
|
39
|
+
{ "quot", VERS_ALL|VERS_XML, 34 },
|
40
|
+
{ "amp", VERS_ALL|VERS_XML, 38 },
|
41
|
+
{ "apos", VERS_FROM40|VERS_XML, 39 },
|
42
|
+
{ "lt", VERS_ALL|VERS_XML, 60 },
|
43
|
+
{ "gt", VERS_ALL|VERS_XML, 62 },
|
44
|
+
|
45
|
+
/*
|
46
|
+
** Latin-1 character entities
|
47
|
+
*/
|
48
|
+
{ "nbsp", VERS_ALL, 160 },
|
49
|
+
{ "iexcl", VERS_ALL, 161 },
|
50
|
+
{ "cent", VERS_ALL, 162 },
|
51
|
+
{ "pound", VERS_ALL, 163 },
|
52
|
+
{ "curren", VERS_ALL, 164 },
|
53
|
+
{ "yen", VERS_ALL, 165 },
|
54
|
+
{ "brvbar", VERS_ALL, 166 },
|
55
|
+
{ "sect", VERS_ALL, 167 },
|
56
|
+
{ "uml", VERS_ALL, 168 },
|
57
|
+
{ "copy", VERS_ALL, 169 },
|
58
|
+
{ "ordf", VERS_ALL, 170 },
|
59
|
+
{ "laquo", VERS_ALL, 171 },
|
60
|
+
{ "not", VERS_ALL, 172 },
|
61
|
+
{ "shy", VERS_ALL, 173 },
|
62
|
+
{ "reg", VERS_ALL, 174 },
|
63
|
+
{ "macr", VERS_ALL, 175 },
|
64
|
+
{ "deg", VERS_ALL, 176 },
|
65
|
+
{ "plusmn", VERS_ALL, 177 },
|
66
|
+
{ "sup2", VERS_ALL, 178 },
|
67
|
+
{ "sup3", VERS_ALL, 179 },
|
68
|
+
{ "acute", VERS_ALL, 180 },
|
69
|
+
{ "micro", VERS_ALL, 181 },
|
70
|
+
{ "para", VERS_ALL, 182 },
|
71
|
+
{ "middot", VERS_ALL, 183 },
|
72
|
+
{ "cedil", VERS_ALL, 184 },
|
73
|
+
{ "sup1", VERS_ALL, 185 },
|
74
|
+
{ "ordm", VERS_ALL, 186 },
|
75
|
+
{ "raquo", VERS_ALL, 187 },
|
76
|
+
{ "frac14", VERS_ALL, 188 },
|
77
|
+
{ "frac12", VERS_ALL, 189 },
|
78
|
+
{ "frac34", VERS_ALL, 190 },
|
79
|
+
{ "iquest", VERS_ALL, 191 },
|
80
|
+
{ "Agrave", VERS_ALL, 192 },
|
81
|
+
{ "Aacute", VERS_ALL, 193 },
|
82
|
+
{ "Acirc", VERS_ALL, 194 },
|
83
|
+
{ "Atilde", VERS_ALL, 195 },
|
84
|
+
{ "Auml", VERS_ALL, 196 },
|
85
|
+
{ "Aring", VERS_ALL, 197 },
|
86
|
+
{ "AElig", VERS_ALL, 198 },
|
87
|
+
{ "Ccedil", VERS_ALL, 199 },
|
88
|
+
{ "Egrave", VERS_ALL, 200 },
|
89
|
+
{ "Eacute", VERS_ALL, 201 },
|
90
|
+
{ "Ecirc", VERS_ALL, 202 },
|
91
|
+
{ "Euml", VERS_ALL, 203 },
|
92
|
+
{ "Igrave", VERS_ALL, 204 },
|
93
|
+
{ "Iacute", VERS_ALL, 205 },
|
94
|
+
{ "Icirc", VERS_ALL, 206 },
|
95
|
+
{ "Iuml", VERS_ALL, 207 },
|
96
|
+
{ "ETH", VERS_ALL, 208 },
|
97
|
+
{ "Ntilde", VERS_ALL, 209 },
|
98
|
+
{ "Ograve", VERS_ALL, 210 },
|
99
|
+
{ "Oacute", VERS_ALL, 211 },
|
100
|
+
{ "Ocirc", VERS_ALL, 212 },
|
101
|
+
{ "Otilde", VERS_ALL, 213 },
|
102
|
+
{ "Ouml", VERS_ALL, 214 },
|
103
|
+
{ "times", VERS_ALL, 215 },
|
104
|
+
{ "Oslash", VERS_ALL, 216 },
|
105
|
+
{ "Ugrave", VERS_ALL, 217 },
|
106
|
+
{ "Uacute", VERS_ALL, 218 },
|
107
|
+
{ "Ucirc", VERS_ALL, 219 },
|
108
|
+
{ "Uuml", VERS_ALL, 220 },
|
109
|
+
{ "Yacute", VERS_ALL, 221 },
|
110
|
+
{ "THORN", VERS_ALL, 222 },
|
111
|
+
{ "szlig", VERS_ALL, 223 },
|
112
|
+
{ "agrave", VERS_ALL, 224 },
|
113
|
+
{ "aacute", VERS_ALL, 225 },
|
114
|
+
{ "acirc", VERS_ALL, 226 },
|
115
|
+
{ "atilde", VERS_ALL, 227 },
|
116
|
+
{ "auml", VERS_ALL, 228 },
|
117
|
+
{ "aring", VERS_ALL, 229 },
|
118
|
+
{ "aelig", VERS_ALL, 230 },
|
119
|
+
{ "ccedil", VERS_ALL, 231 },
|
120
|
+
{ "egrave", VERS_ALL, 232 },
|
121
|
+
{ "eacute", VERS_ALL, 233 },
|
122
|
+
{ "ecirc", VERS_ALL, 234 },
|
123
|
+
{ "euml", VERS_ALL, 235 },
|
124
|
+
{ "igrave", VERS_ALL, 236 },
|
125
|
+
{ "iacute", VERS_ALL, 237 },
|
126
|
+
{ "icirc", VERS_ALL, 238 },
|
127
|
+
{ "iuml", VERS_ALL, 239 },
|
128
|
+
{ "eth", VERS_ALL, 240 },
|
129
|
+
{ "ntilde", VERS_ALL, 241 },
|
130
|
+
{ "ograve", VERS_ALL, 242 },
|
131
|
+
{ "oacute", VERS_ALL, 243 },
|
132
|
+
{ "ocirc", VERS_ALL, 244 },
|
133
|
+
{ "otilde", VERS_ALL, 245 },
|
134
|
+
{ "ouml", VERS_ALL, 246 },
|
135
|
+
{ "divide", VERS_ALL, 247 },
|
136
|
+
{ "oslash", VERS_ALL, 248 },
|
137
|
+
{ "ugrave", VERS_ALL, 249 },
|
138
|
+
{ "uacute", VERS_ALL, 250 },
|
139
|
+
{ "ucirc", VERS_ALL, 251 },
|
140
|
+
{ "uuml", VERS_ALL, 252 },
|
141
|
+
{ "yacute", VERS_ALL, 253 },
|
142
|
+
{ "thorn", VERS_ALL, 254 },
|
143
|
+
{ "yuml", VERS_ALL, 255 },
|
144
|
+
|
145
|
+
/*
|
146
|
+
** Extended Entities defined in HTML 4: Symbols
|
147
|
+
*/
|
148
|
+
{ "fnof", VERS_FROM40, 402 },
|
149
|
+
{ "Alpha", VERS_FROM40, 913 },
|
150
|
+
{ "Beta", VERS_FROM40, 914 },
|
151
|
+
{ "Gamma", VERS_FROM40, 915 },
|
152
|
+
{ "Delta", VERS_FROM40, 916 },
|
153
|
+
{ "Epsilon", VERS_FROM40, 917 },
|
154
|
+
{ "Zeta", VERS_FROM40, 918 },
|
155
|
+
{ "Eta", VERS_FROM40, 919 },
|
156
|
+
{ "Theta", VERS_FROM40, 920 },
|
157
|
+
{ "Iota", VERS_FROM40, 921 },
|
158
|
+
{ "Kappa", VERS_FROM40, 922 },
|
159
|
+
{ "Lambda", VERS_FROM40, 923 },
|
160
|
+
{ "Mu", VERS_FROM40, 924 },
|
161
|
+
{ "Nu", VERS_FROM40, 925 },
|
162
|
+
{ "Xi", VERS_FROM40, 926 },
|
163
|
+
{ "Omicron", VERS_FROM40, 927 },
|
164
|
+
{ "Pi", VERS_FROM40, 928 },
|
165
|
+
{ "Rho", VERS_FROM40, 929 },
|
166
|
+
{ "Sigma", VERS_FROM40, 931 },
|
167
|
+
{ "Tau", VERS_FROM40, 932 },
|
168
|
+
{ "Upsilon", VERS_FROM40, 933 },
|
169
|
+
{ "Phi", VERS_FROM40, 934 },
|
170
|
+
{ "Chi", VERS_FROM40, 935 },
|
171
|
+
{ "Psi", VERS_FROM40, 936 },
|
172
|
+
{ "Omega", VERS_FROM40, 937 },
|
173
|
+
{ "alpha", VERS_FROM40, 945 },
|
174
|
+
{ "beta", VERS_FROM40, 946 },
|
175
|
+
{ "gamma", VERS_FROM40, 947 },
|
176
|
+
{ "delta", VERS_FROM40, 948 },
|
177
|
+
{ "epsilon", VERS_FROM40, 949 },
|
178
|
+
{ "zeta", VERS_FROM40, 950 },
|
179
|
+
{ "eta", VERS_FROM40, 951 },
|
180
|
+
{ "theta", VERS_FROM40, 952 },
|
181
|
+
{ "iota", VERS_FROM40, 953 },
|
182
|
+
{ "kappa", VERS_FROM40, 954 },
|
183
|
+
{ "lambda", VERS_FROM40, 955 },
|
184
|
+
{ "mu", VERS_FROM40, 956 },
|
185
|
+
{ "nu", VERS_FROM40, 957 },
|
186
|
+
{ "xi", VERS_FROM40, 958 },
|
187
|
+
{ "omicron", VERS_FROM40, 959 },
|
188
|
+
{ "pi", VERS_FROM40, 960 },
|
189
|
+
{ "rho", VERS_FROM40, 961 },
|
190
|
+
{ "sigmaf", VERS_FROM40, 962 },
|
191
|
+
{ "sigma", VERS_FROM40, 963 },
|
192
|
+
{ "tau", VERS_FROM40, 964 },
|
193
|
+
{ "upsilon", VERS_FROM40, 965 },
|
194
|
+
{ "phi", VERS_FROM40, 966 },
|
195
|
+
{ "chi", VERS_FROM40, 967 },
|
196
|
+
{ "psi", VERS_FROM40, 968 },
|
197
|
+
{ "omega", VERS_FROM40, 969 },
|
198
|
+
{ "thetasym", VERS_FROM40, 977 },
|
199
|
+
{ "upsih", VERS_FROM40, 978 },
|
200
|
+
{ "piv", VERS_FROM40, 982 },
|
201
|
+
{ "bull", VERS_FROM40, 8226 },
|
202
|
+
{ "hellip", VERS_FROM40, 8230 },
|
203
|
+
{ "prime", VERS_FROM40, 8242 },
|
204
|
+
{ "Prime", VERS_FROM40, 8243 },
|
205
|
+
{ "oline", VERS_FROM40, 8254 },
|
206
|
+
{ "frasl", VERS_FROM40, 8260 },
|
207
|
+
{ "weierp", VERS_FROM40, 8472 },
|
208
|
+
{ "image", VERS_FROM40, 8465 },
|
209
|
+
{ "real", VERS_FROM40, 8476 },
|
210
|
+
{ "trade", VERS_FROM40, 8482 },
|
211
|
+
{ "alefsym", VERS_FROM40, 8501 },
|
212
|
+
{ "larr", VERS_FROM40, 8592 },
|
213
|
+
{ "uarr", VERS_FROM40, 8593 },
|
214
|
+
{ "rarr", VERS_FROM40, 8594 },
|
215
|
+
{ "darr", VERS_FROM40, 8595 },
|
216
|
+
{ "harr", VERS_FROM40, 8596 },
|
217
|
+
{ "crarr", VERS_FROM40, 8629 },
|
218
|
+
{ "lArr", VERS_FROM40, 8656 },
|
219
|
+
{ "uArr", VERS_FROM40, 8657 },
|
220
|
+
{ "rArr", VERS_FROM40, 8658 },
|
221
|
+
{ "dArr", VERS_FROM40, 8659 },
|
222
|
+
{ "hArr", VERS_FROM40, 8660 },
|
223
|
+
{ "forall", VERS_FROM40, 8704 },
|
224
|
+
{ "part", VERS_FROM40, 8706 },
|
225
|
+
{ "exist", VERS_FROM40, 8707 },
|
226
|
+
{ "empty", VERS_FROM40, 8709 },
|
227
|
+
{ "nabla", VERS_FROM40, 8711 },
|
228
|
+
{ "isin", VERS_FROM40, 8712 },
|
229
|
+
{ "notin", VERS_FROM40, 8713 },
|
230
|
+
{ "ni", VERS_FROM40, 8715 },
|
231
|
+
{ "prod", VERS_FROM40, 8719 },
|
232
|
+
{ "sum", VERS_FROM40, 8721 },
|
233
|
+
{ "minus", VERS_FROM40, 8722 },
|
234
|
+
{ "lowast", VERS_FROM40, 8727 },
|
235
|
+
{ "radic", VERS_FROM40, 8730 },
|
236
|
+
{ "prop", VERS_FROM40, 8733 },
|
237
|
+
{ "infin", VERS_FROM40, 8734 },
|
238
|
+
{ "ang", VERS_FROM40, 8736 },
|
239
|
+
{ "and", VERS_FROM40, 8743 },
|
240
|
+
{ "or", VERS_FROM40, 8744 },
|
241
|
+
{ "cap", VERS_FROM40, 8745 },
|
242
|
+
{ "cup", VERS_FROM40, 8746 },
|
243
|
+
{ "int", VERS_FROM40, 8747 },
|
244
|
+
{ "there4", VERS_FROM40, 8756 },
|
245
|
+
{ "sim", VERS_FROM40, 8764 },
|
246
|
+
{ "cong", VERS_FROM40, 8773 },
|
247
|
+
{ "asymp", VERS_FROM40, 8776 },
|
248
|
+
{ "ne", VERS_FROM40, 8800 },
|
249
|
+
{ "equiv", VERS_FROM40, 8801 },
|
250
|
+
{ "le", VERS_FROM40, 8804 },
|
251
|
+
{ "ge", VERS_FROM40, 8805 },
|
252
|
+
{ "sub", VERS_FROM40, 8834 },
|
253
|
+
{ "sup", VERS_FROM40, 8835 },
|
254
|
+
{ "nsub", VERS_FROM40, 8836 },
|
255
|
+
{ "sube", VERS_FROM40, 8838 },
|
256
|
+
{ "supe", VERS_FROM40, 8839 },
|
257
|
+
{ "oplus", VERS_FROM40, 8853 },
|
258
|
+
{ "otimes", VERS_FROM40, 8855 },
|
259
|
+
{ "perp", VERS_FROM40, 8869 },
|
260
|
+
{ "sdot", VERS_FROM40, 8901 },
|
261
|
+
{ "lceil", VERS_FROM40, 8968 },
|
262
|
+
{ "rceil", VERS_FROM40, 8969 },
|
263
|
+
{ "lfloor", VERS_FROM40, 8970 },
|
264
|
+
{ "rfloor", VERS_FROM40, 8971 },
|
265
|
+
{ "lang", VERS_FROM40, 9001 },
|
266
|
+
{ "rang", VERS_FROM40, 9002 },
|
267
|
+
{ "loz", VERS_FROM40, 9674 },
|
268
|
+
{ "spades", VERS_FROM40, 9824 },
|
269
|
+
{ "clubs", VERS_FROM40, 9827 },
|
270
|
+
{ "hearts", VERS_FROM40, 9829 },
|
271
|
+
{ "diams", VERS_FROM40, 9830 },
|
272
|
+
|
273
|
+
/*
|
274
|
+
** Extended Entities defined in HTML 4: Special (less Markup at top)
|
275
|
+
*/
|
276
|
+
{ "OElig", VERS_FROM40, 338 },
|
277
|
+
{ "oelig", VERS_FROM40, 339 },
|
278
|
+
{ "Scaron", VERS_FROM40, 352 },
|
279
|
+
{ "scaron", VERS_FROM40, 353 },
|
280
|
+
{ "Yuml", VERS_FROM40, 376 },
|
281
|
+
{ "circ", VERS_FROM40, 710 },
|
282
|
+
{ "tilde", VERS_FROM40, 732 },
|
283
|
+
{ "ensp", VERS_FROM40, 8194 },
|
284
|
+
{ "emsp", VERS_FROM40, 8195 },
|
285
|
+
{ "thinsp", VERS_FROM40, 8201 },
|
286
|
+
{ "zwnj", VERS_FROM40, 8204 },
|
287
|
+
{ "zwj", VERS_FROM40, 8205 },
|
288
|
+
{ "lrm", VERS_FROM40, 8206 },
|
289
|
+
{ "rlm", VERS_FROM40, 8207 },
|
290
|
+
{ "ndash", VERS_FROM40, 8211 },
|
291
|
+
{ "mdash", VERS_FROM40, 8212 },
|
292
|
+
{ "lsquo", VERS_FROM40, 8216 },
|
293
|
+
{ "rsquo", VERS_FROM40, 8217 },
|
294
|
+
{ "sbquo", VERS_FROM40, 8218 },
|
295
|
+
{ "ldquo", VERS_FROM40, 8220 },
|
296
|
+
{ "rdquo", VERS_FROM40, 8221 },
|
297
|
+
{ "bdquo", VERS_FROM40, 8222 },
|
298
|
+
{ "dagger", VERS_FROM40, 8224 },
|
299
|
+
{ "Dagger", VERS_FROM40, 8225 },
|
300
|
+
{ "permil", VERS_FROM40, 8240 },
|
301
|
+
{ "lsaquo", VERS_FROM40, 8249 },
|
302
|
+
{ "rsaquo", VERS_FROM40, 8250 },
|
303
|
+
{ "euro", VERS_FROM40, 8364 },
|
304
|
+
{ NULL, VERS_UNKNOWN, 0 }
|
305
|
+
};
|
306
|
+
|
307
|
+
|
308
|
+
/* Pure static implementation. Trades off lookup speed
|
309
|
+
** for faster setup time (well, none actually).
|
310
|
+
** Optimization of comparing 1st character buys enough
|
311
|
+
** speed that hash doesn't improve things without > 500
|
312
|
+
** items in list.
|
313
|
+
*/
|
314
|
+
/* Scan the static entity table front to back for an entry whose name
** compares equal to s (per TY_(tmbstrcmp)).
**
**   s  - entity name without the leading '&'; may be NULL.
**
** Returns the matching table entry, or NULL when s is NULL, empty,
** or not present in the table.
*/
static const entity* entitiesLookup( ctmbstr s )
{
    const entity *cur;
    tmbchar first;

    /* Nothing to look up for a missing or empty name. */
    if ( s == NULL || *s == 0 )
        return NULL;

    first = (tmbchar) *s;

    /* The table ends with an entry whose name is NULL. */
    for ( cur = entities; cur->name != NULL; ++cur )
    {
        /* Cheap first-character test skips most full comparisons. */
        if ( first != *cur->name )
            continue;

        if ( TY_(tmbstrcmp)(s, cur->name) == 0 )
            return cur;
    }

    return NULL;
}
|
323
|
+
|
324
|
+
#if 0
|
325
|
+
/* entity starting with "&" returns zero on error */
|
326
|
+
uint EntityCode( ctmbstr name, uint versions )
|
327
|
+
{
|
328
|
+
const entity* np;
|
329
|
+
assert( name && name[0] == '&' );
|
330
|
+
|
331
|
+
/* numeric entity: name = "&#" followed by number */
|
332
|
+
if ( name[1] == '#' )
|
333
|
+
{
|
334
|
+
uint c = 0; /* zero on missing/bad number */
|
335
|
+
Bool isXml = ( (versions & VERS_XML) == VERS_XML );
|
336
|
+
|
337
|
+
/* 'x' prefix denotes hexadecimal number format */
|
338
|
+
if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
|
339
|
+
sscanf( name+3, "%x", &c );
|
340
|
+
else
|
341
|
+
sscanf( name+2, "%u", &c );
|
342
|
+
|
343
|
+
return (uint) c;
|
344
|
+
}
|
345
|
+
|
346
|
+
/* Named entity: name ="&" followed by a name */
|
347
|
+
if ( NULL != (np = entitiesLookup(name+1)) )
|
348
|
+
{
|
349
|
+
/* Only recognize entity name if version supports it. */
|
350
|
+
if ( np->versions & versions )
|
351
|
+
return np->code;
|
352
|
+
}
|
353
|
+
|
354
|
+
return 0; /* zero signifies unknown entity name */
|
355
|
+
}
|
356
|
+
#endif
|
357
|
+
|
358
|
+
/* Resolve an entity reference to its character code and the set of
** versions in which it is defined.
**
**   name     - entity text beginning with '&' (asserted).
**   isXml    - when set, only a lowercase 'x' introduces a hexadecimal
**              numeric entity, and an unknown name is reported with
**              VERS_XML instead of VERS_PROPRIETARY.
**   code     - out: character code; 0 for an unknown name or for a
**              numeric entity whose number is missing or unparsable.
**   versions - out: VERS_ALL for numeric entities, the table entry's
**              versions for a known name, otherwise VERS_XML or
**              VERS_PROPRIETARY depending on isXml.
**
** Returns yes for any numeric entity ("&#...") and for a known named
** entity; returns no for an unknown name.
*/
Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions )
{
    assert( name && name[0] == '&' );
    assert( code != NULL );
    assert( versions != NULL );

    if ( name[1] != '#' )
    {
        /* Named entity: name = "&" followed by a name */
        const entity* match = entitiesLookup( name+1 );

        if ( match == NULL )
        {
            *code = 0;
            if ( isXml )
                *versions = VERS_XML;
            else
                *versions = VERS_PROPRIETARY;
            return no;
        }

        *code = match->code;
        *versions = match->versions;
        return yes;
    }
    else
    {
        /* Numeric entity: name = "&#" followed by a number */
        uint value = 0;  /* stays zero if sscanf cannot parse a number */

        /* 'x' prefix denotes hexadecimal; 'X' is accepted outside XML only */
        if ( name[2] == 'x' || (!isXml && name[2] == 'X') )
            sscanf( name+3, "%x", &value );
        else
            sscanf( name+2, "%u", &value );

        *code = value;
        *versions = VERS_ALL;
        return yes;
    }
}
|
393
|
+
|
394
|
+
|
395
|
+
/* Map a character code back to its entity name.
**
**   ch       - character code to look for.
**   versions - version bit mask; the entry is accepted only if it
**              shares at least one bit with this mask.
**
** Only the first table entry carrying the code is considered.
** Returns its name when the version masks overlap, otherwise NULL
** (also NULL when no entry carries the code).
*/
ctmbstr TY_(EntityName)( uint ch, uint versions )
{
    const entity *cur = entities;

    /* Advance to the first entry with this code, or to the terminator. */
    while ( cur->name != NULL && cur->code != ch )
        ++cur;

    if ( cur->name == NULL )
        return NULL;            /* code not in table */

    if ( cur->versions & versions )
        return cur->name;

    return NULL;                /* code found, but not for these versions */
}
|
411
|
+
|
412
|
+
/*
|
413
|
+
* local variables:
|
414
|
+
* mode: c
|
415
|
+
* indent-tabs-mode: nil
|
416
|
+
* c-basic-offset: 4
|
417
|
+
* eval: (c-set-offset 'substatement-open 0)
|
418
|
+
* end:
|
419
|
+
*/
|
data/ext/tidy/entities.h
ADDED
@@ -0,0 +1,24 @@
|
|
1
|
+
#ifndef __ENTITIES_H__
|
2
|
+
#define __ENTITIES_H__
|
3
|
+
|
4
|
+
/* entities.h -- recognize character entities
|
5
|
+
|
6
|
+
(c) 1998-2006 (W3C) MIT, ERCIM, Keio University
|
7
|
+
See tidy.h for the copyright notice.
|
8
|
+
|
9
|
+
CVS Info :
|
10
|
+
|
11
|
+
$Author: arnaud02 $
|
12
|
+
$Date: 2006/09/12 15:14:44 $
|
13
|
+
$Revision: 1.8 $
|
14
|
+
|
15
|
+
*/
|
16
|
+
|
17
|
+
#include "forward.h"
|
18
|
+
|
19
|
+
/* entity starting with "&" returns zero on error */
|
20
|
+
/* uint EntityCode( ctmbstr name, uint versions ); */
|
21
|
+
ctmbstr TY_(EntityName)( uint charCode, uint versions );
|
22
|
+
Bool TY_(EntityInfo)( ctmbstr name, Bool isXml, uint* code, uint* versions );
|
23
|
+
|
24
|
+
#endif /* __ENTITIES_H__ */
|