ox 2.14.14 → 2.14.17

Sign up to get free protection for your applications and to get access to all the features.
data/ext/ox/special.c CHANGED
@@ -3,395 +3,388 @@
3
3
  * All rights reserved.
4
4
  */
5
5
 
6
- #include <string.h>
7
- #include <stdbool.h>
8
-
9
6
  #include "special.h"
10
7
 
8
+ #include <stdbool.h>
9
+ #include <string.h>
10
+
11
11
  /*
12
12
  u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
13
13
  u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
14
14
  u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
15
15
  u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
16
16
  */
17
- char*
18
- ox_ucs_to_utf8_chars(char *text, uint64_t u) {
19
- int reading = 0;
20
- int i;
21
- unsigned char c;
17
+ char *ox_ucs_to_utf8_chars(char *text, uint64_t u) {
18
+ int reading = 0;
19
+ int i;
20
+ unsigned char c;
22
21
 
23
22
  if (u <= 0x000000000000007FULL) {
24
- /* 0xxxxxxx */
25
- *text++ = (char)u;
23
+ /* 0xxxxxxx */
24
+ *text++ = (char)u;
26
25
  } else if (u <= 0x00000000000007FFULL) {
27
- /* 110yyyyy 10xxxxxx */
28
- *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
29
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
26
+ /* 110yyyyy 10xxxxxx */
27
+ *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
28
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
30
29
  } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
31
- /* 1110zzzz 10yyyyyy 10xxxxxx */
32
- *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
33
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
34
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
30
+ /* 1110zzzz 10yyyyyy 10xxxxxx */
31
+ *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
32
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
33
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
35
34
  } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
36
- /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
37
- *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
38
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
39
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
40
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
35
+ /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
36
+ *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
37
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
38
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
39
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
41
40
  } else {
42
- /* assume it is UTF-8 encoded directly and not UCS */
43
- for (i = 56; 0 <= i; i -= 8) {
44
- c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
45
- if (reading) {
46
- *text++ = (char)c;
47
- } else if ('\0' != c) {
48
- *text++ = (char)c;
49
- reading = 1;
50
- }
51
- }
41
+ /* assume it is UTF-8 encoded directly and not UCS */
42
+ for (i = 56; 0 <= i; i -= 8) {
43
+ c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
44
+ if (reading) {
45
+ *text++ = (char)c;
46
+ } else if ('\0' != c) {
47
+ *text++ = (char)c;
48
+ reading = 1;
49
+ }
50
+ }
52
51
  }
53
52
  return text;
54
53
  }
55
54
 
56
- #define BUCKET_SIZE 256
57
- #define BUCKET_MASK 255
55
+ #define BUCKET_SIZE 256
56
+ #define BUCKET_MASK 255
58
57
 
59
58
  typedef struct _slot {
60
- const char *key;
61
- uint64_t code;
62
- struct _slot *next;
63
- uint64_t hash;
59
+ const char *key;
60
+ uint64_t code;
61
+ struct _slot *next;
62
+ uint64_t hash;
64
63
  } *Slot;
65
64
 
66
65
  typedef struct _cache {
67
- Slot buckets[BUCKET_SIZE];
66
+ Slot buckets[BUCKET_SIZE];
68
67
  } *Cache;
69
68
 
70
- static struct _cache entity_cache;
71
- static bool inited = false;
69
+ static struct _cache entity_cache;
70
+ static bool inited = false;
72
71
 
73
72
  // HTML entities such as &amp;. This is the HTML 4 named entity set plus &apos;, not the full HTML 5 list.
74
- static struct _slot entities[] = {
75
- { "AElig", 198 }, // latin capital letter AE
76
- { "Aacute", 193 }, // latin capital letter A with acute
77
- { "Acirc", 194 }, // latin capital letter A with circumflex
78
- { "Agrave", 192 }, // latin capital letter A with grave
79
- { "Alpha", 913 }, // greek capital letter alpha, U+0391
80
- { "Aring", 197 }, // latin capital letter A with ring above
81
- { "Atilde", 195 }, // latin capital letter A with tilde
82
- { "Auml", 196 }, // latin capital letter A with diaeresis
83
- { "Beta", 914 }, // greek capital letter beta, U+0392
84
- { "Ccedil", 199 }, // latin capital letter C with cedilla
85
- { "Chi", 935 }, // greek capital letter chi, U+03A7
86
- { "Dagger", 8225 }, // double dagger, U+2021 ISOpub
87
- { "Delta", 916 }, // greek capital letter delta
88
- { "ETH", 208 }, // latin capital letter ETH, U+00D0 ISOlat1
89
- { "Eacute", 201 }, // latin capital letter E with acute
90
- { "Ecirc", 202 }, // latin capital letter E with circumflex
91
- { "Egrave", 200 }, // latin capital letter E with grave
92
- { "Epsilon", 917 }, // greek capital letter epsilon, U+0395
93
- { "Eta", 919 }, // greek capital letter eta, U+0397
94
- { "Euml", 203 }, // latin capital letter E with diaeresis
95
- { "Gamma", 915 }, // greek capital letter gamma
96
- { "Iacute", 205 }, // latin capital letter I with acute
97
- { "Icirc", 206 }, // latin capital letter I with circumflex
98
- { "Igrave", 204 }, // latin capital letter I with grave
99
- { "Iota", 921 }, // greek capital letter iota, U+0399
100
- { "Iuml", 207 }, // latin capital letter I with diaeresis
101
- { "Kappa", 922 }, // greek capital letter kappa, U+039A
102
- { "Lambda", 923 }, // greek capital letter lambda
103
- { "Mu", 924 }, // greek capital letter mu, U+039C
104
- { "Ntilde", 209 }, // latin capital letter N with tilde
105
- { "Nu", 925 }, // greek capital letter nu, U+039D
106
- { "OElig", 338 }, // - latin capital ligature OE
107
- { "Oacute", 211 }, // latin capital letter O with acute
108
- { "Ocirc", 212 }, // latin capital letter O with circumflex
109
- { "Ograve", 210 }, // latin capital letter O with grave
110
- { "Omega", 937 }, // greek capital letter omega
111
- { "Omicron", 927 }, // greek capital letter omicron, U+039F
112
- { "Oslash", 216 }, // latin capital letter O with stroke
113
- { "Otilde", 213 }, // latin capital letter O with tilde
114
- { "Ouml", 214 }, // latin capital letter O with diaeresis
115
- { "Phi", 934 }, // greek capital letter phi
116
- { "Pi", 928 }, // greek capital letter pi, U+03A0 ISOgrk3
117
- { "Prime", 8243 }, // double prime = seconds = inches
118
- { "Psi", 936 }, // greek capital letter psi
119
- { "Rho", 929 }, // greek capital letter rho, U+03A1
120
- { "Scaron", 352 }, // - latin capital letter S with caron
121
- { "Sigma", 931 }, // greek capital letter sigma
122
- { "THORN", 222 }, // latin capital letter THORN
123
- { "Tau", 932 }, // greek capital letter tau, U+03A4
124
- { "Theta", 920 }, // greek capital letter theta
125
- { "Uacute", 218 }, // latin capital letter U with acute
126
- { "Ucirc", 219 }, // latin capital letter U with circumflex
127
- { "Ugrave", 217 }, // latin capital letter U with grave
128
- { "Upsilon", 933 }, // greek capital letter upsilon
129
- { "Uuml", 220 }, // latin capital letter U with diaeresis
130
- { "Xi", 926 }, // greek capital letter xi, U+039E ISOgrk3
131
- { "Yacute", 221 }, // latin capital letter Y with acute
132
- { "Yuml", 376 }, // - latin capital letter Y with diaeresis
133
- { "Zeta", 918 }, // greek capital letter zeta, U+0396
134
- { "aacute", 225 }, // latin small letter a with acute
135
- { "acirc", 226 }, // latin small letter a with circumflex
136
- { "acute", 180 }, // acute accent = spacing acute
137
- { "aelig", 230 }, // latin small letter ae
138
- { "agrave", 224 }, // latin small letter a with grave
139
- { "alefsym", 8501 },// alef symbol = first transfinite cardinal
140
- { "alpha", 945 }, // greek small letter alpha
141
- { "amp", 38 }, // -- ampersand, U+0026 ISOnum
142
- { "and", 8743 }, // logical and = wedge, U+2227 ISOtech
143
- { "ang", 8736 }, // angle, U+2220 ISOamso
144
- { "apos", 39 }, // -- single quote
145
- { "aring", 229 }, // latin small letter a with ring above
146
- { "asymp", 8776 }, // almost equal to = asymptotic to
147
- { "atilde", 227 }, // latin small letter a with tilde
148
- { "auml", 228 }, // latin small letter a with diaeresis
149
- { "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW
150
- { "beta", 946 }, // greek small letter beta, U+03B2 ISOgrk3
151
- { "brvbar", 166 }, // broken bar = broken vertical bar
152
- { "bull", 8226 }, // bullet = black small circle
153
- { "cap", 8745 }, // intersection = cap, U+2229 ISOtech
154
- { "ccedil", 231 }, // latin small letter c with cedilla
155
- { "cedil", 184 }, // cedilla = spacing cedilla, U+00B8 ISOdia
156
- { "cent", 162 }, // cent sign, U+00A2 ISOnum
157
- { "chi", 967 }, // greek small letter chi, U+03C7 ISOgrk3
158
- { "circ", 710 }, // - modifier letter circumflex accent
159
- { "clubs", 9827 }, // black club suit = shamrock
160
- { "cong", 8773 }, // approximately equal to, U+2245 ISOtech
161
- { "copy", 169 }, // copyright sign, U+00A9 ISOnum
162
- { "crarr", 8629 }, // downwards arrow with corner leftwards
163
- { "cup", 8746 }, // union = cup, U+222A ISOtech
164
- { "curren", 164 }, // currency sign, U+00A4 ISOnum
165
- { "dArr", 8659 }, // downwards double arrow, U+21D3 ISOamsa
166
- { "dagger", 8224 }, // dagger, U+2020 ISOpub
167
- { "darr", 8595 }, // downwards arrow, U+2193 ISOnum
168
- { "deg", 176 }, // degree sign, U+00B0 ISOnum
169
- { "delta", 948 }, // greek small letter delta
170
- { "diams", 9830 }, // black diamond suit, U+2666 ISOpub
171
- { "divide", 247 }, // division sign, U+00F7 ISOnum
172
- { "eacute", 233 }, // latin small letter e with acute
173
- { "ecirc", 234 }, // latin small letter e with circumflex
174
- { "egrave", 232 }, // latin small letter e with grave
175
- { "empty", 8709 }, // empty set = null set = diameter
176
- { "emsp", 8195 }, // em space, U+2003 ISOpub
177
- { "ensp", 8194 }, // en space, U+2002 ISOpub
178
- { "epsilon", 949 }, // greek small letter epsilon
179
- { "equiv", 8801 }, // identical to, U+2261 ISOtech
180
- { "eta", 951 }, // greek small letter eta, U+03B7 ISOgrk3
181
- { "eth", 240 }, // latin small letter eth, U+00F0 ISOlat1
182
- { "euml", 235 }, // latin small letter e with diaeresis
183
- { "euro", 8364 }, // - euro sign, U+20AC NEW
184
- { "exist", 8707 }, // there exists, U+2203 ISOtech
185
- { "fnof", 402 }, // latin small f with hook = function
186
- { "forall", 8704 }, // for all, U+2200 ISOtech
187
- { "frac12", 189 }, // vulgar fraction one half
188
- { "frac14", 188 }, // vulgar fraction one quarter
189
- { "frac34", 190 }, // vulgar fraction three quarters
190
- { "frasl", 8260 }, // fraction slash, U+2044 NEW
191
- { "gamma", 947 }, // greek small letter gamma
192
- { "ge", 8805 }, // greater-than or equal to
193
- { "gt", 62 }, // -- greater-than sign, U+003E ISOnum
194
- { "hArr", 8660 }, // left right double arrow
195
- { "harr", 8596 }, // left right arrow, U+2194 ISOamsa
196
- { "hearts", 9829 }, // black heart suit = valentine
197
- { "hellip", 8230 }, // horizontal ellipsis = three dot leader
198
- { "iacute", 237 }, // latin small letter i with acute
199
- { "icirc", 238 }, // latin small letter i with circumflex
200
- { "iexcl", 161 }, // inverted exclamation mark, U+00A1 ISOnum
201
- { "igrave", 236 }, // latin small letter i with grave
202
- { "image", 8465 }, // blackletter capital I = imaginary part
203
- { "infin", 8734 }, // infinity, U+221E ISOtech
204
- { "int", 8747 }, // integral, U+222B ISOtech
205
- { "iota", 953 }, // greek small letter iota, U+03B9 ISOgrk3
206
- { "iquest", 191 }, // inverted question mark
207
- { "isin", 8712 }, // element of, U+2208 ISOtech
208
- { "iuml", 239 }, // latin small letter i with diaeresis
209
- { "kappa", 954 }, // greek small letter kappa
210
- { "lArr", 8656 }, // leftwards double arrow, U+21D0 ISOtech
211
- { "lambda", 955 }, // greek small letter lambda
212
- { "lang", 9001 }, // left-pointing angle bracket = bra
213
- { "laquo", 171 }, // left-pointing double angle quotation mark
214
- { "larr", 8592 }, // leftwards arrow, U+2190 ISOnum
215
- { "lceil", 8968 }, // left ceiling = apl upstile
216
- { "ldquo", 8220 }, // left double quotation mark
217
- { "le", 8804 }, // less-than or equal to, U+2264 ISOtech
218
- { "lfloor", 8970 }, // left floor = apl downstile
219
- { "lowast", 8727 }, // asterisk operator, U+2217 ISOtech
220
- { "loz", 9674 }, // lozenge, U+25CA ISOpub
221
- { "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070
222
- { "lsaquo", 8249 }, // single left-pointing angle quotation mark
223
- { "lsquo", 8216 }, // left single quotation mark
224
- { "lt", 60 }, // -- less-than sign, U+003C ISOnum
225
- { "macr", 175 }, // macron = spacing macron = overline
226
- { "mdash", 8212 }, // em dash, U+2014 ISOpub
227
- { "micro", 181 }, // micro sign, U+00B5 ISOnum
228
- { "middot", 183 }, // middle dot = Georgian comma
229
- { "minus", 8722 }, // minus sign, U+2212 ISOtech
230
- { "mu", 956 }, // greek small letter mu, U+03BC ISOgrk3
231
- { "nabla", 8711 }, // nabla = backward difference
232
- { "nbsp", 160 }, // no-break space = non-breaking space
233
- { "ndash", 8211 }, // en dash, U+2013 ISOpub
234
- { "ne", 8800 }, // not equal to, U+2260 ISOtech
235
- { "ni", 8715 }, // contains as member, U+220B ISOtech
236
- { "not", 172 }, // not sign, U+00AC ISOnum
237
- { "notin", 8713 }, // not an element of, U+2209 ISOtech
238
- { "nsub", 8836 }, // not a subset of, U+2284 ISOamsn
239
- { "ntilde", 241 }, // latin small letter n with tilde
240
- { "nu", 957 }, // greek small letter nu, U+03BD ISOgrk3
241
- { "oacute", 243 }, // latin small letter o with acute
242
- { "ocirc", 244 }, // latin small letter o with circumflex
243
- { "oelig", 339 }, // - latin small ligature oe, U+0153 ISOlat2
244
- { "ograve", 242 }, // latin small letter o with grave
245
- { "oline", 8254 }, // overline = spacing overscore
246
- { "omega", 969 }, // greek small letter omega
247
- { "omicron", 959 }, // greek small letter omicron, U+03BF NEW
248
- { "oplus", 8853 }, // circled plus = direct sum
249
- { "or", 8744 }, // logical or = vee, U+2228 ISOtech
250
- { "ordf", 170 }, // feminine ordinal indicator, U+00AA ISOnum
251
- { "ordm", 186 }, // masculine ordinal indicator
252
- { "oslash", 248 }, // latin small letter o with stroke
253
- { "otilde", 245 }, // latin small letter o with tilde
254
- { "otimes", 8855 }, // circled times = vector product
255
- { "ouml", 246 }, // latin small letter o with diaeresis
256
- { "para", 182 }, // pilcrow sign = paragraph sign
257
- { "part", 8706 }, // partial differential, U+2202 ISOtech
258
- { "permil", 8240 }, // per mille sign, U+2030 ISOtech
259
- { "perp", 8869 }, // up tack = orthogonal to = perpendicular
260
- { "phi", 966 }, // greek small letter phi, U+03C6 ISOgrk3
261
- { "pi", 960 }, // greek small letter pi, U+03C0 ISOgrk3
262
- { "piv", 982 }, // greek pi symbol, U+03D6 ISOgrk3
263
- { "plusmn", 177 }, // plus-minus sign = plus-or-minus sign
264
- { "pound", 163 }, // pound sign, U+00A3 ISOnum
265
- { "prime", 8242 }, // prime = minutes = feet, U+2032 ISOtech
266
- { "prod", 8719 }, // n-ary product = product sign
267
- { "prop", 8733 }, // proportional to, U+221D ISOtech
268
- { "psi", 968 }, // greek small letter psi, U+03C8 ISOgrk3
269
- { "quot", 34 }, // -- quotation mark = APL quote
270
- { "rArr", 8658 }, // rightwards double arrow
271
- { "radic", 8730 }, // square root = radical sign
272
- { "rang", 9002 }, // right-pointing angle bracket = ket
273
- { "raquo", 187 }, // right-pointing double angle quotation mark
274
- { "rarr", 8594 }, // rightwards arrow, U+2192 ISOnum
275
- { "rceil", 8969 }, // right ceiling, U+2309 ISOamsc
276
- { "rdquo", 8221 }, // right double quotation mark
277
- { "real", 8476 }, // blackletter capital R = real part symbol
278
- { "reg", 174 }, // registered sign = registered trade mark sign
279
- { "rfloor", 8971 }, // right floor, U+230B ISOamsc
280
- { "rho", 961 }, // greek small letter rho, U+03C1 ISOgrk3
281
- { "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070
282
- { "rsaquo", 8250 }, // single right-pointing angle quotation mark
283
- { "rsquo", 8217 }, // right single quotation mark
284
- { "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW
285
- { "scaron", 353 }, // - latin small letter s with caron
286
- { "sdot", 8901 }, // dot operator, U+22C5 ISOamsb
287
- { "sect", 167 }, // section sign, U+00A7 ISOnum
288
- { "shy", 173 }, // soft hyphen = discretionary hyphen
289
- { "sigma", 963 }, // greek small letter sigma
290
- { "sigmaf", 962 }, // greek small letter final sigma
291
- { "sim", 8764 }, // tilde operator = varies with = similar to
292
- { "spades", 9824 }, // black spade suit, U+2660 ISOpub
293
- { "sub", 8834 }, // subset of, U+2282 ISOtech
294
- { "sube", 8838 }, // subset of or equal to, U+2286 ISOtech
295
- { "sum", 8721 }, // n-ary summation, U+2211 ISOamsb
296
- { "sup", 8835 }, // superset of, U+2283 ISOtech
297
- { "sup1", 185 }, // superscript one = superscript digit one
298
- { "sup2", 178 }, // superscript two = superscript digit two
299
- { "sup3", 179 }, // superscript three = superscript digit three
300
- { "supe", 8839 }, // superset of or equal to
301
- { "szlig", 223 }, // latin small letter sharp s = ess-zed
302
- { "tau", 964 }, // greek small letter tau, U+03C4 ISOgrk3
303
- { "there4", 8756 }, // therefore, U+2234 ISOtech
304
- { "theta", 952 }, // greek small letter theta
305
- { "thetasym", 977 },// greek small letter theta symbol
306
- { "thinsp", 8201 }, // thin space, U+2009 ISOpub
307
- { "thorn", 254 }, // latin small letter thorn
308
- { "tilde", 732 }, // - small tilde, U+02DC ISOdia
309
- { "times", 215 }, // multiplication sign, U+00D7 ISOnum
310
- { "trade", 8482 }, // trade mark sign, U+2122 ISOnum
311
- { "uArr", 8657 }, // upwards double arrow, U+21D1 ISOamsa
312
- { "uacute", 250 }, // latin small letter u with acute
313
- { "uarr", 8593 }, // upwards arrow, U+2191 ISOnum
314
- { "ucirc", 251 }, // latin small letter u with circumflex
315
- { "ugrave", 249 }, // latin small letter u with grave
316
- { "uml", 168 }, // diaeresis = spacing diaeresis
317
- { "upsih", 978 }, // greek upsilon with hook symbol
318
- { "upsilon", 965 }, // greek small letter upsilon
319
- { "uuml", 252 }, // latin small letter u with diaeresis
320
- { "weierp", 8472 }, // script capital P = power set
321
- { "xi", 958 }, // greek small letter xi, U+03BE ISOgrk3
322
- { "yacute", 253 }, // latin small letter y with acute
323
- { "yen", 165 }, // yen sign = yuan sign, U+00A5 ISOnum
324
- { "yuml", 255 }, // latin small letter y with diaeresis
325
- { "zeta", 950 }, // greek small letter zeta, U+03B6 ISOgrk3
326
- { "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070
327
- { "zwnj", 8204 }, // zero width non-joiner
328
- { NULL, 0 },
73
+ static struct _slot entities[] = {
74
+ {"AElig", 198}, // latin capital letter AE
75
+ {"Aacute", 193}, // latin capital letter A with acute
76
+ {"Acirc", 194}, // latin capital letter A with circumflex
77
+ {"Agrave", 192}, // latin capital letter A with grave
78
+ {"Alpha", 913}, // greek capital letter alpha, U+0391
79
+ {"Aring", 197}, // latin capital letter A with ring above
80
+ {"Atilde", 195}, // latin capital letter A with tilde
81
+ {"Auml", 196}, // latin capital letter A with diaeresis
82
+ {"Beta", 914}, // greek capital letter beta, U+0392
83
+ {"Ccedil", 199}, // latin capital letter C with cedilla
84
+ {"Chi", 935}, // greek capital letter chi, U+03A7
85
+ {"Dagger", 8225}, // double dagger, U+2021 ISOpub
86
+ {"Delta", 916}, // greek capital letter delta
87
+ {"ETH", 208}, // latin capital letter ETH, U+00D0 ISOlat1
88
+ {"Eacute", 201}, // latin capital letter E with acute
89
+ {"Ecirc", 202}, // latin capital letter E with circumflex
90
+ {"Egrave", 200}, // latin capital letter E with grave
91
+ {"Epsilon", 917}, // greek capital letter epsilon, U+0395
92
+ {"Eta", 919}, // greek capital letter eta, U+0397
93
+ {"Euml", 203}, // latin capital letter E with diaeresis
94
+ {"Gamma", 915}, // greek capital letter gamma
95
+ {"Iacute", 205}, // latin capital letter I with acute
96
+ {"Icirc", 206}, // latin capital letter I with circumflex
97
+ {"Igrave", 204}, // latin capital letter I with grave
98
+ {"Iota", 921}, // greek capital letter iota, U+0399
99
+ {"Iuml", 207}, // latin capital letter I with diaeresis
100
+ {"Kappa", 922}, // greek capital letter kappa, U+039A
101
+ {"Lambda", 923}, // greek capital letter lambda
102
+ {"Mu", 924}, // greek capital letter mu, U+039C
103
+ {"Ntilde", 209}, // latin capital letter N with tilde
104
+ {"Nu", 925}, // greek capital letter nu, U+039D
105
+ {"OElig", 338}, // - latin capital ligature OE
106
+ {"Oacute", 211}, // latin capital letter O with acute
107
+ {"Ocirc", 212}, // latin capital letter O with circumflex
108
+ {"Ograve", 210}, // latin capital letter O with grave
109
+ {"Omega", 937}, // greek capital letter omega
110
+ {"Omicron", 927}, // greek capital letter omicron, U+039F
111
+ {"Oslash", 216}, // latin capital letter O with stroke
112
+ {"Otilde", 213}, // latin capital letter O with tilde
113
+ {"Ouml", 214}, // latin capital letter O with diaeresis
114
+ {"Phi", 934}, // greek capital letter phi
115
+ {"Pi", 928}, // greek capital letter pi, U+03A0 ISOgrk3
116
+ {"Prime", 8243}, // double prime = seconds = inches
117
+ {"Psi", 936}, // greek capital letter psi
118
+ {"Rho", 929}, // greek capital letter rho, U+03A1
119
+ {"Scaron", 352}, // - latin capital letter S with caron
120
+ {"Sigma", 931}, // greek capital letter sigma
121
+ {"THORN", 222}, // latin capital letter THORN
122
+ {"Tau", 932}, // greek capital letter tau, U+03A4
123
+ {"Theta", 920}, // greek capital letter theta
124
+ {"Uacute", 218}, // latin capital letter U with acute
125
+ {"Ucirc", 219}, // latin capital letter U with circumflex
126
+ {"Ugrave", 217}, // latin capital letter U with grave
127
+ {"Upsilon", 933}, // greek capital letter upsilon
128
+ {"Uuml", 220}, // latin capital letter U with diaeresis
129
+ {"Xi", 926}, // greek capital letter xi, U+039E ISOgrk3
130
+ {"Yacute", 221}, // latin capital letter Y with acute
131
+ {"Yuml", 376}, // - latin capital letter Y with diaeresis
132
+ {"Zeta", 918}, // greek capital letter zeta, U+0396
133
+ {"aacute", 225}, // latin small letter a with acute
134
+ {"acirc", 226}, // latin small letter a with circumflex
135
+ {"acute", 180}, // acute accent = spacing acute
136
+ {"aelig", 230}, // latin small letter ae
137
+ {"agrave", 224}, // latin small letter a with grave
138
+ {"alefsym", 8501}, // alef symbol = first transfinite cardinal
139
+ {"alpha", 945}, // greek small letter alpha
140
+ {"amp", 38}, // -- ampersand, U+0026 ISOnum
141
+ {"and", 8743}, // logical and = wedge, U+2227 ISOtech
142
+ {"ang", 8736}, // angle, U+2220 ISOamso
143
+ {"apos", 39}, // -- single quote
144
+ {"aring", 229}, // latin small letter a with ring above
145
+ {"asymp", 8776}, // almost equal to = asymptotic to
146
+ {"atilde", 227}, // latin small letter a with tilde
147
+ {"auml", 228}, // latin small letter a with diaeresis
148
+ {"bdquo", 8222}, // double low-9 quotation mark, U+201E NEW
149
+ {"beta", 946}, // greek small letter beta, U+03B2 ISOgrk3
150
+ {"brvbar", 166}, // broken bar = broken vertical bar
151
+ {"bull", 8226}, // bullet = black small circle
152
+ {"cap", 8745}, // intersection = cap, U+2229 ISOtech
153
+ {"ccedil", 231}, // latin small letter c with cedilla
154
+ {"cedil", 184}, // cedilla = spacing cedilla, U+00B8 ISOdia
155
+ {"cent", 162}, // cent sign, U+00A2 ISOnum
156
+ {"chi", 967}, // greek small letter chi, U+03C7 ISOgrk3
157
+ {"circ", 710}, // - modifier letter circumflex accent
158
+ {"clubs", 9827}, // black club suit = shamrock
159
+ {"cong", 8773}, // approximately equal to, U+2245 ISOtech
160
+ {"copy", 169}, // copyright sign, U+00A9 ISOnum
161
+ {"crarr", 8629}, // downwards arrow with corner leftwards
162
+ {"cup", 8746}, // union = cup, U+222A ISOtech
163
+ {"curren", 164}, // currency sign, U+00A4 ISOnum
164
+ {"dArr", 8659}, // downwards double arrow, U+21D3 ISOamsa
165
+ {"dagger", 8224}, // dagger, U+2020 ISOpub
166
+ {"darr", 8595}, // downwards arrow, U+2193 ISOnum
167
+ {"deg", 176}, // degree sign, U+00B0 ISOnum
168
+ {"delta", 948}, // greek small letter delta
169
+ {"diams", 9830}, // black diamond suit, U+2666 ISOpub
170
+ {"divide", 247}, // division sign, U+00F7 ISOnum
171
+ {"eacute", 233}, // latin small letter e with acute
172
+ {"ecirc", 234}, // latin small letter e with circumflex
173
+ {"egrave", 232}, // latin small letter e with grave
174
+ {"empty", 8709}, // empty set = null set = diameter
175
+ {"emsp", 8195}, // em space, U+2003 ISOpub
176
+ {"ensp", 8194}, // en space, U+2002 ISOpub
177
+ {"epsilon", 949}, // greek small letter epsilon
178
+ {"equiv", 8801}, // identical to, U+2261 ISOtech
179
+ {"eta", 951}, // greek small letter eta, U+03B7 ISOgrk3
180
+ {"eth", 240}, // latin small letter eth, U+00F0 ISOlat1
181
+ {"euml", 235}, // latin small letter e with diaeresis
182
+ {"euro", 8364}, // - euro sign, U+20AC NEW
183
+ {"exist", 8707}, // there exists, U+2203 ISOtech
184
+ {"fnof", 402}, // latin small f with hook = function
185
+ {"forall", 8704}, // for all, U+2200 ISOtech
186
+ {"frac12", 189}, // vulgar fraction one half
187
+ {"frac14", 188}, // vulgar fraction one quarter
188
+ {"frac34", 190}, // vulgar fraction three quarters
189
+ {"frasl", 8260}, // fraction slash, U+2044 NEW
190
+ {"gamma", 947}, // greek small letter gamma
191
+ {"ge", 8805}, // greater-than or equal to
192
+ {"gt", 62}, // -- greater-than sign, U+003E ISOnum
193
+ {"hArr", 8660}, // left right double arrow
194
+ {"harr", 8596}, // left right arrow, U+2194 ISOamsa
195
+ {"hearts", 9829}, // black heart suit = valentine
196
+ {"hellip", 8230}, // horizontal ellipsis = three dot leader
197
+ {"iacute", 237}, // latin small letter i with acute
198
+ {"icirc", 238}, // latin small letter i with circumflex
199
+ {"iexcl", 161}, // inverted exclamation mark, U+00A1 ISOnum
200
+ {"igrave", 236}, // latin small letter i with grave
201
+ {"image", 8465}, // blackletter capital I = imaginary part
202
+ {"infin", 8734}, // infinity, U+221E ISOtech
203
+ {"int", 8747}, // integral, U+222B ISOtech
204
+ {"iota", 953}, // greek small letter iota, U+03B9 ISOgrk3
205
+ {"iquest", 191}, // inverted question mark
206
+ {"isin", 8712}, // element of, U+2208 ISOtech
207
+ {"iuml", 239}, // latin small letter i with diaeresis
208
+ {"kappa", 954}, // greek small letter kappa
209
+ {"lArr", 8656}, // leftwards double arrow, U+21D0 ISOtech
210
+ {"lambda", 955}, // greek small letter lambda
211
+ {"lang", 9001}, // left-pointing angle bracket = bra
212
+ {"laquo", 171}, // left-pointing double angle quotation mark
213
+ {"larr", 8592}, // leftwards arrow, U+2190 ISOnum
214
+ {"lceil", 8968}, // left ceiling = apl upstile
215
+ {"ldquo", 8220}, // left double quotation mark
216
+ {"le", 8804}, // less-than or equal to, U+2264 ISOtech
217
+ {"lfloor", 8970}, // left floor = apl downstile
218
+ {"lowast", 8727}, // asterisk operator, U+2217 ISOtech
219
+ {"loz", 9674}, // lozenge, U+25CA ISOpub
220
+ {"lrm", 8206}, // left-to-right mark, U+200E NEW RFC 2070
221
+ {"lsaquo", 8249}, // single left-pointing angle quotation mark
222
+ {"lsquo", 8216}, // left single quotation mark
223
+ {"lt", 60}, // -- less-than sign, U+003C ISOnum
224
+ {"macr", 175}, // macron = spacing macron = overline
225
+ {"mdash", 8212}, // em dash, U+2014 ISOpub
226
+ {"micro", 181}, // micro sign, U+00B5 ISOnum
227
+ {"middot", 183}, // middle dot = Georgian comma
228
+ {"minus", 8722}, // minus sign, U+2212 ISOtech
229
+ {"mu", 956}, // greek small letter mu, U+03BC ISOgrk3
230
+ {"nabla", 8711}, // nabla = backward difference
231
+ {"nbsp", 160}, // no-break space = non-breaking space
232
+ {"ndash", 8211}, // en dash, U+2013 ISOpub
233
+ {"ne", 8800}, // not equal to, U+2260 ISOtech
234
+ {"ni", 8715}, // contains as member, U+220B ISOtech
235
+ {"not", 172}, // not sign, U+00AC ISOnum
236
+ {"notin", 8713}, // not an element of, U+2209 ISOtech
237
+ {"nsub", 8836}, // not a subset of, U+2284 ISOamsn
238
+ {"ntilde", 241}, // latin small letter n with tilde
239
+ {"nu", 957}, // greek small letter nu, U+03BD ISOgrk3
240
+ {"oacute", 243}, // latin small letter o with acute
241
+ {"ocirc", 244}, // latin small letter o with circumflex
242
+ {"oelig", 339}, // - latin small ligature oe, U+0153 ISOlat2
243
+ {"ograve", 242}, // latin small letter o with grave
244
+ {"oline", 8254}, // overline = spacing overscore
245
+ {"omega", 969}, // greek small letter omega
246
+ {"omicron", 959}, // greek small letter omicron, U+03BF NEW
247
+ {"oplus", 8853}, // circled plus = direct sum
248
+ {"or", 8744}, // logical or = vee, U+2228 ISOtech
249
+ {"ordf", 170}, // feminine ordinal indicator, U+00AA ISOnum
250
+ {"ordm", 186}, // masculine ordinal indicator
251
+ {"oslash", 248}, // latin small letter o with stroke
252
+ {"otilde", 245}, // latin small letter o with tilde
253
+ {"otimes", 8855}, // circled times = vector product
254
+ {"ouml", 246}, // latin small letter o with diaeresis
255
+ {"para", 182}, // pilcrow sign = paragraph sign
256
+ {"part", 8706}, // partial differential, U+2202 ISOtech
257
+ {"permil", 8240}, // per mille sign, U+2030 ISOtech
258
+ {"perp", 8869}, // up tack = orthogonal to = perpendicular
259
+ {"phi", 966}, // greek small letter phi, U+03C6 ISOgrk3
260
+ {"pi", 960}, // greek small letter pi, U+03C0 ISOgrk3
261
+ {"piv", 982}, // greek pi symbol, U+03D6 ISOgrk3
262
+ {"plusmn", 177}, // plus-minus sign = plus-or-minus sign
263
+ {"pound", 163}, // pound sign, U+00A3 ISOnum
264
+ {"prime", 8242}, // prime = minutes = feet, U+2032 ISOtech
265
+ {"prod", 8719}, // n-ary product = product sign
266
+ {"prop", 8733}, // proportional to, U+221D ISOtech
267
+ {"psi", 968}, // greek small letter psi, U+03C8 ISOgrk3
268
+ {"quot", 34}, // -- quotation mark = APL quote
269
+ {"rArr", 8658}, // rightwards double arrow
270
+ {"radic", 8730}, // square root = radical sign
271
+ {"rang", 9002}, // right-pointing angle bracket = ket
272
+ {"raquo", 187}, // right-pointing double angle quotation mark
273
+ {"rarr", 8594}, // rightwards arrow, U+2192 ISOnum
274
+ {"rceil", 8969}, // right ceiling, U+2309 ISOamsc
275
+ {"rdquo", 8221}, // right double quotation mark
276
+ {"real", 8476}, // blackletter capital R = real part symbol
277
+ {"reg", 174}, // registered sign = registered trade mark sign
278
+ {"rfloor", 8971}, // right floor, U+230B ISOamsc
279
+ {"rho", 961}, // greek small letter rho, U+03C1 ISOgrk3
280
+ {"rlm", 8207}, // right-to-left mark, U+200F NEW RFC 2070
281
+ {"rsaquo", 8250}, // single right-pointing angle quotation mark
282
+ {"rsquo", 8217}, // right single quotation mark
283
+ {"sbquo", 8218}, // single low-9 quotation mark, U+201A NEW
284
+ {"scaron", 353}, // - latin small letter s with caron
285
+ {"sdot", 8901}, // dot operator, U+22C5 ISOamsb
286
+ {"sect", 167}, // section sign, U+00A7 ISOnum
287
+ {"shy", 173}, // soft hyphen = discretionary hyphen
288
+ {"sigma", 963}, // greek small letter sigma
289
+ {"sigmaf", 962}, // greek small letter final sigma
290
+ {"sim", 8764}, // tilde operator = varies with = similar to
291
+ {"spades", 9824}, // black spade suit, U+2660 ISOpub
292
+ {"sub", 8834}, // subset of, U+2282 ISOtech
293
+ {"sube", 8838}, // subset of or equal to, U+2286 ISOtech
294
+ {"sum", 8721}, // n-ary summation, U+2211 ISOamsb
295
+ {"sup", 8835}, // superset of, U+2283 ISOtech
296
+ {"sup1", 185}, // superscript one = superscript digit one
297
+ {"sup2", 178}, // superscript two = superscript digit two
298
+ {"sup3", 179}, // superscript three = superscript digit three
299
+ {"supe", 8839}, // superset of or equal to
300
+ {"szlig", 223}, // latin small letter sharp s = ess-zed
301
+ {"tau", 964}, // greek small letter tau, U+03C4 ISOgrk3
302
+ {"there4", 8756}, // therefore, U+2234 ISOtech
303
+ {"theta", 952}, // greek small letter theta
304
+ {"thetasym", 977}, // greek small letter theta symbol
305
+ {"thinsp", 8201}, // thin space, U+2009 ISOpub
306
+ {"thorn", 254}, // latin small letter thorn
307
+ {"tilde", 732}, // - small tilde, U+02DC ISOdia
308
+ {"times", 215}, // multiplication sign, U+00D7 ISOnum
309
+ {"trade", 8482}, // trade mark sign, U+2122 ISOnum
310
+ {"uArr", 8657}, // upwards double arrow, U+21D1 ISOamsa
311
+ {"uacute", 250}, // latin small letter u with acute
312
+ {"uarr", 8593}, // upwards arrow, U+2191 ISOnum
313
+ {"ucirc", 251}, // latin small letter u with circumflex
314
+ {"ugrave", 249}, // latin small letter u with grave
315
+ {"uml", 168}, // diaeresis = spacing diaeresis
316
+ {"upsih", 978}, // greek upsilon with hook symbol
317
+ {"upsilon", 965}, // greek small letter upsilon
318
+ {"uuml", 252}, // latin small letter u with diaeresis
319
+ {"weierp", 8472}, // script capital P = power set
320
+ {"xi", 958}, // greek small letter xi, U+03BE ISOgrk3
321
+ {"yacute", 253}, // latin small letter y with acute
322
+ {"yen", 165}, // yen sign = yuan sign, U+00A5 ISOnum
323
+ {"yuml", 255}, // latin small letter y with diaeresis
324
+ {"zeta", 950}, // greek small letter zeta, U+03B6 ISOgrk3
325
+ {"zwj", 8205}, // zero width joiner, U+200D NEW RFC 2070
326
+ {"zwnj", 8204}, // zero width non-joiner
327
+ {NULL, 0},
329
328
  };
330
329
 
331
// Computes the hash of a NUL-terminated key.
//
// Each byte is OR-ed with 0x20 (which maps ASCII upper-case letters onto their
// lower-case counterparts) and shifted down by 0x2D before being folded into a
// base-77 polynomial, so keys that differ only in ASCII letter case produce
// the same hash. Arithmetic is unsigned 64-bit and wraps on overflow.
//
// key: string to hash; may be NULL.
// Returns the hash, or 0 when key is NULL or empty.
static uint64_t calc_hash(const char *key) {
    uint64_t       hash = 0;
    const uint8_t *p;

    if (NULL == key) {
        return 0;
    }
    for (p = (const uint8_t *)key; 0 != *p; ++p) {
        // narrow to most used range of 0x4D (77) in size
        int folded = (*p | 0x20) - 0x2D;

        hash = 77 * hash + folded;
    }
    return hash;
}
345
343
 
346
// Returns the address of the bucket in entity_cache.buckets selected by h.
//
// The hash is mixed with two shifted copies of itself and then masked with
// BUCKET_MASK to form the bucket index.
static Slot *get_bucketp(uint64_t h) {
    uint64_t mixed = h ^ (h << 5) ^ (h >> 7);

    return &entity_cache.buckets[BUCKET_MASK & mixed];
}
350
347
 
351
- static void
352
- cache_set(Slot s) {
353
- int64_t h = calc_hash(s->key);
354
- Slot *bucket = get_bucketp(h);
348
+ static void cache_set(Slot s) {
349
+ int64_t h = calc_hash(s->key);
350
+ Slot *bucket = get_bucketp(h);
355
351
 
356
352
  s->hash = h;
357
353
  s->next = *bucket;
358
354
  *bucket = s;
359
355
  }
360
356
 
361
- static Slot
362
- cache_get(const char *key) {
363
- int64_t h = calc_hash(key);
364
- Slot *bucket = get_bucketp(h);
365
- Slot s;
357
+ static Slot cache_get(const char *key) {
358
+ int64_t h = calc_hash(key);
359
+ Slot *bucket = get_bucketp(h);
360
+ Slot s;
366
361
 
367
362
  for (s = *bucket; NULL != s; s = s->next) {
368
- if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
369
- return s;
370
- }
363
+ if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
364
+ return s;
365
+ }
371
366
  }
372
367
  return NULL;
373
368
  }
374
369
 
375
// Builds the entity cache.
//
// Zeroes entity_cache, inserts every element of the entities table up to (not
// including) the terminating element whose key is NULL, and then sets the
// inited flag.
static void cache_init(void) {
    Slot e;

    memset(&entity_cache, 0, sizeof(struct _cache));
    for (e = entities; NULL != e->key; e++) {
        cache_set(e);
    }
    inited = true;
}
385
379
 
386
// Converts a named character entity to UTF-8.
//
// The entity cache is built on the first call (when inited is false). The
// name is then looked up and, if found, its code point is encoded into text
// by ox_ucs_to_utf8_chars().
//
// text: destination buffer for the encoded bytes.
// key:  entity name to look up; NULL is treated as "not found".
// Returns the value returned by ox_ucs_to_utf8_chars() on success, or NULL
// when key is NULL or the name is not in the cache.
char *ox_entity_lookup(char *text, const char *key) {
    Slot s;

    if (NULL == key) {
        return NULL;
    }
    if (!inited) {
        cache_init();
    }
    if (NULL == (s = cache_get(key))) {
        return NULL;
    }
    return ox_ucs_to_utf8_chars(text, s->code);
}