ox 2.14.14 → 2.14.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/ext/ox/special.c CHANGED
@@ -3,395 +3,388 @@
3
3
  * All rights reserved.
4
4
  */
5
5
 
6
- #include <string.h>
7
- #include <stdbool.h>
8
-
9
6
  #include "special.h"
10
7
 
8
+ #include <stdbool.h>
9
+ #include <string.h>
10
+
11
11
  /*
12
12
  u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
13
13
  u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
14
14
  u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
15
15
  u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
16
16
  */
17
- char*
18
- ox_ucs_to_utf8_chars(char *text, uint64_t u) {
19
- int reading = 0;
20
- int i;
21
- unsigned char c;
17
+ char *ox_ucs_to_utf8_chars(char *text, uint64_t u) {
18
+ int reading = 0;
19
+ int i;
20
+ unsigned char c;
22
21
 
23
22
  if (u <= 0x000000000000007FULL) {
24
- /* 0xxxxxxx */
25
- *text++ = (char)u;
23
+ /* 0xxxxxxx */
24
+ *text++ = (char)u;
26
25
  } else if (u <= 0x00000000000007FFULL) {
27
- /* 110yyyyy 10xxxxxx */
28
- *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
29
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
26
+ /* 110yyyyy 10xxxxxx */
27
+ *text++ = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
28
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
30
29
  } else if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
31
- /* 1110zzzz 10yyyyyy 10xxxxxx */
32
- *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
33
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
34
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
30
+ /* 1110zzzz 10yyyyyy 10xxxxxx */
31
+ *text++ = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
32
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
33
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
35
34
  } else if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
36
- /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
37
- *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
38
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
39
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
40
- *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
35
+ /* 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx */
36
+ *text++ = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
37
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
38
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
39
+ *text++ = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
41
40
  } else {
42
- /* assume it is UTF-8 encoded directly and not UCS */
43
- for (i = 56; 0 <= i; i -= 8) {
44
- c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
45
- if (reading) {
46
- *text++ = (char)c;
47
- } else if ('\0' != c) {
48
- *text++ = (char)c;
49
- reading = 1;
50
- }
51
- }
41
+ /* assume it is UTF-8 encoded directly and not UCS */
42
+ for (i = 56; 0 <= i; i -= 8) {
43
+ c = (unsigned char)((u >> i) & 0x00000000000000FFULL);
44
+ if (reading) {
45
+ *text++ = (char)c;
46
+ } else if ('\0' != c) {
47
+ *text++ = (char)c;
48
+ reading = 1;
49
+ }
50
+ }
52
51
  }
53
52
  return text;
54
53
  }
55
54
 
56
- #define BUCKET_SIZE 256
57
- #define BUCKET_MASK 255
55
+ #define BUCKET_SIZE 256
56
+ #define BUCKET_MASK 255
58
57
 
59
58
  typedef struct _slot {
60
- const char *key;
61
- uint64_t code;
62
- struct _slot *next;
63
- uint64_t hash;
59
+ const char *key;
60
+ uint64_t code;
61
+ struct _slot *next;
62
+ uint64_t hash;
64
63
  } *Slot;
65
64
 
66
65
  typedef struct _cache {
67
- Slot buckets[BUCKET_SIZE];
66
+ Slot buckets[BUCKET_SIZE];
68
67
  } *Cache;
69
68
 
70
- static struct _cache entity_cache;
71
- static bool inited = false;
69
+ static struct _cache entity_cache;
70
+ static bool inited = false;
72
71
 
73
72
  // HTML entities such as &amp;. NOTE(review): this table appears to be the HTML 4.01 entity set plus apos, not the complete HTML 5 named character reference list -- confirm.
74
- static struct _slot entities[] = {
75
- { "AElig", 198 }, // latin capital letter AE
76
- { "Aacute", 193 }, // latin capital letter A with acute
77
- { "Acirc", 194 }, // latin capital letter A with circumflex
78
- { "Agrave", 192 }, // latin capital letter A with grave
79
- { "Alpha", 913 }, // greek capital letter alpha, U+0391
80
- { "Aring", 197 }, // latin capital letter A with ring above
81
- { "Atilde", 195 }, // latin capital letter A with tilde
82
- { "Auml", 196 }, // latin capital letter A with diaeresis
83
- { "Beta", 914 }, // greek capital letter beta, U+0392
84
- { "Ccedil", 199 }, // latin capital letter C with cedilla
85
- { "Chi", 935 }, // greek capital letter chi, U+03A7
86
- { "Dagger", 8225 }, // double dagger, U+2021 ISOpub
87
- { "Delta", 916 }, // greek capital letter delta
88
- { "ETH", 208 }, // latin capital letter ETH, U+00D0 ISOlat1
89
- { "Eacute", 201 }, // latin capital letter E with acute
90
- { "Ecirc", 202 }, // latin capital letter E with circumflex
91
- { "Egrave", 200 }, // latin capital letter E with grave
92
- { "Epsilon", 917 }, // greek capital letter epsilon, U+0395
93
- { "Eta", 919 }, // greek capital letter eta, U+0397
94
- { "Euml", 203 }, // latin capital letter E with diaeresis
95
- { "Gamma", 915 }, // greek capital letter gamma
96
- { "Iacute", 205 }, // latin capital letter I with acute
97
- { "Icirc", 206 }, // latin capital letter I with circumflex
98
- { "Igrave", 204 }, // latin capital letter I with grave
99
- { "Iota", 921 }, // greek capital letter iota, U+0399
100
- { "Iuml", 207 }, // latin capital letter I with diaeresis
101
- { "Kappa", 922 }, // greek capital letter kappa, U+039A
102
- { "Lambda", 923 }, // greek capital letter lambda
103
- { "Mu", 924 }, // greek capital letter mu, U+039C
104
- { "Ntilde", 209 }, // latin capital letter N with tilde
105
- { "Nu", 925 }, // greek capital letter nu, U+039D
106
- { "OElig", 338 }, // - latin capital ligature OE
107
- { "Oacute", 211 }, // latin capital letter O with acute
108
- { "Ocirc", 212 }, // latin capital letter O with circumflex
109
- { "Ograve", 210 }, // latin capital letter O with grave
110
- { "Omega", 937 }, // greek capital letter omega
111
- { "Omicron", 927 }, // greek capital letter omicron, U+039F
112
- { "Oslash", 216 }, // latin capital letter O with stroke
113
- { "Otilde", 213 }, // latin capital letter O with tilde
114
- { "Ouml", 214 }, // latin capital letter O with diaeresis
115
- { "Phi", 934 }, // greek capital letter phi
116
- { "Pi", 928 }, // greek capital letter pi, U+03A0 ISOgrk3
117
- { "Prime", 8243 }, // double prime = seconds = inches
118
- { "Psi", 936 }, // greek capital letter psi
119
- { "Rho", 929 }, // greek capital letter rho, U+03A1
120
- { "Scaron", 352 }, // - latin capital letter S with caron
121
- { "Sigma", 931 }, // greek capital letter sigma
122
- { "THORN", 222 }, // latin capital letter THORN
123
- { "Tau", 932 }, // greek capital letter tau, U+03A4
124
- { "Theta", 920 }, // greek capital letter theta
125
- { "Uacute", 218 }, // latin capital letter U with acute
126
- { "Ucirc", 219 }, // latin capital letter U with circumflex
127
- { "Ugrave", 217 }, // latin capital letter U with grave
128
- { "Upsilon", 933 }, // greek capital letter upsilon
129
- { "Uuml", 220 }, // latin capital letter U with diaeresis
130
- { "Xi", 926 }, // greek capital letter xi, U+039E ISOgrk3
131
- { "Yacute", 221 }, // latin capital letter Y with acute
132
- { "Yuml", 376 }, // - latin capital letter Y with diaeresis
133
- { "Zeta", 918 }, // greek capital letter zeta, U+0396
134
- { "aacute", 225 }, // latin small letter a with acute
135
- { "acirc", 226 }, // latin small letter a with circumflex
136
- { "acute", 180 }, // acute accent = spacing acute
137
- { "aelig", 230 }, // latin small letter ae
138
- { "agrave", 224 }, // latin small letter a with grave
139
- { "alefsym", 8501 },// alef symbol = first transfinite cardinal
140
- { "alpha", 945 }, // greek small letter alpha
141
- { "amp", 38 }, // -- ampersand, U+0026 ISOnum
142
- { "and", 8743 }, // logical and = wedge, U+2227 ISOtech
143
- { "ang", 8736 }, // angle, U+2220 ISOamso
144
- { "apos", 39 }, // -- single quote
145
- { "aring", 229 }, // latin small letter a with ring above
146
- { "asymp", 8776 }, // almost equal to = asymptotic to
147
- { "atilde", 227 }, // latin small letter a with tilde
148
- { "auml", 228 }, // latin small letter a with diaeresis
149
- { "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW
150
- { "beta", 946 }, // greek small letter beta, U+03B2 ISOgrk3
151
- { "brvbar", 166 }, // broken bar = broken vertical bar
152
- { "bull", 8226 }, // bullet = black small circle
153
- { "cap", 8745 }, // intersection = cap, U+2229 ISOtech
154
- { "ccedil", 231 }, // latin small letter c with cedilla
155
- { "cedil", 184 }, // cedilla = spacing cedilla, U+00B8 ISOdia
156
- { "cent", 162 }, // cent sign, U+00A2 ISOnum
157
- { "chi", 967 }, // greek small letter chi, U+03C7 ISOgrk3
158
- { "circ", 710 }, // - modifier letter circumflex accent
159
- { "clubs", 9827 }, // black club suit = shamrock
160
- { "cong", 8773 }, // approximately equal to, U+2245 ISOtech
161
- { "copy", 169 }, // copyright sign, U+00A9 ISOnum
162
- { "crarr", 8629 }, // downwards arrow with corner leftwards
163
- { "cup", 8746 }, // union = cup, U+222A ISOtech
164
- { "curren", 164 }, // currency sign, U+00A4 ISOnum
165
- { "dArr", 8659 }, // downwards double arrow, U+21D3 ISOamsa
166
- { "dagger", 8224 }, // dagger, U+2020 ISOpub
167
- { "darr", 8595 }, // downwards arrow, U+2193 ISOnum
168
- { "deg", 176 }, // degree sign, U+00B0 ISOnum
169
- { "delta", 948 }, // greek small letter delta
170
- { "diams", 9830 }, // black diamond suit, U+2666 ISOpub
171
- { "divide", 247 }, // division sign, U+00F7 ISOnum
172
- { "eacute", 233 }, // latin small letter e with acute
173
- { "ecirc", 234 }, // latin small letter e with circumflex
174
- { "egrave", 232 }, // latin small letter e with grave
175
- { "empty", 8709 }, // empty set = null set = diameter
176
- { "emsp", 8195 }, // em space, U+2003 ISOpub
177
- { "ensp", 8194 }, // en space, U+2002 ISOpub
178
- { "epsilon", 949 }, // greek small letter epsilon
179
- { "equiv", 8801 }, // identical to, U+2261 ISOtech
180
- { "eta", 951 }, // greek small letter eta, U+03B7 ISOgrk3
181
- { "eth", 240 }, // latin small letter eth, U+00F0 ISOlat1
182
- { "euml", 235 }, // latin small letter e with diaeresis
183
- { "euro", 8364 }, // - euro sign, U+20AC NEW
184
- { "exist", 8707 }, // there exists, U+2203 ISOtech
185
- { "fnof", 402 }, // latin small f with hook = function
186
- { "forall", 8704 }, // for all, U+2200 ISOtech
187
- { "frac12", 189 }, // vulgar fraction one half
188
- { "frac14", 188 }, // vulgar fraction one quarter
189
- { "frac34", 190 }, // vulgar fraction three quarters
190
- { "frasl", 8260 }, // fraction slash, U+2044 NEW
191
- { "gamma", 947 }, // greek small letter gamma
192
- { "ge", 8805 }, // greater-than or equal to
193
- { "gt", 62 }, // -- greater-than sign, U+003E ISOnum
194
- { "hArr", 8660 }, // left right double arrow
195
- { "harr", 8596 }, // left right arrow, U+2194 ISOamsa
196
- { "hearts", 9829 }, // black heart suit = valentine
197
- { "hellip", 8230 }, // horizontal ellipsis = three dot leader
198
- { "iacute", 237 }, // latin small letter i with acute
199
- { "icirc", 238 }, // latin small letter i with circumflex
200
- { "iexcl", 161 }, // inverted exclamation mark, U+00A1 ISOnum
201
- { "igrave", 236 }, // latin small letter i with grave
202
- { "image", 8465 }, // blackletter capital I = imaginary part
203
- { "infin", 8734 }, // infinity, U+221E ISOtech
204
- { "int", 8747 }, // integral, U+222B ISOtech
205
- { "iota", 953 }, // greek small letter iota, U+03B9 ISOgrk3
206
- { "iquest", 191 }, // inverted question mark
207
- { "isin", 8712 }, // element of, U+2208 ISOtech
208
- { "iuml", 239 }, // latin small letter i with diaeresis
209
- { "kappa", 954 }, // greek small letter kappa
210
- { "lArr", 8656 }, // leftwards double arrow, U+21D0 ISOtech
211
- { "lambda", 955 }, // greek small letter lambda
212
- { "lang", 9001 }, // left-pointing angle bracket = bra
213
- { "laquo", 171 }, // left-pointing double angle quotation mark
214
- { "larr", 8592 }, // leftwards arrow, U+2190 ISOnum
215
- { "lceil", 8968 }, // left ceiling = apl upstile
216
- { "ldquo", 8220 }, // left double quotation mark
217
- { "le", 8804 }, // less-than or equal to, U+2264 ISOtech
218
- { "lfloor", 8970 }, // left floor = apl downstile
219
- { "lowast", 8727 }, // asterisk operator, U+2217 ISOtech
220
- { "loz", 9674 }, // lozenge, U+25CA ISOpub
221
- { "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070
222
- { "lsaquo", 8249 }, // single left-pointing angle quotation mark
223
- { "lsquo", 8216 }, // left single quotation mark
224
- { "lt", 60 }, // -- less-than sign, U+003C ISOnum
225
- { "macr", 175 }, // macron = spacing macron = overline
226
- { "mdash", 8212 }, // em dash, U+2014 ISOpub
227
- { "micro", 181 }, // micro sign, U+00B5 ISOnum
228
- { "middot", 183 }, // middle dot = Georgian comma
229
- { "minus", 8722 }, // minus sign, U+2212 ISOtech
230
- { "mu", 956 }, // greek small letter mu, U+03BC ISOgrk3
231
- { "nabla", 8711 }, // nabla = backward difference
232
- { "nbsp", 160 }, // no-break space = non-breaking space
233
- { "ndash", 8211 }, // en dash, U+2013 ISOpub
234
- { "ne", 8800 }, // not equal to, U+2260 ISOtech
235
- { "ni", 8715 }, // contains as member, U+220B ISOtech
236
- { "not", 172 }, // not sign, U+00AC ISOnum
237
- { "notin", 8713 }, // not an element of, U+2209 ISOtech
238
- { "nsub", 8836 }, // not a subset of, U+2284 ISOamsn
239
- { "ntilde", 241 }, // latin small letter n with tilde
240
- { "nu", 957 }, // greek small letter nu, U+03BD ISOgrk3
241
- { "oacute", 243 }, // latin small letter o with acute
242
- { "ocirc", 244 }, // latin small letter o with circumflex
243
- { "oelig", 339 }, // - latin small ligature oe, U+0153 ISOlat2
244
- { "ograve", 242 }, // latin small letter o with grave
245
- { "oline", 8254 }, // overline = spacing overscore
246
- { "omega", 969 }, // greek small letter omega
247
- { "omicron", 959 }, // greek small letter omicron, U+03BF NEW
248
- { "oplus", 8853 }, // circled plus = direct sum
249
- { "or", 8744 }, // logical or = vee, U+2228 ISOtech
250
- { "ordf", 170 }, // feminine ordinal indicator, U+00AA ISOnum
251
- { "ordm", 186 }, // masculine ordinal indicator
252
- { "oslash", 248 }, // latin small letter o with stroke
253
- { "otilde", 245 }, // latin small letter o with tilde
254
- { "otimes", 8855 }, // circled times = vector product
255
- { "ouml", 246 }, // latin small letter o with diaeresis
256
- { "para", 182 }, // pilcrow sign = paragraph sign
257
- { "part", 8706 }, // partial differential, U+2202 ISOtech
258
- { "permil", 8240 }, // per mille sign, U+2030 ISOtech
259
- { "perp", 8869 }, // up tack = orthogonal to = perpendicular
260
- { "phi", 966 }, // greek small letter phi, U+03C6 ISOgrk3
261
- { "pi", 960 }, // greek small letter pi, U+03C0 ISOgrk3
262
- { "piv", 982 }, // greek pi symbol, U+03D6 ISOgrk3
263
- { "plusmn", 177 }, // plus-minus sign = plus-or-minus sign
264
- { "pound", 163 }, // pound sign, U+00A3 ISOnum
265
- { "prime", 8242 }, // prime = minutes = feet, U+2032 ISOtech
266
- { "prod", 8719 }, // n-ary product = product sign
267
- { "prop", 8733 }, // proportional to, U+221D ISOtech
268
- { "psi", 968 }, // greek small letter psi, U+03C8 ISOgrk3
269
- { "quot", 34 }, // -- quotation mark = APL quote
270
- { "rArr", 8658 }, // rightwards double arrow
271
- { "radic", 8730 }, // square root = radical sign
272
- { "rang", 9002 }, // right-pointing angle bracket = ket
273
- { "raquo", 187 }, // right-pointing double angle quotation mark
274
- { "rarr", 8594 }, // rightwards arrow, U+2192 ISOnum
275
- { "rceil", 8969 }, // right ceiling, U+2309 ISOamsc
276
- { "rdquo", 8221 }, // right double quotation mark
277
- { "real", 8476 }, // blackletter capital R = real part symbol
278
- { "reg", 174 }, // registered sign = registered trade mark sign
279
- { "rfloor", 8971 }, // right floor, U+230B ISOamsc
280
- { "rho", 961 }, // greek small letter rho, U+03C1 ISOgrk3
281
- { "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070
282
- { "rsaquo", 8250 }, // single right-pointing angle quotation mark
283
- { "rsquo", 8217 }, // right single quotation mark
284
- { "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW
285
- { "scaron", 353 }, // - latin small letter s with caron
286
- { "sdot", 8901 }, // dot operator, U+22C5 ISOamsb
287
- { "sect", 167 }, // section sign, U+00A7 ISOnum
288
- { "shy", 173 }, // soft hyphen = discretionary hyphen
289
- { "sigma", 963 }, // greek small letter sigma
290
- { "sigmaf", 962 }, // greek small letter final sigma
291
- { "sim", 8764 }, // tilde operator = varies with = similar to
292
- { "spades", 9824 }, // black spade suit, U+2660 ISOpub
293
- { "sub", 8834 }, // subset of, U+2282 ISOtech
294
- { "sube", 8838 }, // subset of or equal to, U+2286 ISOtech
295
- { "sum", 8721 }, // n-ary summation, U+2211 ISOamsb
296
- { "sup", 8835 }, // superset of, U+2283 ISOtech
297
- { "sup1", 185 }, // superscript one = superscript digit one
298
- { "sup2", 178 }, // superscript two = superscript digit two
299
- { "sup3", 179 }, // superscript three = superscript digit three
300
- { "supe", 8839 }, // superset of or equal to
301
- { "szlig", 223 }, // latin small letter sharp s = ess-zed
302
- { "tau", 964 }, // greek small letter tau, U+03C4 ISOgrk3
303
- { "there4", 8756 }, // therefore, U+2234 ISOtech
304
- { "theta", 952 }, // greek small letter theta
305
- { "thetasym", 977 },// greek small letter theta symbol
306
- { "thinsp", 8201 }, // thin space, U+2009 ISOpub
307
- { "thorn", 254 }, // latin small letter thorn
308
- { "tilde", 732 }, // - small tilde, U+02DC ISOdia
309
- { "times", 215 }, // multiplication sign, U+00D7 ISOnum
310
- { "trade", 8482 }, // trade mark sign, U+2122 ISOnum
311
- { "uArr", 8657 }, // upwards double arrow, U+21D1 ISOamsa
312
- { "uacute", 250 }, // latin small letter u with acute
313
- { "uarr", 8593 }, // upwards arrow, U+2191 ISOnum
314
- { "ucirc", 251 }, // latin small letter u with circumflex
315
- { "ugrave", 249 }, // latin small letter u with grave
316
- { "uml", 168 }, // diaeresis = spacing diaeresis
317
- { "upsih", 978 }, // greek upsilon with hook symbol
318
- { "upsilon", 965 }, // greek small letter upsilon
319
- { "uuml", 252 }, // latin small letter u with diaeresis
320
- { "weierp", 8472 }, // script capital P = power set
321
- { "xi", 958 }, // greek small letter xi, U+03BE ISOgrk3
322
- { "yacute", 253 }, // latin small letter y with acute
323
- { "yen", 165 }, // yen sign = yuan sign, U+00A5 ISOnum
324
- { "yuml", 255 }, // latin small letter y with diaeresis
325
- { "zeta", 950 }, // greek small letter zeta, U+03B6 ISOgrk3
326
- { "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070
327
- { "zwnj", 8204 }, // zero width non-joiner
328
- { NULL, 0 },
73
+ static struct _slot entities[] = {
74
+ {"AElig", 198}, // latin capital letter AE
75
+ {"Aacute", 193}, // latin capital letter A with acute
76
+ {"Acirc", 194}, // latin capital letter A with circumflex
77
+ {"Agrave", 192}, // latin capital letter A with grave
78
+ {"Alpha", 913}, // greek capital letter alpha, U+0391
79
+ {"Aring", 197}, // latin capital letter A with ring above
80
+ {"Atilde", 195}, // latin capital letter A with tilde
81
+ {"Auml", 196}, // latin capital letter A with diaeresis
82
+ {"Beta", 914}, // greek capital letter beta, U+0392
83
+ {"Ccedil", 199}, // latin capital letter C with cedilla
84
+ {"Chi", 935}, // greek capital letter chi, U+03A7
85
+ {"Dagger", 8225}, // double dagger, U+2021 ISOpub
86
+ {"Delta", 916}, // greek capital letter delta
87
+ {"ETH", 208}, // latin capital letter ETH, U+00D0 ISOlat1
88
+ {"Eacute", 201}, // latin capital letter E with acute
89
+ {"Ecirc", 202}, // latin capital letter E with circumflex
90
+ {"Egrave", 200}, // latin capital letter E with grave
91
+ {"Epsilon", 917}, // greek capital letter epsilon, U+0395
92
+ {"Eta", 919}, // greek capital letter eta, U+0397
93
+ {"Euml", 203}, // latin capital letter E with diaeresis
94
+ {"Gamma", 915}, // greek capital letter gamma
95
+ {"Iacute", 205}, // latin capital letter I with acute
96
+ {"Icirc", 206}, // latin capital letter I with circumflex
97
+ {"Igrave", 204}, // latin capital letter I with grave
98
+ {"Iota", 921}, // greek capital letter iota, U+0399
99
+ {"Iuml", 207}, // latin capital letter I with diaeresis
100
+ {"Kappa", 922}, // greek capital letter kappa, U+039A
101
+ {"Lambda", 923}, // greek capital letter lambda
102
+ {"Mu", 924}, // greek capital letter mu, U+039C
103
+ {"Ntilde", 209}, // latin capital letter N with tilde
104
+ {"Nu", 925}, // greek capital letter nu, U+039D
105
+ {"OElig", 338}, // - latin capital ligature OE
106
+ {"Oacute", 211}, // latin capital letter O with acute
107
+ {"Ocirc", 212}, // latin capital letter O with circumflex
108
+ {"Ograve", 210}, // latin capital letter O with grave
109
+ {"Omega", 937}, // greek capital letter omega
110
+ {"Omicron", 927}, // greek capital letter omicron, U+039F
111
+ {"Oslash", 216}, // latin capital letter O with stroke
112
+ {"Otilde", 213}, // latin capital letter O with tilde
113
+ {"Ouml", 214}, // latin capital letter O with diaeresis
114
+ {"Phi", 934}, // greek capital letter phi
115
+ {"Pi", 928}, // greek capital letter pi, U+03A0 ISOgrk3
116
+ {"Prime", 8243}, // double prime = seconds = inches
117
+ {"Psi", 936}, // greek capital letter psi
118
+ {"Rho", 929}, // greek capital letter rho, U+03A1
119
+ {"Scaron", 352}, // - latin capital letter S with caron
120
+ {"Sigma", 931}, // greek capital letter sigma
121
+ {"THORN", 222}, // latin capital letter THORN
122
+ {"Tau", 932}, // greek capital letter tau, U+03A4
123
+ {"Theta", 920}, // greek capital letter theta
124
+ {"Uacute", 218}, // latin capital letter U with acute
125
+ {"Ucirc", 219}, // latin capital letter U with circumflex
126
+ {"Ugrave", 217}, // latin capital letter U with grave
127
+ {"Upsilon", 933}, // greek capital letter upsilon
128
+ {"Uuml", 220}, // latin capital letter U with diaeresis
129
+ {"Xi", 926}, // greek capital letter xi, U+039E ISOgrk3
130
+ {"Yacute", 221}, // latin capital letter Y with acute
131
+ {"Yuml", 376}, // - latin capital letter Y with diaeresis
132
+ {"Zeta", 918}, // greek capital letter zeta, U+0396
133
+ {"aacute", 225}, // latin small letter a with acute
134
+ {"acirc", 226}, // latin small letter a with circumflex
135
+ {"acute", 180}, // acute accent = spacing acute
136
+ {"aelig", 230}, // latin small letter ae
137
+ {"agrave", 224}, // latin small letter a with grave
138
+ {"alefsym", 8501}, // alef symbol = first transfinite cardinal
139
+ {"alpha", 945}, // greek small letter alpha
140
+ {"amp", 38}, // -- ampersand, U+0026 ISOnum
141
+ {"and", 8743}, // logical and = wedge, U+2227 ISOtech
142
+ {"ang", 8736}, // angle, U+2220 ISOamso
143
+ {"apos", 39}, // -- single quote
144
+ {"aring", 229}, // latin small letter a with ring above
145
+ {"asymp", 8776}, // almost equal to = asymptotic to
146
+ {"atilde", 227}, // latin small letter a with tilde
147
+ {"auml", 228}, // latin small letter a with diaeresis
148
+ {"bdquo", 8222}, // double low-9 quotation mark, U+201E NEW
149
+ {"beta", 946}, // greek small letter beta, U+03B2 ISOgrk3
150
+ {"brvbar", 166}, // broken bar = broken vertical bar
151
+ {"bull", 8226}, // bullet = black small circle
152
+ {"cap", 8745}, // intersection = cap, U+2229 ISOtech
153
+ {"ccedil", 231}, // latin small letter c with cedilla
154
+ {"cedil", 184}, // cedilla = spacing cedilla, U+00B8 ISOdia
155
+ {"cent", 162}, // cent sign, U+00A2 ISOnum
156
+ {"chi", 967}, // greek small letter chi, U+03C7 ISOgrk3
157
+ {"circ", 710}, // - modifier letter circumflex accent
158
+ {"clubs", 9827}, // black club suit = shamrock
159
+ {"cong", 8773}, // approximately equal to, U+2245 ISOtech
160
+ {"copy", 169}, // copyright sign, U+00A9 ISOnum
161
+ {"crarr", 8629}, // downwards arrow with corner leftwards
162
+ {"cup", 8746}, // union = cup, U+222A ISOtech
163
+ {"curren", 164}, // currency sign, U+00A4 ISOnum
164
+ {"dArr", 8659}, // downwards double arrow, U+21D3 ISOamsa
165
+ {"dagger", 8224}, // dagger, U+2020 ISOpub
166
+ {"darr", 8595}, // downwards arrow, U+2193 ISOnum
167
+ {"deg", 176}, // degree sign, U+00B0 ISOnum
168
+ {"delta", 948}, // greek small letter delta
169
+ {"diams", 9830}, // black diamond suit, U+2666 ISOpub
170
+ {"divide", 247}, // division sign, U+00F7 ISOnum
171
+ {"eacute", 233}, // latin small letter e with acute
172
+ {"ecirc", 234}, // latin small letter e with circumflex
173
+ {"egrave", 232}, // latin small letter e with grave
174
+ {"empty", 8709}, // empty set = null set = diameter
175
+ {"emsp", 8195}, // em space, U+2003 ISOpub
176
+ {"ensp", 8194}, // en space, U+2002 ISOpub
177
+ {"epsilon", 949}, // greek small letter epsilon
178
+ {"equiv", 8801}, // identical to, U+2261 ISOtech
179
+ {"eta", 951}, // greek small letter eta, U+03B7 ISOgrk3
180
+ {"eth", 240}, // latin small letter eth, U+00F0 ISOlat1
181
+ {"euml", 235}, // latin small letter e with diaeresis
182
+ {"euro", 8364}, // - euro sign, U+20AC NEW
183
+ {"exist", 8707}, // there exists, U+2203 ISOtech
184
+ {"fnof", 402}, // latin small f with hook = function
185
+ {"forall", 8704}, // for all, U+2200 ISOtech
186
+ {"frac12", 189}, // vulgar fraction one half
187
+ {"frac14", 188}, // vulgar fraction one quarter
188
+ {"frac34", 190}, // vulgar fraction three quarters
189
+ {"frasl", 8260}, // fraction slash, U+2044 NEW
190
+ {"gamma", 947}, // greek small letter gamma
191
+ {"ge", 8805}, // greater-than or equal to
192
+ {"gt", 62}, // -- greater-than sign, U+003E ISOnum
193
+ {"hArr", 8660}, // left right double arrow
194
+ {"harr", 8596}, // left right arrow, U+2194 ISOamsa
195
+ {"hearts", 9829}, // black heart suit = valentine
196
+ {"hellip", 8230}, // horizontal ellipsis = three dot leader
197
+ {"iacute", 237}, // latin small letter i with acute
198
+ {"icirc", 238}, // latin small letter i with circumflex
199
+ {"iexcl", 161}, // inverted exclamation mark, U+00A1 ISOnum
200
+ {"igrave", 236}, // latin small letter i with grave
201
+ {"image", 8465}, // blackletter capital I = imaginary part
202
+ {"infin", 8734}, // infinity, U+221E ISOtech
203
+ {"int", 8747}, // integral, U+222B ISOtech
204
+ {"iota", 953}, // greek small letter iota, U+03B9 ISOgrk3
205
+ {"iquest", 191}, // inverted question mark
206
+ {"isin", 8712}, // element of, U+2208 ISOtech
207
+ {"iuml", 239}, // latin small letter i with diaeresis
208
+ {"kappa", 954}, // greek small letter kappa
209
+ {"lArr", 8656}, // leftwards double arrow, U+21D0 ISOtech
210
+ {"lambda", 955}, // greek small letter lambda
211
+ {"lang", 9001}, // left-pointing angle bracket = bra
212
+ {"laquo", 171}, // left-pointing double angle quotation mark
213
+ {"larr", 8592}, // leftwards arrow, U+2190 ISOnum
214
+ {"lceil", 8968}, // left ceiling = apl upstile
215
+ {"ldquo", 8220}, // left double quotation mark
216
+ {"le", 8804}, // less-than or equal to, U+2264 ISOtech
217
+ {"lfloor", 8970}, // left floor = apl downstile
218
+ {"lowast", 8727}, // asterisk operator, U+2217 ISOtech
219
+ {"loz", 9674}, // lozenge, U+25CA ISOpub
220
+ {"lrm", 8206}, // left-to-right mark, U+200E NEW RFC 2070
221
+ {"lsaquo", 8249}, // single left-pointing angle quotation mark
222
+ {"lsquo", 8216}, // left single quotation mark
223
+ {"lt", 60}, // -- less-than sign, U+003C ISOnum
224
+ {"macr", 175}, // macron = spacing macron = overline
225
+ {"mdash", 8212}, // em dash, U+2014 ISOpub
226
+ {"micro", 181}, // micro sign, U+00B5 ISOnum
227
+ {"middot", 183}, // middle dot = Georgian comma
228
+ {"minus", 8722}, // minus sign, U+2212 ISOtech
229
+ {"mu", 956}, // greek small letter mu, U+03BC ISOgrk3
230
+ {"nabla", 8711}, // nabla = backward difference
231
+ {"nbsp", 160}, // no-break space = non-breaking space
232
+ {"ndash", 8211}, // en dash, U+2013 ISOpub
233
+ {"ne", 8800}, // not equal to, U+2260 ISOtech
234
+ {"ni", 8715}, // contains as member, U+220B ISOtech
235
+ {"not", 172}, // not sign, U+00AC ISOnum
236
+ {"notin", 8713}, // not an element of, U+2209 ISOtech
237
+ {"nsub", 8836}, // not a subset of, U+2284 ISOamsn
238
+ {"ntilde", 241}, // latin small letter n with tilde
239
+ {"nu", 957}, // greek small letter nu, U+03BD ISOgrk3
240
+ {"oacute", 243}, // latin small letter o with acute
241
+ {"ocirc", 244}, // latin small letter o with circumflex
242
+ {"oelig", 339}, // - latin small ligature oe, U+0153 ISOlat2
243
+ {"ograve", 242}, // latin small letter o with grave
244
+ {"oline", 8254}, // overline = spacing overscore
245
+ {"omega", 969}, // greek small letter omega
246
+ {"omicron", 959}, // greek small letter omicron, U+03BF NEW
247
+ {"oplus", 8853}, // circled plus = direct sum
248
+ {"or", 8744}, // logical or = vee, U+2228 ISOtech
249
+ {"ordf", 170}, // feminine ordinal indicator, U+00AA ISOnum
250
+ {"ordm", 186}, // masculine ordinal indicator
251
+ {"oslash", 248}, // latin small letter o with stroke
252
+ {"otilde", 245}, // latin small letter o with tilde
253
+ {"otimes", 8855}, // circled times = vector product
254
+ {"ouml", 246}, // latin small letter o with diaeresis
255
+ {"para", 182}, // pilcrow sign = paragraph sign
256
+ {"part", 8706}, // partial differential, U+2202 ISOtech
257
+ {"permil", 8240}, // per mille sign, U+2030 ISOtech
258
+ {"perp", 8869}, // up tack = orthogonal to = perpendicular
259
+ {"phi", 966}, // greek small letter phi, U+03C6 ISOgrk3
260
+ {"pi", 960}, // greek small letter pi, U+03C0 ISOgrk3
261
+ {"piv", 982}, // greek pi symbol, U+03D6 ISOgrk3
262
+ {"plusmn", 177}, // plus-minus sign = plus-or-minus sign
263
+ {"pound", 163}, // pound sign, U+00A3 ISOnum
264
+ {"prime", 8242}, // prime = minutes = feet, U+2032 ISOtech
265
+ {"prod", 8719}, // n-ary product = product sign
266
+ {"prop", 8733}, // proportional to, U+221D ISOtech
267
+ {"psi", 968}, // greek small letter psi, U+03C8 ISOgrk3
268
+ {"quot", 34}, // -- quotation mark = APL quote
269
+ {"rArr", 8658}, // rightwards double arrow
270
+ {"radic", 8730}, // square root = radical sign
271
+ {"rang", 9002}, // right-pointing angle bracket = ket
272
+ {"raquo", 187}, // right-pointing double angle quotation mark
273
+ {"rarr", 8594}, // rightwards arrow, U+2192 ISOnum
274
+ {"rceil", 8969}, // right ceiling, U+2309 ISOamsc
275
+ {"rdquo", 8221}, // right double quotation mark
276
+ {"real", 8476}, // blackletter capital R = real part symbol
277
+ {"reg", 174}, // registered sign = registered trade mark sign
278
+ {"rfloor", 8971}, // right floor, U+230B ISOamsc
279
+ {"rho", 961}, // greek small letter rho, U+03C1 ISOgrk3
280
+ {"rlm", 8207}, // right-to-left mark, U+200F NEW RFC 2070
281
+ {"rsaquo", 8250}, // single right-pointing angle quotation mark
282
+ {"rsquo", 8217}, // right single quotation mark
283
+ {"sbquo", 8218}, // single low-9 quotation mark, U+201A NEW
284
+ {"scaron", 353}, // - latin small letter s with caron
285
+ {"sdot", 8901}, // dot operator, U+22C5 ISOamsb
286
+ {"sect", 167}, // section sign, U+00A7 ISOnum
287
+ {"shy", 173}, // soft hyphen = discretionary hyphen
288
+ {"sigma", 963}, // greek small letter sigma
289
+ {"sigmaf", 962}, // greek small letter final sigma
290
+ {"sim", 8764}, // tilde operator = varies with = similar to
291
+ {"spades", 9824}, // black spade suit, U+2660 ISOpub
292
+ {"sub", 8834}, // subset of, U+2282 ISOtech
293
+ {"sube", 8838}, // subset of or equal to, U+2286 ISOtech
294
+ {"sum", 8721}, // n-ary sumation, U+2211 ISOamsb
295
+ {"sup", 8835}, // superset of, U+2283 ISOtech
296
+ {"sup1", 185}, // superscript one = superscript digit one
297
+ {"sup2", 178}, // superscript two = superscript digit two
298
+ {"sup3", 179}, // superscript three = superscript digit three
299
+ {"supe", 8839}, // superset of or equal to
300
+ {"szlig", 223}, // latin small letter sharp s = ess-zed
301
+ {"tau", 964}, // greek small letter tau, U+03C4 ISOgrk3
302
+ {"there4", 8756}, // therefore, U+2234 ISOtech
303
+ {"theta", 952}, // greek small letter theta
304
+ {"thetasym", 977}, // greek small letter theta symbol
305
+ {"thinsp", 8201}, // thin space, U+2009 ISOpub
306
+ {"thorn", 254}, // latin small letter thorn
307
+ {"tilde", 732}, // - small tilde, U+02DC ISOdia
308
+ {"times", 215}, // multiplication sign, U+00D7 ISOnum
309
+ {"trade", 8482}, // trade mark sign, U+2122 ISOnum
310
+ {"uArr", 8657}, // upwards double arrow, U+21D1 ISOamsa
311
+ {"uacute", 250}, // latin small letter u with acute
312
+ {"uarr", 8593}, // upwards arrow, U+2191 ISOnum-->
313
+ {"ucirc", 251}, // latin small letter u with circumflex
314
+ {"ugrave", 249}, // latin small letter u with grave
315
+ {"uml", 168}, // diaeresis = spacing diaeresis
316
+ {"upsih", 978}, // greek upsilon with hook symbol
317
+ {"upsilon", 965}, // greek small letter upsilon
318
+ {"uuml", 252}, // latin small letter u with diaeresis
319
+ {"weierp", 8472}, // script capital P = power set
320
+ {"xi", 958}, // greek small letter xi, U+03BE ISOgrk3
321
+ {"yacute", 253}, // latin small letter y with acute
322
+ {"yen", 165}, // yen sign = yuan sign, U+00A5 ISOnum
323
+ {"yuml", 255}, // latin small letter y with diaeresis
324
+ {"zeta", 950}, // greek small letter zeta, U+03B6 ISOgrk3
325
+ {"zwj", 8205}, // zero width joiner, U+200D NEW RFC 2070
326
+ {"zwnj", 8204}, // zero width non-joiner
327
+ {NULL, 0},
329
328
  };
330
329
 
// calc_hash: hash a NUL-terminated entity name into a uint64_t.
// (Rendered as a diff hunk: '-' lines are the old layout, '+' lines the
// reformatted one; the statements are the same in both.)
//   key     - string to hash; a NULL key is accepted and hashes to 0.
//   returns - the accumulated hash: for each byte, h = 77 * h + ((byte | 0x20) - 0x2D).
// OR-ing with 0x20 maps ASCII 'A'..'Z' onto 'a'..'z', so names that differ
// only in ASCII letter case hash identically (cache_get compares keys with
// strcasecmp). The per-byte term is computed as int and can be negative for
// small byte values; it is then folded into the unsigned accumulator, which
// wraps modulo 2^64.
331
- static uint64_t
332
- calc_hash(const char *key) {
333
- uint64_t h = 0;
330
+ static uint64_t calc_hash(const char *key) {
331
+ uint64_t h = 0;
334
332
 
335
333
  if (NULL != key) {
336
- const uint8_t *k = (const uint8_t*)key;
334
+ const uint8_t *k = (const uint8_t *)key;
337
335
 
338
- for (; 0 != *k; k++) {
339
- // narrow to most used range of 0x4D (77) in size
340
- h = 77 * h + ((*k | 0x20) - 0x2D);
341
- }
336
+ for (; 0 != *k; k++) {
337
+ // narrow to most used range of 0x4D (77) in size
338
+ h = 77 * h + ((*k | 0x20) - 0x2D);
339
+ }
342
340
  }
343
341
  return h;
344
342
  }
345
343
 
// get_bucketp: map a hash value to its bucket in entity_cache.
// (Rendered as a diff hunk: '-' = old layout, '+' = reformatted layout.)
//   h       - hash value, as produced by calc_hash.
//   returns - pointer to the bucket head at index
//             BUCKET_MASK & (h ^ (h << 5) ^ (h >> 7)) of entity_cache.buckets.
// The shifted copies of h are XOR-ed in before masking, so bits outside the
// mask also affect the index.
// NOTE(review): BUCKET_MASK and the size of entity_cache.buckets are defined
// outside this view; presumably the mask is (bucket count - 1) - confirm.
346
- static Slot*
347
- get_bucketp(uint64_t h) {
344
+ static Slot *get_bucketp(uint64_t h) {
348
345
  return entity_cache.buckets + (BUCKET_MASK & (h ^ (h << 5) ^ (h >> 7)));
349
346
  }
350
347
 
// cache_set: insert one entity slot into the lookup cache.
// (Rendered as a diff hunk: '-' = old layout, '+' = reformatted layout.)
//   s - slot to insert; it is dereferenced with '->', so Slot is a pointer
//       type (declared outside this view).
// The hash of s->key is stored in s->hash, then s is pushed onto the front of
// its bucket's chain: s->next takes the previous head and the bucket head
// becomes s. No check for an existing entry with the same key is made.
// NOTE(review): h is declared int64_t while calc_hash returns uint64_t and
// get_bucketp takes uint64_t; the value round-trips through a signed type.
351
- static void
352
- cache_set(Slot s) {
353
- int64_t h = calc_hash(s->key);
354
- Slot *bucket = get_bucketp(h);
348
+ static void cache_set(Slot s) {
349
+ int64_t h = calc_hash(s->key);
350
+ Slot *bucket = get_bucketp(h);
355
351
 
356
352
  s->hash = h;
357
353
  s->next = *bucket;
358
354
  *bucket = s;
359
355
  }
360
356
 
// cache_get: find the cached slot for an entity name.
// (Rendered as a diff hunk: '-' = old layout, '+' = reformatted layout.)
//   key     - entity name to look up.
//   returns - the first slot in the key's bucket chain whose stored hash equals
//             the key's hash and whose key matches under strcasecmp (i.e.
//             case-insensitively); NULL when the chain has no such slot.
// The hash comparison comes first in the && so strcasecmp only runs on chain
// entries whose hash already matches.
361
- static Slot
362
- cache_get(const char *key) {
363
- int64_t h = calc_hash(key);
364
- Slot *bucket = get_bucketp(h);
365
- Slot s;
357
+ static Slot cache_get(const char *key) {
358
+ int64_t h = calc_hash(key);
359
+ Slot *bucket = get_bucketp(h);
360
+ Slot s;
366
361
 
367
362
  for (s = *bucket; NULL != s; s = s->next) {
368
- if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
369
- return s;
370
- }
363
+ if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
364
+ return s;
365
+ }
371
366
  }
372
367
  return NULL;
373
368
  }
374
369
 
// cache_init: build the entity lookup cache from the static entities table.
// (Rendered as a diff hunk: '-' = old layout, '+' = reformatted layout.)
// Zeroes entity_cache, then passes every entry of entities to cache_set,
// stopping at the terminating entry whose key is NULL, and finally sets the
// file-level flag inited to true.
// NOTE(review): declared with an empty parameter list "()" rather than
// "(void)"; before C23 that leaves the parameters unspecified.
375
- static void
376
- cache_init() {
377
- Slot e = entities;
370
+ static void cache_init() {
371
+ Slot e = entities;
378
372
 
379
373
  memset(&entity_cache, 0, sizeof(struct _cache));
380
374
  for (; NULL != e->key; e++) {
381
- cache_set(e);
375
+ cache_set(e);
382
376
  }
383
377
  inited = true;
384
378
  }
379
 
386
- char*
387
- ox_entity_lookup(char *text, const char *key) {
388
- Slot s = entities;
380
+ char *ox_entity_lookup(char *text, const char *key) {
381
+ Slot s = entities;
389
382
 
390
383
  if (!inited) {
391
- cache_init();
384
+ cache_init();
392
385
  }
393
386
  if (NULL == (s = cache_get(key))) {
394
- return NULL;
387
+ return NULL;
395
388
  }
396
389
  return ox_ucs_to_utf8_chars(text, s->code);
397
390
  }