ox 2.14.14 → 2.14.15
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +1 -1
- data/ext/ox/attr.h +33 -39
- data/ext/ox/base64.c +48 -42
- data/ext/ox/base64.h +4 -4
- data/ext/ox/buf.h +80 -86
- data/ext/ox/builder.c +378 -423
- data/ext/ox/cache.c +2 -2
- data/ext/ox/cache8.c +37 -40
- data/ext/ox/cache8.h +7 -7
- data/ext/ox/dump.c +838 -867
- data/ext/ox/err.c +16 -13
- data/ext/ox/err.h +11 -12
- data/ext/ox/extconf.rb +5 -5
- data/ext/ox/gen_load.c +135 -137
- data/ext/ox/hash_load.c +130 -148
- data/ext/ox/helper.h +32 -39
- data/ext/ox/intern.c +1 -2
- data/ext/ox/obj_load.c +590 -644
- data/ext/ox/ox.c +2 -2
- data/ext/ox/ox.h +5 -5
- data/ext/ox/parse.c +836 -874
- data/ext/ox/sax.c +38 -23
- data/ext/ox/sax.h +2 -2
- data/ext/ox/sax_as.c +78 -94
- data/ext/ox/sax_buf.c +85 -94
- data/ext/ox/sax_buf.h +101 -120
- data/ext/ox/sax_hint.c +175 -184
- data/ext/ox/sax_hint.h +19 -19
- data/ext/ox/sax_stack.h +59 -45
- data/ext/ox/slotcache.c +2 -2
- data/ext/ox/slotcache.h +4 -4
- data/ext/ox/special.c +320 -327
- data/ext/ox/special.h +2 -2
- data/ext/ox/type.h +19 -19
- data/lib/ox/bag.rb +13 -9
- data/lib/ox/cdata.rb +0 -2
- data/lib/ox/comment.rb +0 -2
- data/lib/ox/doctype.rb +0 -2
- data/lib/ox/document.rb +3 -5
- data/lib/ox/element.rb +41 -26
- data/lib/ox/error.rb +0 -3
- data/lib/ox/hasattrs.rb +7 -8
- data/lib/ox/instruct.rb +4 -6
- data/lib/ox/node.rb +3 -4
- data/lib/ox/raw.rb +0 -2
- data/lib/ox/sax.rb +20 -36
- data/lib/ox/version.rb +1 -2
- data/lib/ox/xmlrpc_adapter.rb +4 -6
- data/lib/ox.rb +15 -16
- metadata +6 -5
data/ext/ox/special.c
CHANGED
@@ -3,395 +3,388 @@
|
|
3
3
|
* All rights reserved.
|
4
4
|
*/
|
5
5
|
|
6
|
-
#include <string.h>
|
7
|
-
#include <stdbool.h>
|
8
|
-
|
9
6
|
#include "special.h"
|
10
7
|
|
8
|
+
#include <stdbool.h>
|
9
|
+
#include <string.h>
|
10
|
+
|
11
11
|
/*
|
12
12
|
u0000..u007F 00000000000000xxxxxxx 0xxxxxxx
|
13
13
|
u0080..u07FF 0000000000yyyyyxxxxxx 110yyyyy 10xxxxxx
|
14
14
|
u0800..uD7FF, uE000..uFFFF 00000zzzzyyyyyyxxxxxx 1110zzzz 10yyyyyy 10xxxxxx
|
15
15
|
u10000..u10FFFF uuuzzzzzzyyyyyyxxxxxx 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
|
16
16
|
*/
|
17
|
-
char*
|
18
|
-
|
19
|
-
int
|
20
|
-
|
21
|
-
unsigned char c;
|
17
|
+
// Writes the UTF-8 byte sequence for the code point u starting at text and
// returns a pointer to the byte just past the last one written. No NUL
// terminator is appended.
//
// Code points in 0x0..0xD7FF, 0xE000..0xFFFF and 0x10000..0x10FFFF are encoded
// as 1 to 4 bytes. Any other value (0xD800..0xDFFF or above 0x10FFFF) is
// treated as already being UTF-8 bytes packed into u: its bytes are copied
// most significant first with leading zero bytes dropped, which can emit up to
// 8 bytes. The caller must supply a buffer large enough for the result.
char *ox_ucs_to_utf8_chars(char *text, uint64_t u) {
    int           shift;
    unsigned char byte;

    if (u <= 0x000000000000007FULL) {
        // 0xxxxxxx
        text[0] = (char)u;
        return text + 1;
    }
    if (u <= 0x00000000000007FFULL) {
        // 110yyyyy 10xxxxxx
        text[0] = (char)(0x00000000000000C0ULL | (0x000000000000001FULL & (u >> 6)));
        text[1] = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
        return text + 2;
    }
    if (u <= 0x000000000000D7FFULL || (0x000000000000E000ULL <= u && u <= 0x000000000000FFFFULL)) {
        // 1110zzzz 10yyyyyy 10xxxxxx
        text[0] = (char)(0x00000000000000E0ULL | (0x000000000000000FULL & (u >> 12)));
        text[1] = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
        text[2] = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
        return text + 3;
    }
    if (0x0000000000010000ULL <= u && u <= 0x000000000010FFFFULL) {
        // 11110uuu 10zzzzzz 10yyyyyy 10xxxxxx
        text[0] = (char)(0x00000000000000F0ULL | (0x0000000000000007ULL & (u >> 18)));
        text[1] = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 12)));
        text[2] = (char)(0x0000000000000080ULL | (0x000000000000003FULL & (u >> 6)));
        text[3] = (char)(0x0000000000000080ULL | (0x000000000000003FULL & u));
        return text + 4;
    }
    // Not a valid code point: assume u already holds UTF-8 bytes. Skip the
    // leading zero bytes, then copy everything that follows (zeros included).
    shift = 56;
    while (0 <= shift && 0 == ((u >> shift) & 0x00000000000000FFULL)) {
        shift -= 8;
    }
    for (; 0 <= shift; shift -= 8) {
        byte    = (unsigned char)((u >> shift) & 0x00000000000000FFULL);
        *text++ = (char)byte;
    }
    return text;
}
|
55
54
|
|
56
|
-
#define BUCKET_SIZE
|
57
|
-
#define BUCKET_MASK
|
55
|
+
// Number of hash buckets in the entity cache, and the mask that reduces a
// mixed hash value to a bucket index.
#define BUCKET_SIZE 256
#define BUCKET_MASK (BUCKET_SIZE - 1)

// One named entity. Slots are chained through next inside a hash bucket.
typedef struct _slot {
    const char   *key;   // entity name
    uint64_t      code;  // value handed to ox_ucs_to_utf8_chars()
    struct _slot *next;  // next slot in the same bucket, NULL at the end
    uint64_t      hash;  // calc_hash(key), stored when the slot is inserted
} *Slot;

// Fixed-size table of bucket heads.
typedef struct _cache {
    Slot buckets[BUCKET_SIZE];
} *Cache;

// The single entity cache and the flag recording whether it has been filled.
// File-scope objects start out zeroed, so inited begins as false.
static struct _cache entity_cache;
static bool          inited;
|
72
71
|
|
73
72
|
// HTML entities such as &amp;. NOTE(review): described as a complete list from the HTML 5 spec, but the table below holds only 253 names; confirm.
|
74
|
-
static struct _slot
|
75
|
-
{
|
76
|
-
{
|
77
|
-
{
|
78
|
-
{
|
79
|
-
{
|
80
|
-
{
|
81
|
-
{
|
82
|
-
{
|
83
|
-
{
|
84
|
-
{
|
85
|
-
{
|
86
|
-
{
|
87
|
-
{
|
88
|
-
{
|
89
|
-
{
|
90
|
-
{
|
91
|
-
{
|
92
|
-
{
|
93
|
-
{
|
94
|
-
{
|
95
|
-
{
|
96
|
-
{
|
97
|
-
{
|
98
|
-
{
|
99
|
-
{
|
100
|
-
{
|
101
|
-
{
|
102
|
-
{
|
103
|
-
{
|
104
|
-
{
|
105
|
-
{
|
106
|
-
{
|
107
|
-
{
|
108
|
-
{
|
109
|
-
{
|
110
|
-
{
|
111
|
-
{
|
112
|
-
{
|
113
|
-
{
|
114
|
-
{
|
115
|
-
{
|
116
|
-
{
|
117
|
-
{
|
118
|
-
{
|
119
|
-
{
|
120
|
-
{
|
121
|
-
{
|
122
|
-
{
|
123
|
-
{
|
124
|
-
{
|
125
|
-
{
|
126
|
-
{
|
127
|
-
{
|
128
|
-
{
|
129
|
-
{
|
130
|
-
{
|
131
|
-
{
|
132
|
-
{
|
133
|
-
{
|
134
|
-
{
|
135
|
-
{
|
136
|
-
{
|
137
|
-
{
|
138
|
-
{
|
139
|
-
{
|
140
|
-
{
|
141
|
-
{
|
142
|
-
{
|
143
|
-
{
|
144
|
-
{
|
145
|
-
{
|
146
|
-
{
|
147
|
-
{
|
148
|
-
{
|
149
|
-
{
|
150
|
-
{
|
151
|
-
{
|
152
|
-
{
|
153
|
-
{
|
154
|
-
{
|
155
|
-
{
|
156
|
-
{
|
157
|
-
{
|
158
|
-
{
|
159
|
-
{
|
160
|
-
{
|
161
|
-
{
|
162
|
-
{
|
163
|
-
{
|
164
|
-
{
|
165
|
-
{
|
166
|
-
{
|
167
|
-
{
|
168
|
-
{
|
169
|
-
{
|
170
|
-
{
|
171
|
-
{
|
172
|
-
{
|
173
|
-
{
|
174
|
-
{
|
175
|
-
{
|
176
|
-
{
|
177
|
-
{
|
178
|
-
{
|
179
|
-
{
|
180
|
-
{
|
181
|
-
{
|
182
|
-
{
|
183
|
-
{
|
184
|
-
{
|
185
|
-
{
|
186
|
-
{
|
187
|
-
{
|
188
|
-
{
|
189
|
-
{
|
190
|
-
{
|
191
|
-
{
|
192
|
-
{
|
193
|
-
{
|
194
|
-
{
|
195
|
-
{
|
196
|
-
{
|
197
|
-
{
|
198
|
-
{
|
199
|
-
{
|
200
|
-
{
|
201
|
-
{
|
202
|
-
{
|
203
|
-
{
|
204
|
-
{
|
205
|
-
{
|
206
|
-
{
|
207
|
-
{
|
208
|
-
{
|
209
|
-
{
|
210
|
-
{
|
211
|
-
{
|
212
|
-
{
|
213
|
-
{
|
214
|
-
{
|
215
|
-
{
|
216
|
-
{
|
217
|
-
{
|
218
|
-
{
|
219
|
-
{
|
220
|
-
{
|
221
|
-
{
|
222
|
-
{
|
223
|
-
{
|
224
|
-
{
|
225
|
-
{
|
226
|
-
{
|
227
|
-
{
|
228
|
-
{
|
229
|
-
{
|
230
|
-
{
|
231
|
-
{
|
232
|
-
{
|
233
|
-
{
|
234
|
-
{
|
235
|
-
{
|
236
|
-
{
|
237
|
-
{
|
238
|
-
{
|
239
|
-
{
|
240
|
-
{
|
241
|
-
{
|
242
|
-
{
|
243
|
-
{
|
244
|
-
{
|
245
|
-
{
|
246
|
-
{
|
247
|
-
{
|
248
|
-
{
|
249
|
-
{
|
250
|
-
{
|
251
|
-
{
|
252
|
-
{
|
253
|
-
{
|
254
|
-
{
|
255
|
-
{
|
256
|
-
{
|
257
|
-
{
|
258
|
-
{
|
259
|
-
{
|
260
|
-
{
|
261
|
-
{
|
262
|
-
{
|
263
|
-
{
|
264
|
-
{
|
265
|
-
{
|
266
|
-
{
|
267
|
-
{
|
268
|
-
{
|
269
|
-
{
|
270
|
-
{
|
271
|
-
{
|
272
|
-
{
|
273
|
-
{
|
274
|
-
{
|
275
|
-
{
|
276
|
-
{
|
277
|
-
{
|
278
|
-
{
|
279
|
-
{
|
280
|
-
{
|
281
|
-
{
|
282
|
-
{
|
283
|
-
{
|
284
|
-
{
|
285
|
-
{
|
286
|
-
{
|
287
|
-
{
|
288
|
-
{
|
289
|
-
{
|
290
|
-
{
|
291
|
-
{
|
292
|
-
{
|
293
|
-
{
|
294
|
-
{
|
295
|
-
{
|
296
|
-
{
|
297
|
-
{
|
298
|
-
{
|
299
|
-
{
|
300
|
-
{
|
301
|
-
{
|
302
|
-
{
|
303
|
-
{
|
304
|
-
{
|
305
|
-
{
|
306
|
-
{
|
307
|
-
{
|
308
|
-
{
|
309
|
-
{
|
310
|
-
{
|
311
|
-
{
|
312
|
-
{
|
313
|
-
{
|
314
|
-
{
|
315
|
-
{
|
316
|
-
{
|
317
|
-
{
|
318
|
-
{
|
319
|
-
{
|
320
|
-
{
|
321
|
-
{
|
322
|
-
{
|
323
|
-
{
|
324
|
-
{
|
325
|
-
{
|
326
|
-
{
|
327
|
-
{
|
328
|
-
{
|
73
|
+
// Named entities and their code values, terminated by a {NULL, 0} sentinel.
// Only key and code are given here; next and hash start out zero and are
// filled in by cache_set(), which is why the table is not const.
static struct _slot entities[] = {
    {"AElig", 198},  // latin capital letter AE
    {"Aacute", 193},  // latin capital letter A with acute
    {"Acirc", 194},  // latin capital letter A with circumflex
    {"Agrave", 192},  // latin capital letter A with grave
    {"Alpha", 913},  // greek capital letter alpha, U+0391
    {"Aring", 197},  // latin capital letter A with ring above
    {"Atilde", 195},  // latin capital letter A with tilde
    {"Auml", 196},  // latin capital letter A with diaeresis
    {"Beta", 914},  // greek capital letter beta, U+0392
    {"Ccedil", 199},  // latin capital letter C with cedilla
    {"Chi", 935},  // greek capital letter chi, U+03A7
    {"Dagger", 8225},  // double dagger, U+2021 ISOpub
    {"Delta", 916},  // greek capital letter delta
    {"ETH", 208},  // latin capital letter ETH, U+00D0 ISOlat1
    {"Eacute", 201},  // latin capital letter E with acute
    {"Ecirc", 202},  // latin capital letter E with circumflex
    {"Egrave", 200},  // latin capital letter E with grave
    {"Epsilon", 917},  // greek capital letter epsilon, U+0395
    {"Eta", 919},  // greek capital letter eta, U+0397
    {"Euml", 203},  // latin capital letter E with diaeresis
    {"Gamma", 915},  // greek capital letter gamma
    {"Iacute", 205},  // latin capital letter I with acute
    {"Icirc", 206},  // latin capital letter I with circumflex
    {"Igrave", 204},  // latin capital letter I with grave
    {"Iota", 921},  // greek capital letter iota, U+0399
    {"Iuml", 207},  // latin capital letter I with diaeresis
    {"Kappa", 922},  // greek capital letter kappa, U+039A
    {"Lambda", 923},  // greek capital letter lambda
    {"Mu", 924},  // greek capital letter mu, U+039C
    {"Ntilde", 209},  // latin capital letter N with tilde
    {"Nu", 925},  // greek capital letter nu, U+039D
    {"OElig", 338},  // - latin capital ligature OE
    {"Oacute", 211},  // latin capital letter O with acute
    {"Ocirc", 212},  // latin capital letter O with circumflex
    {"Ograve", 210},  // latin capital letter O with grave
    {"Omega", 937},  // greek capital letter omega
    {"Omicron", 927},  // greek capital letter omicron, U+039F
    {"Oslash", 216},  // latin capital letter O with stroke
    {"Otilde", 213},  // latin capital letter O with tilde
    {"Ouml", 214},  // latin capital letter O with diaeresis
    {"Phi", 934},  // greek capital letter phi
    {"Pi", 928},  // greek capital letter pi, U+03A0 ISOgrk3
    {"Prime", 8243},  // double prime = seconds = inches
    {"Psi", 936},  // greek capital letter psi
    {"Rho", 929},  // greek capital letter rho, U+03A1
    {"Scaron", 352},  // - latin capital letter S with caron
    {"Sigma", 931},  // greek capital letter sigma
    {"THORN", 222},  // latin capital letter THORN
    {"Tau", 932},  // greek capital letter tau, U+03A4
    {"Theta", 920},  // greek capital letter theta
    {"Uacute", 218},  // latin capital letter U with acute
    {"Ucirc", 219},  // latin capital letter U with circumflex
    {"Ugrave", 217},  // latin capital letter U with grave
    {"Upsilon", 933},  // greek capital letter upsilon
    {"Uuml", 220},  // latin capital letter U with diaeresis
    {"Xi", 926},  // greek capital letter xi, U+039E ISOgrk3
    {"Yacute", 221},  // latin capital letter Y with acute
    {"Yuml", 376},  // - latin capital letter Y with diaeresis
    {"Zeta", 918},  // greek capital letter zeta, U+0396
    {"aacute", 225},  // latin small letter a with acute
    {"acirc", 226},  // latin small letter a with circumflex
    {"acute", 180},  // acute accent = spacing acute
    {"aelig", 230},  // latin small letter ae
    {"agrave", 224},  // latin small letter a with grave
    {"alefsym", 8501},  // alef symbol = first transfinite cardinal
    {"alpha", 945},  // greek small letter alpha
    {"amp", 38},  // -- ampersand, U+0026 ISOnum
    {"and", 8743},  // logical and = wedge, U+2227 ISOtech
    {"ang", 8736},  // angle, U+2220 ISOamso
    {"apos", 39},  // -- single quote
    {"aring", 229},  // latin small letter a with ring above
    {"asymp", 8776},  // almost equal to = asymptotic to
    {"atilde", 227},  // latin small letter a with tilde
    {"auml", 228},  // latin small letter a with diaeresis
    {"bdquo", 8222},  // double low-9 quotation mark, U+201E NEW
    {"beta", 946},  // greek small letter beta, U+03B2 ISOgrk3
    {"brvbar", 166},  // broken bar = broken vertical bar
    {"bull", 8226},  // bullet = black small circle
    {"cap", 8745},  // intersection = cap, U+2229 ISOtech
    {"ccedil", 231},  // latin small letter c with cedilla
    {"cedil", 184},  // cedilla = spacing cedilla, U+00B8 ISOdia
    {"cent", 162},  // cent sign, U+00A2 ISOnum
    {"chi", 967},  // greek small letter chi, U+03C7 ISOgrk3
    {"circ", 710},  // - modifier letter circumflex accent
    {"clubs", 9827},  // black club suit = shamrock
    {"cong", 8773},  // approximately equal to, U+2245 ISOtech
    {"copy", 169},  // copyright sign, U+00A9 ISOnum
    {"crarr", 8629},  // downwards arrow with corner leftwards
    {"cup", 8746},  // union = cup, U+222A ISOtech
    {"curren", 164},  // currency sign, U+00A4 ISOnum
    {"dArr", 8659},  // downwards double arrow, U+21D3 ISOamsa
    {"dagger", 8224},  // dagger, U+2020 ISOpub
    {"darr", 8595},  // downwards arrow, U+2193 ISOnum
    {"deg", 176},  // degree sign, U+00B0 ISOnum
    {"delta", 948},  // greek small letter delta
    {"diams", 9830},  // black diamond suit, U+2666 ISOpub
    {"divide", 247},  // division sign, U+00F7 ISOnum
    {"eacute", 233},  // latin small letter e with acute
    {"ecirc", 234},  // latin small letter e with circumflex
    {"egrave", 232},  // latin small letter e with grave
    {"empty", 8709},  // empty set = null set = diameter
    {"emsp", 8195},  // em space, U+2003 ISOpub
    {"ensp", 8194},  // en space, U+2002 ISOpub
    {"epsilon", 949},  // greek small letter epsilon
    {"equiv", 8801},  // identical to, U+2261 ISOtech
    {"eta", 951},  // greek small letter eta, U+03B7 ISOgrk3
    {"eth", 240},  // latin small letter eth, U+00F0 ISOlat1
    {"euml", 235},  // latin small letter e with diaeresis
    {"euro", 8364},  // - euro sign, U+20AC NEW
    {"exist", 8707},  // there exists, U+2203 ISOtech
    {"fnof", 402},  // latin small f with hook = function
    {"forall", 8704},  // for all, U+2200 ISOtech
    {"frac12", 189},  // vulgar fraction one half
    {"frac14", 188},  // vulgar fraction one quarter
    {"frac34", 190},  // vulgar fraction three quarters
    {"frasl", 8260},  // fraction slash, U+2044 NEW
    {"gamma", 947},  // greek small letter gamma
    {"ge", 8805},  // greater-than or equal to
    {"gt", 62},  // -- greater-than sign, U+003E ISOnum
    {"hArr", 8660},  // left right double arrow
    {"harr", 8596},  // left right arrow, U+2194 ISOamsa
    {"hearts", 9829},  // black heart suit = valentine
    {"hellip", 8230},  // horizontal ellipsis = three dot leader
    {"iacute", 237},  // latin small letter i with acute
    {"icirc", 238},  // latin small letter i with circumflex
    {"iexcl", 161},  // inverted exclamation mark, U+00A1 ISOnum
    {"igrave", 236},  // latin small letter i with grave
    {"image", 8465},  // blackletter capital I = imaginary part
    {"infin", 8734},  // infinity, U+221E ISOtech
    {"int", 8747},  // integral, U+222B ISOtech
    {"iota", 953},  // greek small letter iota, U+03B9 ISOgrk3
    {"iquest", 191},  // inverted question mark
    {"isin", 8712},  // element of, U+2208 ISOtech
    {"iuml", 239},  // latin small letter i with diaeresis
    {"kappa", 954},  // greek small letter kappa
    {"lArr", 8656},  // leftwards double arrow, U+21D0 ISOtech
    {"lambda", 955},  // greek small letter lambda
    {"lang", 9001},  // left-pointing angle bracket = bra
    {"laquo", 171},  // left-pointing double angle quotation mark
    {"larr", 8592},  // leftwards arrow, U+2190 ISOnum
    {"lceil", 8968},  // left ceiling = apl upstile
    {"ldquo", 8220},  // left double quotation mark
    {"le", 8804},  // less-than or equal to, U+2264 ISOtech
    {"lfloor", 8970},  // left floor = apl downstile
    {"lowast", 8727},  // asterisk operator, U+2217 ISOtech
    {"loz", 9674},  // lozenge, U+25CA ISOpub
    {"lrm", 8206},  // left-to-right mark, U+200E NEW RFC 2070
    {"lsaquo", 8249},  // single left-pointing angle quotation mark
    {"lsquo", 8216},  // left single quotation mark
    {"lt", 60},  // -- less-than sign, U+003C ISOnum
    {"macr", 175},  // macron = spacing macron = overline
    {"mdash", 8212},  // em dash, U+2014 ISOpub
    {"micro", 181},  // micro sign, U+00B5 ISOnum
    {"middot", 183},  // middle dot = Georgian comma
    {"minus", 8722},  // minus sign, U+2212 ISOtech
    {"mu", 956},  // greek small letter mu, U+03BC ISOgrk3
    {"nabla", 8711},  // nabla = backward difference
    {"nbsp", 160},  // no-break space = non-breaking space
    {"ndash", 8211},  // en dash, U+2013 ISOpub
    {"ne", 8800},  // not equal to, U+2260 ISOtech
    {"ni", 8715},  // contains as member, U+220B ISOtech
    {"not", 172},  // not sign, U+00AC ISOnum
    {"notin", 8713},  // not an element of, U+2209 ISOtech
    {"nsub", 8836},  // not a subset of, U+2284 ISOamsn
    {"ntilde", 241},  // latin small letter n with tilde
    {"nu", 957},  // greek small letter nu, U+03BD ISOgrk3
    {"oacute", 243},  // latin small letter o with acute
    {"ocirc", 244},  // latin small letter o with circumflex
    {"oelig", 339},  // - latin small ligature oe, U+0153 ISOlat2
    {"ograve", 242},  // latin small letter o with grave
    {"oline", 8254},  // overline = spacing overscore
    {"omega", 969},  // greek small letter omega
    {"omicron", 959},  // greek small letter omicron, U+03BF NEW
    {"oplus", 8853},  // circled plus = direct sum
    {"or", 8744},  // logical or = vee, U+2228 ISOtech
    {"ordf", 170},  // feminine ordinal indicator, U+00AA ISOnum
    {"ordm", 186},  // masculine ordinal indicator
    {"oslash", 248},  // latin small letter o with stroke
    {"otilde", 245},  // latin small letter o with tilde
    {"otimes", 8855},  // circled times = vector product
    {"ouml", 246},  // latin small letter o with diaeresis
    {"para", 182},  // pilcrow sign = paragraph sign
    {"part", 8706},  // partial differential, U+2202 ISOtech
    {"permil", 8240},  // per mille sign, U+2030 ISOtech
    {"perp", 8869},  // up tack = orthogonal to = perpendicular
    {"phi", 966},  // greek small letter phi, U+03C6 ISOgrk3
    {"pi", 960},  // greek small letter pi, U+03C0 ISOgrk3
    {"piv", 982},  // greek pi symbol, U+03D6 ISOgrk3
    {"plusmn", 177},  // plus-minus sign = plus-or-minus sign
    {"pound", 163},  // pound sign, U+00A3 ISOnum
    {"prime", 8242},  // prime = minutes = feet, U+2032 ISOtech
    {"prod", 8719},  // n-ary product = product sign
    {"prop", 8733},  // proportional to, U+221D ISOtech
    {"psi", 968},  // greek small letter psi, U+03C8 ISOgrk3
    {"quot", 34},  // -- quotation mark = APL quote
    {"rArr", 8658},  // rightwards double arrow
    {"radic", 8730},  // square root = radical sign
    {"rang", 9002},  // right-pointing angle bracket = ket
    {"raquo", 187},  // right-pointing double angle quotation mark
    {"rarr", 8594},  // rightwards arrow, U+2192 ISOnum
    {"rceil", 8969},  // right ceiling, U+2309 ISOamsc
    {"rdquo", 8221},  // right double quotation mark
    {"real", 8476},  // blackletter capital R = real part symbol
    {"reg", 174},  // registered sign = registered trade mark sign
    {"rfloor", 8971},  // right floor, U+230B ISOamsc
    {"rho", 961},  // greek small letter rho, U+03C1 ISOgrk3
    {"rlm", 8207},  // right-to-left mark, U+200F NEW RFC 2070
    {"rsaquo", 8250},  // single right-pointing angle quotation mark
    {"rsquo", 8217},  // right single quotation mark
    {"sbquo", 8218},  // single low-9 quotation mark, U+201A NEW
    {"scaron", 353},  // - latin small letter s with caron
    {"sdot", 8901},  // dot operator, U+22C5 ISOamsb
    {"sect", 167},  // section sign, U+00A7 ISOnum
    {"shy", 173},  // soft hyphen = discretionary hyphen
    {"sigma", 963},  // greek small letter sigma
    {"sigmaf", 962},  // greek small letter final sigma
    {"sim", 8764},  // tilde operator = varies with = similar to
    {"spades", 9824},  // black spade suit, U+2660 ISOpub
    {"sub", 8834},  // subset of, U+2282 ISOtech
    {"sube", 8838},  // subset of or equal to, U+2286 ISOtech
    {"sum", 8721},  // n-ary summation, U+2211 ISOamsb
    {"sup", 8835},  // superset of, U+2283 ISOtech
    {"sup1", 185},  // superscript one = superscript digit one
    {"sup2", 178},  // superscript two = superscript digit two
    {"sup3", 179},  // superscript three = superscript digit three
    {"supe", 8839},  // superset of or equal to
    {"szlig", 223},  // latin small letter sharp s = ess-zed
    {"tau", 964},  // greek small letter tau, U+03C4 ISOgrk3
    {"there4", 8756},  // therefore, U+2234 ISOtech
    {"theta", 952},  // greek small letter theta
    {"thetasym", 977},  // greek small letter theta symbol
    {"thinsp", 8201},  // thin space, U+2009 ISOpub
    {"thorn", 254},  // latin small letter thorn
    {"tilde", 732},  // - small tilde, U+02DC ISOdia
    {"times", 215},  // multiplication sign, U+00D7 ISOnum
    {"trade", 8482},  // trade mark sign, U+2122 ISOnum
    {"uArr", 8657},  // upwards double arrow, U+21D1 ISOamsa
    {"uacute", 250},  // latin small letter u with acute
    {"uarr", 8593},  // upwards arrow, U+2191 ISOnum
    {"ucirc", 251},  // latin small letter u with circumflex
    {"ugrave", 249},  // latin small letter u with grave
    {"uml", 168},  // diaeresis = spacing diaeresis
    {"upsih", 978},  // greek upsilon with hook symbol
    {"upsilon", 965},  // greek small letter upsilon
    {"uuml", 252},  // latin small letter u with diaeresis
    {"weierp", 8472},  // script capital P = power set
    {"xi", 958},  // greek small letter xi, U+03BE ISOgrk3
    {"yacute", 253},  // latin small letter y with acute
    {"yen", 165},  // yen sign = yuan sign, U+00A5 ISOnum
    {"yuml", 255},  // latin small letter y with diaeresis
    {"zeta", 950},  // greek small letter zeta, U+03B6 ISOgrk3
    {"zwj", 8205},  // zero width joiner, U+200D NEW RFC 2070
    {"zwnj", 8204},  // zero width non-joiner
    {NULL, 0},
};
|
330
329
|
|
331
|
-
static uint64_t
|
332
|
-
|
333
|
-
uint64_t h = 0;
|
330
|
+
// Computes the hash of a NUL-terminated key. A NULL or empty key hashes to 0.
//
// Each byte is OR-ed with 0x20 before use, so ASCII upper- and lower-case
// letters contribute the same value and the hash ignores letter case. The
// result is a base-77 polynomial over (byte | 0x20) - 0x2D; unsigned
// wrap-around on overflow is intended.
static uint64_t calc_hash(const char *key) {
    const uint8_t *p;
    uint64_t       sum = 0;

    if (NULL == key) {
        return sum;
    }
    p = (const uint8_t *)key;
    while (0 != *p) {
        // narrow to the most used range, 0x4D (77) values wide
        int folded = (*p | 0x20) - 0x2D;

        sum = 77 * sum + (uint64_t)folded;
        p++;
    }
    return sum;
}
|
345
343
|
|
346
|
-
static Slot*
|
347
|
-
get_bucketp(uint64_t h) {
|
344
|
+
static Slot *get_bucketp(uint64_t h) {
|
348
345
|
return entity_cache.buckets + (BUCKET_MASK & (h ^ (h << 5) ^ (h >> 7)));
|
349
346
|
}
|
350
347
|
|
351
|
-
static void
|
352
|
-
|
353
|
-
|
354
|
-
Slot *bucket = get_bucketp(h);
|
348
|
+
// Inserts slot s at the head of its bucket in entity_cache and records the
// key's hash in s->hash. Because insertion is at the head, a slot added later
// is found before earlier slots in the same bucket.
//
// The hash is kept as uint64_t, matching calc_hash()'s return type and the
// Slot.hash field, so no unsigned-to-signed conversion is involved.
static void cache_set(Slot s) {
    uint64_t h      = calc_hash(s->key);
    Slot    *bucket = get_bucketp(h);

    s->hash = h;
    s->next = *bucket;
    *bucket = s;
}
|
360
356
|
|
361
|
-
static Slot
|
362
|
-
|
363
|
-
|
364
|
-
Slot
|
365
|
-
Slot s;
|
357
|
+
static Slot cache_get(const char *key) {
|
358
|
+
int64_t h = calc_hash(key);
|
359
|
+
Slot *bucket = get_bucketp(h);
|
360
|
+
Slot s;
|
366
361
|
|
367
362
|
for (s = *bucket; NULL != s; s = s->next) {
|
368
|
-
|
369
|
-
|
370
|
-
|
363
|
+
if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
|
364
|
+
return s;
|
365
|
+
}
|
371
366
|
}
|
372
367
|
return NULL;
|
373
368
|
}
|
374
369
|
|
375
|
-
static void
|
376
|
-
|
377
|
-
Slot e = entities;
|
370
|
+
static void cache_init() {
|
371
|
+
Slot e = entities;
|
378
372
|
|
379
373
|
memset(&entity_cache, 0, sizeof(struct _cache));
|
380
374
|
for (; NULL != e->key; e++) {
|
381
|
-
|
375
|
+
cache_set(e);
|
382
376
|
}
|
383
377
|
inited = true;
|
384
378
|
}
|
385
379
|
|
386
|
-
char*
|
387
|
-
|
388
|
-
Slot s = entities;
|
380
|
+
char *ox_entity_lookup(char *text, const char *key) {
|
381
|
+
Slot s = entities;
|
389
382
|
|
390
383
|
if (!inited) {
|
391
|
-
|
384
|
+
cache_init();
|
392
385
|
}
|
393
386
|
if (NULL == (s = cache_get(key))) {
|
394
|
-
|
387
|
+
return NULL;
|
395
388
|
}
|
396
389
|
return ox_ucs_to_utf8_chars(text, s->code);
|
397
390
|
}
|