ox 2.12.1 → 2.13.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +16 -0
- data/ext/ox/parse.c +30 -24
- data/ext/ox/sax.c +22 -2
- data/ext/ox/special.c +345 -0
- data/ext/ox/special.h +1 -0
- data/lib/ox/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1ec1cf99d4e2684029f06dbf745c239ab00335d87017a4bd44fc792d908b9f33
|
4
|
+
data.tar.gz: 9f69f9506a7d83644c33b2c6658515fd0e1cfb8b4ce63e128fc8509a40fb31a0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f296b41ddf11021ed71a4d61acd68a78741ef4cd0c7f23efa5be9ebef6254dbe60b749df0d4a9377d80d94d0ba46fff36fd283ae4111a58deea32ed0062a4cdc
|
7
|
+
data.tar.gz: c31bcbfc3271f92000c16e357a5aef34fbc105aa3afbc608d9a934f7d2c1c1f6d8af32de02250edf48032f5262bfd0f9b17316031a43f41e6ce5e10c8e1024a9
|
data/CHANGELOG.md
CHANGED
@@ -4,6 +4,22 @@ All changes to the Ox gem are documented here. Releases follow semantic versioni
|
|
4
4
|
|
5
5
|
## [Unreleased]
|
6
6
|
|
7
|
+
## [2.13.1] - 2020-01-30
|
8
|
+
|
9
|
+
HTML Sequences
|
10
|
+
|
11
|
+
### Added
|
12
|
+
|
13
|
+
- All HTML 4 sequences are now supported.
|
14
|
+
|
15
|
+
## [2.13.0] - 2020-01-25
|
16
|
+
|
17
|
+
HTML Escape Sequences
|
18
|
+
|
19
|
+
### Added
|
20
|
+
|
21
|
+
- All HTML 4 escape sequences are now parsed.
|
22
|
+
|
7
23
|
## [2.12.1] - 2020-01-05
|
8
24
|
|
9
25
|
Ruby 2.7.0
|
data/ext/ox/parse.c
CHANGED
@@ -1000,11 +1000,13 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1000
1000
|
char *b, buf[32];
|
1001
1001
|
char *end = buf + sizeof(buf) - 1;
|
1002
1002
|
char *s;
|
1003
|
+
long blen = 0;
|
1003
1004
|
|
1004
1005
|
for (b = buf, s = pi->s; b < end; b++, s++) {
|
1005
1006
|
*b = *s;
|
1006
1007
|
if (';' == *s) {
|
1007
1008
|
*(b + 1) = '\0';
|
1009
|
+
blen = b - buf;
|
1008
1010
|
s++;
|
1009
1011
|
break;
|
1010
1012
|
}
|
@@ -1047,30 +1049,20 @@ read_coded_chars(PInfo pi, char *text) {
|
|
1047
1049
|
} else {
|
1048
1050
|
/*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
|
1049
1051
|
set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
|
1050
|
-
return
|
1052
|
+
return NULL;
|
1051
1053
|
}
|
1052
1054
|
pi->s = s;
|
1053
1055
|
}
|
1054
|
-
} else if (0 == strcasecmp(buf, "nbsp;")) {
|
1055
|
-
pi->s = s;
|
1056
|
-
*text++ = ' ';
|
1057
|
-
} else if (0 == strcasecmp(buf, "lt;")) {
|
1058
|
-
pi->s = s;
|
1059
|
-
*text++ = '<';
|
1060
|
-
} else if (0 == strcasecmp(buf, "gt;")) {
|
1061
|
-
pi->s = s;
|
1062
|
-
*text++ = '>';
|
1063
|
-
} else if (0 == strcasecmp(buf, "amp;")) {
|
1064
|
-
pi->s = s;
|
1065
|
-
*text++ = '&';
|
1066
|
-
} else if (0 == strcasecmp(buf, "quot;")) {
|
1067
|
-
pi->s = s;
|
1068
|
-
*text++ = '"';
|
1069
|
-
} else if (0 == strcasecmp(buf, "apos;")) {
|
1070
|
-
pi->s = s;
|
1071
|
-
*text++ = '\'';
|
1072
1056
|
} else {
|
1073
|
-
*
|
1057
|
+
char *t2;
|
1058
|
+
|
1059
|
+
buf[blen] = '\0';
|
1060
|
+
if (NULL == (t2 = ox_entity_lookup(text, buf))) {
|
1061
|
+
*text++ = '&';
|
1062
|
+
} else {
|
1063
|
+
text = t2;
|
1064
|
+
pi->s = s;
|
1065
|
+
}
|
1074
1066
|
}
|
1075
1067
|
return text;
|
1076
1068
|
}
|
@@ -1153,16 +1145,30 @@ collapse_special(PInfo pi, char *str) {
|
|
1153
1145
|
*b++ = '&';
|
1154
1146
|
continue;
|
1155
1147
|
} else {
|
1156
|
-
|
1148
|
+
char key[16];
|
1149
|
+
char *k = key;
|
1150
|
+
char *kend = key + sizeof(key) - 1;
|
1151
|
+
|
1152
|
+
*k++ = *s;
|
1157
1153
|
while (';' != *s++) {
|
1158
1154
|
if ('\0' == *s) {
|
1159
1155
|
set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
|
1160
1156
|
return EDOM;
|
1161
1157
|
}
|
1158
|
+
if (kend <= k) {
|
1159
|
+
k = key;
|
1160
|
+
break;
|
1161
|
+
}
|
1162
|
+
*k++ = *s;
|
1162
1163
|
}
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1164
|
+
k--;
|
1165
|
+
*k = '\0';
|
1166
|
+
if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
|
1167
|
+
set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
|
1168
|
+
c = '?';
|
1169
|
+
return 0;
|
1170
|
+
}
|
1171
|
+
continue;
|
1166
1172
|
}
|
1167
1173
|
*b++ = (char)c;
|
1168
1174
|
}
|
data/ext/ox/sax.c
CHANGED
@@ -1668,8 +1668,28 @@ ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
|
|
1668
1668
|
c = '\'';
|
1669
1669
|
s += 5;
|
1670
1670
|
} else {
|
1671
|
-
|
1672
|
-
|
1671
|
+
char key[16];
|
1672
|
+
char *k = key;
|
1673
|
+
char *kend = key + sizeof(key) - 1;
|
1674
|
+
char *bn;
|
1675
|
+
char *s2 = s;
|
1676
|
+
|
1677
|
+
for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
|
1678
|
+
if (kend <= k) {
|
1679
|
+
k = key;
|
1680
|
+
break;
|
1681
|
+
}
|
1682
|
+
*k = *s2;
|
1683
|
+
}
|
1684
|
+
*k = '\0';
|
1685
|
+
if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
|
1686
|
+
ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
|
1687
|
+
c = '&';
|
1688
|
+
} else {
|
1689
|
+
b = bn;
|
1690
|
+
s = s2 + 1;
|
1691
|
+
continue;
|
1692
|
+
}
|
1673
1693
|
}
|
1674
1694
|
*b++ = (char)c;
|
1675
1695
|
col++;
|
data/ext/ox/special.c
CHANGED
@@ -3,6 +3,9 @@
|
|
3
3
|
* All rights reserved.
|
4
4
|
*/
|
5
5
|
|
6
|
+
#include <string.h>
|
7
|
+
#include <stdbool.h>
|
8
|
+
|
6
9
|
#include "special.h"
|
7
10
|
|
8
11
|
/*
|
@@ -49,3 +52,345 @@ ox_ucs_to_utf8_chars(char *text, uint64_t u) {
|
|
49
52
|
}
|
50
53
|
return text;
|
51
54
|
}
|
55
|
+
|
56
|
+
// Number of buckets in the entity cache. BUCKET_MASK is ANDed with a mixed
// hash to pick a bucket, so BUCKET_SIZE has to remain a power of two.
#define BUCKET_SIZE	256
#define BUCKET_MASK	(BUCKET_SIZE - 1)

// One named character entity. The entities[] table initializes key and code
// positionally, so the order of the first two members must not change.
typedef struct _slot {
    const char		*key;	// entity name as it appears in the entities[] table
    uint64_t		code;	// value passed to ox_ucs_to_utf8_chars() on a hit
    struct _slot	*next;	// next slot chained in the same bucket, or NULL
    uint64_t		hash;	// calc_hash(key), filled in by cache_set()
} *Slot;

// Fixed size, chained hash table of Slot pointers.
typedef struct _cache {
    Slot	buckets[BUCKET_SIZE];
} *Cache;

// The single entity cache; populated by cache_init().
static struct _cache	entity_cache;
// Set to true by cache_init() once entity_cache has been filled.
static bool		inited = false;
|
72
|
+
|
73
|
+
// HTML entities such as &amp;. This is a complete list from the HTML 4 spec.
|
74
|
+
static struct _slot entities[] = {
|
75
|
+
{ "AElig", 198 }, // latin capital letter AE
|
76
|
+
{ "Aacute", 193 }, // latin capital letter A with acute
|
77
|
+
{ "Acirc", 194 }, // latin capital letter A with circumflex
|
78
|
+
{ "Agrave", 192 }, // latin capital letter A with grave
|
79
|
+
{ "Alpha", 913 }, // greek capital letter alpha, U+0391
|
80
|
+
{ "Aring", 197 }, // latin capital letter A with ring above
|
81
|
+
{ "Atilde", 195 }, // latin capital letter A with tilde
|
82
|
+
{ "Auml", 196 }, // latin capital letter A with diaeresis
|
83
|
+
{ "Beta", 914 }, // greek capital letter beta, U+0392
|
84
|
+
{ "Ccedil", 199 }, // latin capital letter C with cedilla
|
85
|
+
{ "Chi", 935 }, // greek capital letter chi, U+03A7
|
86
|
+
{ "Dagger", 8225 }, // double dagger, U+2021 ISOpub
|
87
|
+
{ "Delta", 916 }, // greek capital letter delta
|
88
|
+
{ "ETH", 208 }, // latin capital letter ETH, U+00D0 ISOlat1
|
89
|
+
{ "Eacute", 201 }, // latin capital letter E with acute
|
90
|
+
{ "Ecirc", 202 }, // latin capital letter E with circumflex
|
91
|
+
{ "Egrave", 200 }, // latin capital letter E with grave
|
92
|
+
{ "Epsilon", 917 }, // greek capital letter epsilon, U+0395
|
93
|
+
{ "Eta", 919 }, // greek capital letter eta, U+0397
|
94
|
+
{ "Euml", 203 }, // latin capital letter E with diaeresis
|
95
|
+
{ "Gamma", 915 }, // greek capital letter gamma
|
96
|
+
{ "Iacute", 205 }, // latin capital letter I with acute
|
97
|
+
{ "Icirc", 206 }, // latin capital letter I with circumflex
|
98
|
+
{ "Igrave", 204 }, // latin capital letter I with grave
|
99
|
+
{ "Iota", 921 }, // greek capital letter iota, U+0399
|
100
|
+
{ "Iuml", 207 }, // latin capital letter I with diaeresis
|
101
|
+
{ "Kappa", 922 }, // greek capital letter kappa, U+039A
|
102
|
+
{ "Lambda", 923 }, // greek capital letter lambda
|
103
|
+
{ "Mu", 924 }, // greek capital letter mu, U+039C
|
104
|
+
{ "Ntilde", 209 }, // latin capital letter N with tilde
|
105
|
+
{ "Nu", 925 }, // greek capital letter nu, U+039D
|
106
|
+
{ "OElig", 338 }, // - latin capital ligature OE
|
107
|
+
{ "Oacute", 211 }, // latin capital letter O with acute
|
108
|
+
{ "Ocirc", 212 }, // latin capital letter O with circumflex
|
109
|
+
{ "Ograve", 210 }, // latin capital letter O with grave
|
110
|
+
{ "Omega", 937 }, // greek capital letter omega
|
111
|
+
{ "Omicron", 927 }, // greek capital letter omicron, U+039F
|
112
|
+
{ "Oslash", 216 }, // latin capital letter O with stroke
|
113
|
+
{ "Otilde", 213 }, // latin capital letter O with tilde
|
114
|
+
{ "Ouml", 214 }, // latin capital letter O with diaeresis
|
115
|
+
{ "Phi", 934 }, // greek capital letter phi
|
116
|
+
{ "Pi", 928 }, // greek capital letter pi, U+03A0 ISOgrk3
|
117
|
+
{ "Prime", 8243 }, // double prime = seconds = inches
|
118
|
+
{ "Psi", 936 }, // greek capital letter psi
|
119
|
+
{ "Rho", 929 }, // greek capital letter rho, U+03A1
|
120
|
+
{ "Scaron", 352 }, // - latin capital letter S with caron
|
121
|
+
{ "Sigma", 931 }, // greek capital letter sigma
|
122
|
+
{ "THORN", 222 }, // latin capital letter THORN
|
123
|
+
{ "Tau", 932 }, // greek capital letter tau, U+03A4
|
124
|
+
{ "Theta", 920 }, // greek capital letter theta
|
125
|
+
{ "Uacute", 218 }, // latin capital letter U with acute
|
126
|
+
{ "Ucirc", 219 }, // latin capital letter U with circumflex
|
127
|
+
{ "Ugrave", 217 }, // latin capital letter U with grave
|
128
|
+
{ "Upsilon", 933 }, // greek capital letter upsilon
|
129
|
+
{ "Uuml", 220 }, // latin capital letter U with diaeresis
|
130
|
+
{ "Xi", 926 }, // greek capital letter xi, U+039E ISOgrk3
|
131
|
+
{ "Yacute", 221 }, // latin capital letter Y with acute
|
132
|
+
{ "Yuml", 376 }, // - latin capital letter Y with diaeresis
|
133
|
+
{ "Zeta", 918 }, // greek capital letter zeta, U+0396
|
134
|
+
{ "aacute", 225 }, // latin small letter a with acute
|
135
|
+
{ "acirc", 226 }, // latin small letter a with circumflex
|
136
|
+
{ "acute", 180 }, // acute accent = spacing acute
|
137
|
+
{ "aelig", 230 }, // latin small letter ae
|
138
|
+
{ "agrave", 224 }, // latin small letter a with grave
|
139
|
+
{ "alefsym", 8501 }, // alef symbol = first transfinite cardinal
|
140
|
+
{ "alpha", 945 }, // greek small letter alpha
|
141
|
+
{ "amp", 38 }, // -- ampersand, U+0026 ISOnum
|
142
|
+
{ "and", 8743 }, // logical and = wedge, U+2227 ISOtech
|
143
|
+
{ "ang", 8736 }, // angle, U+2220 ISOamso
|
144
|
+
{ "aring", 229 }, // latin small letter a with ring above
|
145
|
+
{ "asymp", 8776 }, // almost equal to = asymptotic to
|
146
|
+
{ "atilde", 227 }, // latin small letter a with tilde
|
147
|
+
{ "auml", 228 }, // latin small letter a with diaeresis
|
148
|
+
{ "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW
|
149
|
+
{ "beta", 946 }, // greek small letter beta, U+03B2 ISOgrk3
|
150
|
+
{ "brvbar", 166 }, // broken bar = broken vertical bar
|
151
|
+
{ "bull", 8226 }, // bullet = black small circle
|
152
|
+
{ "cap", 8745 }, // intersection = cap, U+2229 ISOtech
|
153
|
+
{ "ccedil", 231 }, // latin small letter c with cedilla
|
154
|
+
{ "cedil", 184 }, // cedilla = spacing cedilla, U+00B8 ISOdia
|
155
|
+
{ "cent", 162 }, // cent sign, U+00A2 ISOnum
|
156
|
+
{ "chi", 967 }, // greek small letter chi, U+03C7 ISOgrk3
|
157
|
+
{ "circ", 710 }, // - modifier letter circumflex accent
|
158
|
+
{ "clubs", 9827 }, // black club suit = shamrock
|
159
|
+
{ "cong", 8773 }, // approximately equal to, U+2245 ISOtech
|
160
|
+
{ "copy", 169 }, // copyright sign, U+00A9 ISOnum
|
161
|
+
{ "crarr", 8629 }, // downwards arrow with corner leftwards
|
162
|
+
{ "cup", 8746 }, // union = cup, U+222A ISOtech
|
163
|
+
{ "curren", 164 }, // currency sign, U+00A4 ISOnum
|
164
|
+
{ "dArr", 8659 }, // downwards double arrow, U+21D3 ISOamsa
|
165
|
+
{ "dagger", 8224 }, // dagger, U+2020 ISOpub
|
166
|
+
{ "darr", 8595 }, // downwards arrow, U+2193 ISOnum
|
167
|
+
{ "deg", 176 }, // degree sign, U+00B0 ISOnum
|
168
|
+
{ "delta", 948 }, // greek small letter delta
|
169
|
+
{ "diams", 9830 }, // black diamond suit, U+2666 ISOpub
|
170
|
+
{ "divide", 247 }, // division sign, U+00F7 ISOnum
|
171
|
+
{ "eacute", 233 }, // latin small letter e with acute
|
172
|
+
{ "ecirc", 234 }, // latin small letter e with circumflex
|
173
|
+
{ "egrave", 232 }, // latin small letter e with grave
|
174
|
+
{ "empty", 8709 }, // empty set = null set = diameter
|
175
|
+
{ "emsp", 8195 }, // em space, U+2003 ISOpub
|
176
|
+
{ "ensp", 8194 }, // en space, U+2002 ISOpub
|
177
|
+
{ "epsilon", 949 }, // greek small letter epsilon
|
178
|
+
{ "equiv", 8801 }, // identical to, U+2261 ISOtech
|
179
|
+
{ "eta", 951 }, // greek small letter eta, U+03B7 ISOgrk3
|
180
|
+
{ "eth", 240 }, // latin small letter eth, U+00F0 ISOlat1
|
181
|
+
{ "euml", 235 }, // latin small letter e with diaeresis
|
182
|
+
{ "euro", 8364 }, // - euro sign, U+20AC NEW
|
183
|
+
{ "exist", 8707 }, // there exists, U+2203 ISOtech
|
184
|
+
{ "fnof", 402 }, // latin small f with hook = function
|
185
|
+
{ "forall", 8704 }, // for all, U+2200 ISOtech
|
186
|
+
{ "frac12", 189 }, // vulgar fraction one half
|
187
|
+
{ "frac14", 188 }, // vulgar fraction one quarter
|
188
|
+
{ "frac34", 190 }, // vulgar fraction three quarters
|
189
|
+
{ "frasl", 8260 }, // fraction slash, U+2044 NEW
|
190
|
+
{ "gamma", 947 }, // greek small letter gamma
|
191
|
+
{ "ge", 8805 }, // greater-than or equal to
|
192
|
+
{ "gt", 62 }, // -- greater-than sign, U+003E ISOnum
|
193
|
+
{ "hArr", 8660 }, // left right double arrow
|
194
|
+
{ "harr", 8596 }, // left right arrow, U+2194 ISOamsa
|
195
|
+
{ "hearts", 9829 }, // black heart suit = valentine
|
196
|
+
{ "hellip", 8230 }, // horizontal ellipsis = three dot leader
|
197
|
+
{ "iacute", 237 }, // latin small letter i with acute
|
198
|
+
{ "icirc", 238 }, // latin small letter i with circumflex
|
199
|
+
{ "iexcl", 161 }, // inverted exclamation mark, U+00A1 ISOnum
|
200
|
+
{ "igrave", 236 }, // latin small letter i with grave
|
201
|
+
{ "image", 8465 }, // blackletter capital I = imaginary part
|
202
|
+
{ "infin", 8734 }, // infinity, U+221E ISOtech
|
203
|
+
{ "int", 8747 }, // integral, U+222B ISOtech
|
204
|
+
{ "iota", 953 }, // greek small letter iota, U+03B9 ISOgrk3
|
205
|
+
{ "iquest", 191 }, // inverted question mark
|
206
|
+
{ "isin", 8712 }, // element of, U+2208 ISOtech
|
207
|
+
{ "iuml", 239 }, // latin small letter i with diaeresis
|
208
|
+
{ "kappa", 954 }, // greek small letter kappa
|
209
|
+
{ "lArr", 8656 }, // leftwards double arrow, U+21D0 ISOtech
|
210
|
+
{ "lambda", 955 }, // greek small letter lambda
|
211
|
+
{ "lang", 9001 }, // left-pointing angle bracket = bra
|
212
|
+
{ "laquo", 171 }, // left-pointing double angle quotation mark
|
213
|
+
{ "larr", 8592 }, // leftwards arrow, U+2190 ISOnum
|
214
|
+
{ "lceil", 8968 }, // left ceiling = apl upstile
|
215
|
+
{ "ldquo", 8220 }, // left double quotation mark
|
216
|
+
{ "le", 8804 }, // less-than or equal to, U+2264 ISOtech
|
217
|
+
{ "lfloor", 8970 }, // left floor = apl downstile
|
218
|
+
{ "lowast", 8727 }, // asterisk operator, U+2217 ISOtech
|
219
|
+
{ "loz", 9674 }, // lozenge, U+25CA ISOpub
|
220
|
+
{ "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070
|
221
|
+
{ "lsaquo", 8249 }, // single left-pointing angle quotation mark
|
222
|
+
{ "lsquo", 8216 }, // left single quotation mark
|
223
|
+
{ "lt", 60 }, // -- less-than sign, U+003C ISOnum
|
224
|
+
{ "macr", 175 }, // macron = spacing macron = overline
|
225
|
+
{ "mdash", 8212 }, // em dash, U+2014 ISOpub
|
226
|
+
{ "micro", 181 }, // micro sign, U+00B5 ISOnum
|
227
|
+
{ "middot", 183 }, // middle dot = Georgian comma
|
228
|
+
{ "minus", 8722 }, // minus sign, U+2212 ISOtech
|
229
|
+
{ "mu", 956 }, // greek small letter mu, U+03BC ISOgrk3
|
230
|
+
{ "nabla", 8711 }, // nabla = backward difference
|
231
|
+
{ "nbsp", 160 }, // no-break space = non-breaking space
|
232
|
+
{ "ndash", 8211 }, // en dash, U+2013 ISOpub
|
233
|
+
{ "ne", 8800 }, // not equal to, U+2260 ISOtech
|
234
|
+
{ "ni", 8715 }, // contains as member, U+220B ISOtech
|
235
|
+
{ "not", 172 }, // not sign, U+00AC ISOnum
|
236
|
+
{ "notin", 8713 }, // not an element of, U+2209 ISOtech
|
237
|
+
{ "nsub", 8836 }, // not a subset of, U+2284 ISOamsn
|
238
|
+
{ "ntilde", 241 }, // latin small letter n with tilde
|
239
|
+
{ "nu", 957 }, // greek small letter nu, U+03BD ISOgrk3
|
240
|
+
{ "oacute", 243 }, // latin small letter o with acute
|
241
|
+
{ "ocirc", 244 }, // latin small letter o with circumflex
|
242
|
+
{ "oelig", 339 }, // - latin small ligature oe, U+0153 ISOlat2
|
243
|
+
{ "ograve", 242 }, // latin small letter o with grave
|
244
|
+
{ "oline", 8254 }, // overline = spacing overscore
|
245
|
+
{ "omega", 969 }, // greek small letter omega
|
246
|
+
{ "omicron", 959 }, // greek small letter omicron, U+03BF NEW
|
247
|
+
{ "oplus", 8853 }, // circled plus = direct sum
|
248
|
+
{ "or", 8744 }, // logical or = vee, U+2228 ISOtech
|
249
|
+
{ "ordf", 170 }, // feminine ordinal indicator, U+00AA ISOnum
|
250
|
+
{ "ordm", 186 }, // masculine ordinal indicator
|
251
|
+
{ "oslash", 248 }, // latin small letter o with stroke
|
252
|
+
{ "otilde", 245 }, // latin small letter o with tilde
|
253
|
+
{ "otimes", 8855 }, // circled times = vector product
|
254
|
+
{ "ouml", 246 }, // latin small letter o with diaeresis
|
255
|
+
{ "para", 182 }, // pilcrow sign = paragraph sign
|
256
|
+
{ "part", 8706 }, // partial differential, U+2202 ISOtech
|
257
|
+
{ "permil", 8240 }, // per mille sign, U+2030 ISOtech
|
258
|
+
{ "perp", 8869 }, // up tack = orthogonal to = perpendicular
|
259
|
+
{ "phi", 966 }, // greek small letter phi, U+03C6 ISOgrk3
|
260
|
+
{ "pi", 960 }, // greek small letter pi, U+03C0 ISOgrk3
|
261
|
+
{ "piv", 982 }, // greek pi symbol, U+03D6 ISOgrk3
|
262
|
+
{ "plusmn", 177 }, // plus-minus sign = plus-or-minus sign
|
263
|
+
{ "pound", 163 }, // pound sign, U+00A3 ISOnum
|
264
|
+
{ "prime", 8242 }, // prime = minutes = feet, U+2032 ISOtech
|
265
|
+
{ "prod", 8719 }, // n-ary product = product sign
|
266
|
+
{ "prop", 8733 }, // proportional to, U+221D ISOtech
|
267
|
+
{ "psi", 968 }, // greek small letter psi, U+03C8 ISOgrk3
|
268
|
+
{ "quot", 34 }, // -- quotation mark = APL quote
|
269
|
+
{ "rArr", 8658 }, // rightwards double arrow
|
270
|
+
{ "radic", 8730 }, // square root = radical sign
|
271
|
+
{ "rang", 9002 }, // right-pointing angle bracket = ket
|
272
|
+
{ "raquo", 187 }, // right-pointing double angle quotation mark
|
273
|
+
{ "rarr", 8594 }, // rightwards arrow, U+2192 ISOnum
|
274
|
+
{ "rceil", 8969 }, // right ceiling, U+2309 ISOamsc
|
275
|
+
{ "rdquo", 8221 }, // right double quotation mark
|
276
|
+
{ "real", 8476 }, // blackletter capital R = real part symbol
|
277
|
+
{ "reg", 174 }, // registered sign = registered trade mark sign
|
278
|
+
{ "rfloor", 8971 }, // right floor, U+230B ISOamsc
|
279
|
+
{ "rho", 961 }, // greek small letter rho, U+03C1 ISOgrk3
|
280
|
+
{ "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070
|
281
|
+
{ "rsaquo", 8250 }, // single right-pointing angle quotation mark
|
282
|
+
{ "rsquo", 8217 }, // right single quotation mark
|
283
|
+
{ "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW
|
284
|
+
{ "scaron", 353 }, // - latin small letter s with caron
|
285
|
+
{ "sdot", 8901 }, // dot operator, U+22C5 ISOamsb
|
286
|
+
{ "sect", 167 }, // section sign, U+00A7 ISOnum
|
287
|
+
{ "shy", 173 }, // soft hyphen = discretionary hyphen
|
288
|
+
{ "sigma", 963 }, // greek small letter sigma
|
289
|
+
{ "sigmaf", 962 }, // greek small letter final sigma
|
290
|
+
{ "sim", 8764 }, // tilde operator = varies with = similar to
|
291
|
+
{ "spades", 9824 }, // black spade suit, U+2660 ISOpub
|
292
|
+
{ "sub", 8834 }, // subset of, U+2282 ISOtech
|
293
|
+
{ "sube", 8838 }, // subset of or equal to, U+2286 ISOtech
|
294
|
+
{ "sum", 8721 }, // n-ary summation, U+2211 ISOamsb
|
295
|
+
{ "sup", 8835 }, // superset of, U+2283 ISOtech
|
296
|
+
{ "sup1", 185 }, // superscript one = superscript digit one
|
297
|
+
{ "sup2", 178 }, // superscript two = superscript digit two
|
298
|
+
{ "sup3", 179 }, // superscript three = superscript digit three
|
299
|
+
{ "supe", 8839 }, // superset of or equal to
|
300
|
+
{ "szlig", 223 }, // latin small letter sharp s = ess-zed
|
301
|
+
{ "tau", 964 }, // greek small letter tau, U+03C4 ISOgrk3
|
302
|
+
{ "there4", 8756 }, // therefore, U+2234 ISOtech
|
303
|
+
{ "theta", 952 }, // greek small letter theta
|
304
|
+
{ "thetasym", 977 }, // greek small letter theta symbol
|
305
|
+
{ "thinsp", 8201 }, // thin space, U+2009 ISOpub
|
306
|
+
{ "thorn", 254 }, // latin small letter thorn
|
307
|
+
{ "tilde", 732 }, // - small tilde, U+02DC ISOdia
|
308
|
+
{ "times", 215 }, // multiplication sign, U+00D7 ISOnum
|
309
|
+
{ "trade", 8482 }, // trade mark sign, U+2122 ISOnum
|
310
|
+
{ "uArr", 8657 }, // upwards double arrow, U+21D1 ISOamsa
|
311
|
+
{ "uacute", 250 }, // latin small letter u with acute
|
312
|
+
{ "uarr", 8593 }, // upwards arrow, U+2191 ISOnum
|
313
|
+
{ "ucirc", 251 }, // latin small letter u with circumflex
|
314
|
+
{ "ugrave", 249 }, // latin small letter u with grave
|
315
|
+
{ "uml", 168 }, // diaeresis = spacing diaeresis
|
316
|
+
{ "upsih", 978 }, // greek upsilon with hook symbol
|
317
|
+
{ "upsilon", 965 }, // greek small letter upsilon
|
318
|
+
{ "uuml", 252 }, // latin small letter u with diaeresis
|
319
|
+
{ "weierp", 8472 }, // script capital P = power set
|
320
|
+
{ "xi", 958 }, // greek small letter xi, U+03BE ISOgrk3
|
321
|
+
{ "yacute", 253 }, // latin small letter y with acute
|
322
|
+
{ "yen", 165 }, // yen sign = yuan sign, U+00A5 ISOnum
|
323
|
+
{ "yuml", 255 }, // latin small letter y with diaeresis
|
324
|
+
{ "zeta", 950 }, // greek small letter zeta, U+03B6 ISOgrk3
|
325
|
+
{ "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070
|
326
|
+
{ "zwnj", 8204 }, // zero width non-joiner
|
327
|
+
{ NULL, 0 },
|
328
|
+
};
|
329
|
+
|
330
|
+
// Computes a base-77 polynomial hash of the NUL terminated string key.
// Each byte is OR-ed with 0x20 before use, so ASCII upper and lower case
// letters contribute the same value and keys differing only in case hash
// alike. A NULL or empty key hashes to 0.
static uint64_t
calc_hash(const char *key) {
    const uint8_t	*p = (const uint8_t*)key;
    uint64_t		sum = 0;

    if (NULL == key) {
        return 0;
    }
    while (0 != *p) {
        // narrow to most used range of 0x4D (77) in size
        sum = sum * 77 + ((*p | 0x20) - 0x2D);
        p++;
    }
    return sum;
}
|
344
|
+
|
345
|
+
static Slot*
|
346
|
+
get_bucketp(uint64_t h) {
|
347
|
+
return entity_cache.buckets + (BUCKET_MASK & (h ^ (h << 5) ^ (h >> 7)));
|
348
|
+
}
|
349
|
+
|
350
|
+
static void
|
351
|
+
cache_set(Slot s) {
|
352
|
+
int64_t h = calc_hash(s->key);
|
353
|
+
Slot *bucket = get_bucketp(h);
|
354
|
+
|
355
|
+
s->hash = h;
|
356
|
+
s->next = *bucket;
|
357
|
+
*bucket = s;
|
358
|
+
}
|
359
|
+
|
360
|
+
static Slot
|
361
|
+
cache_get(const char *key) {
|
362
|
+
int64_t h = calc_hash(key);
|
363
|
+
Slot *bucket = get_bucketp(h);
|
364
|
+
Slot s;
|
365
|
+
|
366
|
+
for (s = *bucket; NULL != s; s = s->next) {
|
367
|
+
if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
|
368
|
+
return s;
|
369
|
+
}
|
370
|
+
}
|
371
|
+
return NULL;
|
372
|
+
}
|
373
|
+
|
374
|
+
static void
|
375
|
+
cache_init() {
|
376
|
+
Slot e = entities;
|
377
|
+
|
378
|
+
memset(&entity_cache, 0, sizeof(struct _cache));
|
379
|
+
for (; NULL != e->key; e++) {
|
380
|
+
cache_set(e);
|
381
|
+
}
|
382
|
+
inited = true;
|
383
|
+
}
|
384
|
+
|
385
|
+
char*
|
386
|
+
ox_entity_lookup(char *text, const char *key) {
|
387
|
+
Slot s = entities;
|
388
|
+
|
389
|
+
if (!inited) {
|
390
|
+
cache_init();
|
391
|
+
}
|
392
|
+
if (NULL == (s = cache_get(key))) {
|
393
|
+
return NULL;
|
394
|
+
}
|
395
|
+
return ox_ucs_to_utf8_chars(text, s->code);
|
396
|
+
}
|
data/ext/ox/special.h
CHANGED
data/lib/ox/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ox
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.12.1
|
4
|
+
version: 2.13.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Peter Ohler
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-01-
|
11
|
+
date: 2020-01-30 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description: "A fast XML parser and object serializer that uses only standard C lib.\n\nOptimized
|
14
14
|
XML (Ox), as the name implies was written to provide speed optimized\nXML handling.
|