ox 2.12.1 → 2.13.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d2959b76c94fd92a6234a9cf211f27263ac36da3e0b74bc724edc2f278585a4
4
- data.tar.gz: eae845e3147144ee70d3b8f41603295c962c00a202282af8b0bd42cacde02ade
3
+ metadata.gz: 1ec1cf99d4e2684029f06dbf745c239ab00335d87017a4bd44fc792d908b9f33
4
+ data.tar.gz: 9f69f9506a7d83644c33b2c6658515fd0e1cfb8b4ce63e128fc8509a40fb31a0
5
5
  SHA512:
6
- metadata.gz: fe53b32f1daae7e8444ce0efa476f3b69fee96b72b5f61dcda51aca3769bac24a2edbfc7aae914bcb073e845af7487d7004fec0b8c6408bee1b3515c6bff6315
7
- data.tar.gz: ee673dbd381c0983cf592090c7bf01d99cbd23d9c5a2740892c9db688bf294824e4510ceff4bb31a0e340c13db8b78bb09badae1186c1929edd27ec59b8da9b2
6
+ metadata.gz: f296b41ddf11021ed71a4d61acd68a78741ef4cd0c7f23efa5be9ebef6254dbe60b749df0d4a9377d80d94d0ba46fff36fd283ae4111a58deea32ed0062a4cdc
7
+ data.tar.gz: c31bcbfc3271f92000c16e357a5aef34fbc105aa3afbc608d9a934f7d2c1c1f6d8af32de02250edf48032f5262bfd0f9b17316031a43f41e6ce5e10c8e1024a9
@@ -4,6 +4,22 @@ All changes to the Ox gem are documented here. Releases follow semantic versioni
4
4
 
5
5
  ## [Unreleased]
6
6
 
7
+ ## [2.13.1] - 2020-01-30
8
+
9
+ HTML Sequences
10
+
11
+ ### Added
12
+
13
+ - All HTML 4 sequences are now supported.
14
+
15
+ ## [2.13.0] - 2020-01-25
16
+
17
+ HTML Escape Sequences
18
+
19
+ ### Added
20
+
21
+ - All HTML 4 escape sequences are now parsed.
22
+
7
23
  ## [2.12.1] - 2020-01-05
8
24
 
9
25
  Ruby 2.7.0
@@ -1000,11 +1000,13 @@ read_coded_chars(PInfo pi, char *text) {
1000
1000
  char *b, buf[32];
1001
1001
  char *end = buf + sizeof(buf) - 1;
1002
1002
  char *s;
1003
+ long blen = 0;
1003
1004
 
1004
1005
  for (b = buf, s = pi->s; b < end; b++, s++) {
1005
1006
  *b = *s;
1006
1007
  if (';' == *s) {
1007
1008
  *(b + 1) = '\0';
1009
+ blen = b - buf;
1008
1010
  s++;
1009
1011
  break;
1010
1012
  }
@@ -1047,30 +1049,20 @@ read_coded_chars(PInfo pi, char *text) {
1047
1049
  } else {
1048
1050
  /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
1049
1051
  set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1050
- return 0;
1052
+ return NULL;
1051
1053
  }
1052
1054
  pi->s = s;
1053
1055
  }
1054
- } else if (0 == strcasecmp(buf, "nbsp;")) {
1055
- pi->s = s;
1056
- *text++ = ' ';
1057
- } else if (0 == strcasecmp(buf, "lt;")) {
1058
- pi->s = s;
1059
- *text++ = '<';
1060
- } else if (0 == strcasecmp(buf, "gt;")) {
1061
- pi->s = s;
1062
- *text++ = '>';
1063
- } else if (0 == strcasecmp(buf, "amp;")) {
1064
- pi->s = s;
1065
- *text++ = '&';
1066
- } else if (0 == strcasecmp(buf, "quot;")) {
1067
- pi->s = s;
1068
- *text++ = '"';
1069
- } else if (0 == strcasecmp(buf, "apos;")) {
1070
- pi->s = s;
1071
- *text++ = '\'';
1072
1056
  } else {
1073
- *text++ = '&';
1057
+ char *t2;
1058
+
1059
+ buf[blen] = '\0';
1060
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1061
+ *text++ = '&';
1062
+ } else {
1063
+ text = t2;
1064
+ pi->s = s;
1065
+ }
1074
1066
  }
1075
1067
  return text;
1076
1068
  }
@@ -1153,16 +1145,30 @@ collapse_special(PInfo pi, char *str) {
1153
1145
  *b++ = '&';
1154
1146
  continue;
1155
1147
  } else {
1156
- c = '?';
1148
+ char key[16];
1149
+ char *k = key;
1150
+ char *kend = key + sizeof(key) - 1;
1151
+
1152
+ *k++ = *s;
1157
1153
  while (';' != *s++) {
1158
1154
  if ('\0' == *s) {
1159
1155
  set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
1160
1156
  return EDOM;
1161
1157
  }
1158
+ if (kend <= k) {
1159
+ k = key;
1160
+ break;
1161
+ }
1162
+ *k++ = *s;
1162
1163
  }
1163
- s++;
1164
- set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1165
- return 0;
1164
+ k--;
1165
+ *k = '\0';
1166
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1167
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1168
+ c = '?';
1169
+ return 0;
1170
+ }
1171
+ continue;
1166
1172
  }
1167
1173
  *b++ = (char)c;
1168
1174
  }
@@ -1668,8 +1668,28 @@ ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
1668
1668
  c = '\'';
1669
1669
  s += 5;
1670
1670
  } else {
1671
- ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
1672
- c = '&';
1671
+ char key[16];
1672
+ char *k = key;
1673
+ char *kend = key + sizeof(key) - 1;
1674
+ char *bn;
1675
+ char *s2 = s;
1676
+
1677
+ for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
1678
+ if (kend <= k) {
1679
+ k = key;
1680
+ break;
1681
+ }
1682
+ *k = *s2;
1683
+ }
1684
+ *k = '\0';
1685
+ if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
1686
+ ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
1687
+ c = '&';
1688
+ } else {
1689
+ b = bn;
1690
+ s = s2 + 1;
1691
+ continue;
1692
+ }
1673
1693
  }
1674
1694
  *b++ = (char)c;
1675
1695
  col++;
@@ -3,6 +3,9 @@
3
3
  * All rights reserved.
4
4
  */
5
5
 
6
+ #include <string.h>
7
+ #include <stdbool.h>
8
+
6
9
  #include "special.h"
7
10
 
8
11
  /*
@@ -49,3 +52,345 @@ ox_ucs_to_utf8_chars(char *text, uint64_t u) {
49
52
  }
50
53
  return text;
51
54
  }
55
+
56
+ #define BUCKET_SIZE 256
57
+ #define BUCKET_MASK 255
58
+
59
+ typedef struct _slot {
60
+ const char *key;
61
+ uint64_t code;
62
+ struct _slot *next;
63
+ uint64_t hash;
64
+ } *Slot;
65
+
66
+ typedef struct _cache {
67
+ Slot buckets[BUCKET_SIZE];
68
+ } *Cache;
69
+
70
+ static struct _cache entity_cache;
71
+ static bool inited = false;
72
+
73
+ // HTML entities such as &amp;. This is a complete list from the HTML 4 spec.
74
+ static struct _slot entities[] = {
75
+ { "AElig", 198 }, // latin capital letter AE
76
+ { "Aacute", 193 }, // latin capital letter A with acute
77
+ { "Acirc", 194 }, // latin capital letter A with circumflex
78
+ { "Agrave", 192 }, // latin capital letter A with grave
79
+ { "Alpha", 913 }, // greek capital letter alpha, U+0391
80
+ { "Aring", 197 }, // latin capital letter A with ring above
81
+ { "Atilde", 195 }, // latin capital letter A with tilde
82
+ { "Auml", 196 }, // latin capital letter A with diaeresis
83
+ { "Beta", 914 }, // greek capital letter beta, U+0392
84
+ { "Ccedil", 199 }, // latin capital letter C with cedilla
85
+ { "Chi", 935 }, // greek capital letter chi, U+03A7
86
+ { "Dagger", 8225 }, // double dagger, U+2021 ISOpub
87
+ { "Delta", 916 }, // greek capital letter delta
88
+ { "ETH", 208 }, // latin capital letter ETH, U+00D0 ISOlat1
89
+ { "Eacute", 201 }, // latin capital letter E with acute
90
+ { "Ecirc", 202 }, // latin capital letter E with circumflex
91
+ { "Egrave", 200 }, // latin capital letter E with grave
92
+ { "Epsilon", 917 }, // greek capital letter epsilon, U+0395
93
+ { "Eta", 919 }, // greek capital letter eta, U+0397
94
+ { "Euml", 203 }, // latin capital letter E with diaeresis
95
+ { "Gamma", 915 }, // greek capital letter gamma
96
+ { "Iacute", 205 }, // latin capital letter I with acute
97
+ { "Icirc", 206 }, // latin capital letter I with circumflex
98
+ { "Igrave", 204 }, // latin capital letter I with grave
99
+ { "Iota", 921 }, // greek capital letter iota, U+0399
100
+ { "Iuml", 207 }, // latin capital letter I with diaeresis
101
+ { "Kappa", 922 }, // greek capital letter kappa, U+039A
102
+ { "Lambda", 923 }, // greek capital letter lambda
103
+ { "Mu", 924 }, // greek capital letter mu, U+039C
104
+ { "Ntilde", 209 }, // latin capital letter N with tilde
105
+ { "Nu", 925 }, // greek capital letter nu, U+039D
106
+ { "OElig", 338 }, // - latin capital ligature OE
107
+ { "Oacute", 211 }, // latin capital letter O with acute
108
+ { "Ocirc", 212 }, // latin capital letter O with circumflex
109
+ { "Ograve", 210 }, // latin capital letter O with grave
110
+ { "Omega", 937 }, // greek capital letter omega
111
+ { "Omicron", 927 }, // greek capital letter omicron, U+039F
112
+ { "Oslash", 216 }, // latin capital letter O with stroke
113
+ { "Otilde", 213 }, // latin capital letter O with tilde
114
+ { "Ouml", 214 }, // latin capital letter O with diaeresis
115
+ { "Phi", 934 }, // greek capital letter phi
116
+ { "Pi", 928 }, // greek capital letter pi, U+03A0 ISOgrk3
117
+ { "Prime", 8243 }, // double prime = seconds = inches
118
+ { "Psi", 936 }, // greek capital letter psi
119
+ { "Rho", 929 }, // greek capital letter rho, U+03A1
120
+ { "Scaron", 352 }, // - latin capital letter S with caron
121
+ { "Sigma", 931 }, // greek capital letter sigma
122
+ { "THORN", 222 }, // latin capital letter THORN
123
+ { "Tau", 932 }, // greek capital letter tau, U+03A4
124
+ { "Theta", 920 }, // greek capital letter theta
125
+ { "Uacute", 218 }, // latin capital letter U with acute
126
+ { "Ucirc", 219 }, // latin capital letter U with circumflex
127
+ { "Ugrave", 217 }, // latin capital letter U with grave
128
+ { "Upsilon", 933 }, // greek capital letter upsilon
129
+ { "Uuml", 220 }, // latin capital letter U with diaeresis
130
+ { "Xi", 926 }, // greek capital letter xi, U+039E ISOgrk3
131
+ { "Yacute", 221 }, // latin capital letter Y with acute
132
+ { "Yuml", 376 }, // - latin capital letter Y with diaeresis
133
+ { "Zeta", 918 }, // greek capital letter zeta, U+0396
134
+ { "aacute", 225 }, // latin small letter a with acute
135
+ { "acirc", 226 }, // latin small letter a with circumflex
136
+ { "acute", 180 }, // acute accent = spacing acute
137
+ { "aelig", 230 }, // latin small letter ae
138
+ { "agrave", 224 }, // latin small letter a with grave
139
+ { "alefsym", 8501 }, // alef symbol = first transfinite cardinal
140
+ { "alpha", 945 }, // greek small letter alpha
141
+ { "amp", 38 }, // -- ampersand, U+0026 ISOnum
142
+ { "and", 8743 }, // logical and = wedge, U+2227 ISOtech
143
+ { "ang", 8736 }, // angle, U+2220 ISOamso
144
+ { "aring", 229 }, // latin small letter a with ring above
145
+ { "asymp", 8776 }, // almost equal to = asymptotic to
146
+ { "atilde", 227 }, // latin small letter a with tilde
147
+ { "auml", 228 }, // latin small letter a with diaeresis
148
+ { "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW
149
+ { "beta", 946 }, // greek small letter beta, U+03B2 ISOgrk3
150
+ { "brvbar", 166 }, // broken bar = broken vertical bar
151
+ { "bull", 8226 }, // bullet = black small circle
152
+ { "cap", 8745 }, // intersection = cap, U+2229 ISOtech
153
+ { "ccedil", 231 }, // latin small letter c with cedilla
154
+ { "cedil", 184 }, // cedilla = spacing cedilla, U+00B8 ISOdia
155
+ { "cent", 162 }, // cent sign, U+00A2 ISOnum
156
+ { "chi", 967 }, // greek small letter chi, U+03C7 ISOgrk3
157
+ { "circ", 710 }, // - modifier letter circumflex accent
158
+ { "clubs", 9827 }, // black club suit = shamrock
159
+ { "cong", 8773 }, // approximately equal to, U+2245 ISOtech
160
+ { "copy", 169 }, // copyright sign, U+00A9 ISOnum
161
+ { "crarr", 8629 }, // downwards arrow with corner leftwards
162
+ { "cup", 8746 }, // union = cup, U+222A ISOtech
163
+ { "curren", 164 }, // currency sign, U+00A4 ISOnum
164
+ { "dArr", 8659 }, // downwards double arrow, U+21D3 ISOamsa
165
+ { "dagger", 8224 }, // dagger, U+2020 ISOpub
166
+ { "darr", 8595 }, // downwards arrow, U+2193 ISOnum
167
+ { "deg", 176 }, // degree sign, U+00B0 ISOnum
168
+ { "delta", 948 }, // greek small letter delta
169
+ { "diams", 9830 }, // black diamond suit, U+2666 ISOpub
170
+ { "divide", 247 }, // division sign, U+00F7 ISOnum
171
+ { "eacute", 233 }, // latin small letter e with acute
172
+ { "ecirc", 234 }, // latin small letter e with circumflex
173
+ { "egrave", 232 }, // latin small letter e with grave
174
+ { "empty", 8709 }, // empty set = null set = diameter
175
+ { "emsp", 8195 }, // em space, U+2003 ISOpub
176
+ { "ensp", 8194 }, // en space, U+2002 ISOpub
177
+ { "epsilon", 949 }, // greek small letter epsilon
178
+ { "equiv", 8801 }, // identical to, U+2261 ISOtech
179
+ { "eta", 951 }, // greek small letter eta, U+03B7 ISOgrk3
180
+ { "eth", 240 }, // latin small letter eth, U+00F0 ISOlat1
181
+ { "euml", 235 }, // latin small letter e with diaeresis
182
+ { "euro", 8364 }, // - euro sign, U+20AC NEW
183
+ { "exist", 8707 }, // there exists, U+2203 ISOtech
184
+ { "fnof", 402 }, // latin small f with hook = function
185
+ { "forall", 8704 }, // for all, U+2200 ISOtech
186
+ { "frac12", 189 }, // vulgar fraction one half
187
+ { "frac14", 188 }, // vulgar fraction one quarter
188
+ { "frac34", 190 }, // vulgar fraction three quarters
189
+ { "frasl", 8260 }, // fraction slash, U+2044 NEW
190
+ { "gamma", 947 }, // greek small letter gamma
191
+ { "ge", 8805 }, // greater-than or equal to
192
+ { "gt", 62 }, // -- greater-than sign, U+003E ISOnum
193
+ { "hArr", 8660 }, // left right double arrow
194
+ { "harr", 8596 }, // left right arrow, U+2194 ISOamsa
195
+ { "hearts", 9829 }, // black heart suit = valentine
196
+ { "hellip", 8230 }, // horizontal ellipsis = three dot leader
197
+ { "iacute", 237 }, // latin small letter i with acute
198
+ { "icirc", 238 }, // latin small letter i with circumflex
199
+ { "iexcl", 161 }, // inverted exclamation mark, U+00A1 ISOnum
200
+ { "igrave", 236 }, // latin small letter i with grave
201
+ { "image", 8465 }, // blackletter capital I = imaginary part
202
+ { "infin", 8734 }, // infinity, U+221E ISOtech
203
+ { "int", 8747 }, // integral, U+222B ISOtech
204
+ { "iota", 953 }, // greek small letter iota, U+03B9 ISOgrk3
205
+ { "iquest", 191 }, // inverted question mark
206
+ { "isin", 8712 }, // element of, U+2208 ISOtech
207
+ { "iuml", 239 }, // latin small letter i with diaeresis
208
+ { "kappa", 954 }, // greek small letter kappa
209
+ { "lArr", 8656 }, // leftwards double arrow, U+21D0 ISOtech
210
+ { "lambda", 955 }, // greek small letter lambda
211
+ { "lang", 9001 }, // left-pointing angle bracket = bra
212
+ { "laquo", 171 }, // left-pointing double angle quotation mark
213
+ { "larr", 8592 }, // leftwards arrow, U+2190 ISOnum
214
+ { "lceil", 8968 }, // left ceiling = apl upstile
215
+ { "ldquo", 8220 }, // left double quotation mark
216
+ { "le", 8804 }, // less-than or equal to, U+2264 ISOtech
217
+ { "lfloor", 8970 }, // left floor = apl downstile
218
+ { "lowast", 8727 }, // asterisk operator, U+2217 ISOtech
219
+ { "loz", 9674 }, // lozenge, U+25CA ISOpub
220
+ { "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070
221
+ { "lsaquo", 8249 }, // single left-pointing angle quotation mark
222
+ { "lsquo", 8216 }, // left single quotation mark
223
+ { "lt", 60 }, // -- less-than sign, U+003C ISOnum
224
+ { "macr", 175 }, // macron = spacing macron = overline
225
+ { "mdash", 8212 }, // em dash, U+2014 ISOpub
226
+ { "micro", 181 }, // micro sign, U+00B5 ISOnum
227
+ { "middot", 183 }, // middle dot = Georgian comma
228
+ { "minus", 8722 }, // minus sign, U+2212 ISOtech
229
+ { "mu", 956 }, // greek small letter mu, U+03BC ISOgrk3
230
+ { "nabla", 8711 }, // nabla = backward difference
231
+ { "nbsp", 160 }, // no-break space = non-breaking space
232
+ { "ndash", 8211 }, // en dash, U+2013 ISOpub
233
+ { "ne", 8800 }, // not equal to, U+2260 ISOtech
234
+ { "ni", 8715 }, // contains as member, U+220B ISOtech
235
+ { "not", 172 }, // not sign, U+00AC ISOnum
236
+ { "notin", 8713 }, // not an element of, U+2209 ISOtech
237
+ { "nsub", 8836 }, // not a subset of, U+2284 ISOamsn
238
+ { "ntilde", 241 }, // latin small letter n with tilde
239
+ { "nu", 957 }, // greek small letter nu, U+03BD ISOgrk3
240
+ { "oacute", 243 }, // latin small letter o with acute
241
+ { "ocirc", 244 }, // latin small letter o with circumflex
242
+ { "oelig", 339 }, // - latin small ligature oe, U+0153 ISOlat2
243
+ { "ograve", 242 }, // latin small letter o with grave
244
+ { "oline", 8254 }, // overline = spacing overscore
245
+ { "omega", 969 }, // greek small letter omega
246
+ { "omicron", 959 }, // greek small letter omicron, U+03BF NEW
247
+ { "oplus", 8853 }, // circled plus = direct sum
248
+ { "or", 8744 }, // logical or = vee, U+2228 ISOtech
249
+ { "ordf", 170 }, // feminine ordinal indicator, U+00AA ISOnum
250
+ { "ordm", 186 }, // masculine ordinal indicator
251
+ { "oslash", 248 }, // latin small letter o with stroke
252
+ { "otilde", 245 }, // latin small letter o with tilde
253
+ { "otimes", 8855 }, // circled times = vector product
254
+ { "ouml", 246 }, // latin small letter o with diaeresis
255
+ { "para", 182 }, // pilcrow sign = paragraph sign
256
+ { "part", 8706 }, // partial differential, U+2202 ISOtech
257
+ { "permil", 8240 }, // per mille sign, U+2030 ISOtech
258
+ { "perp", 8869 }, // up tack = orthogonal to = perpendicular
259
+ { "phi", 966 }, // greek small letter phi, U+03C6 ISOgrk3
260
+ { "pi", 960 }, // greek small letter pi, U+03C0 ISOgrk3
261
+ { "piv", 982 }, // greek pi symbol, U+03D6 ISOgrk3
262
+ { "plusmn", 177 }, // plus-minus sign = plus-or-minus sign
263
+ { "pound", 163 }, // pound sign, U+00A3 ISOnum
264
+ { "prime", 8242 }, // prime = minutes = feet, U+2032 ISOtech
265
+ { "prod", 8719 }, // n-ary product = product sign
266
+ { "prop", 8733 }, // proportional to, U+221D ISOtech
267
+ { "psi", 968 }, // greek small letter psi, U+03C8 ISOgrk3
268
+ { "quot", 34 }, // -- quotation mark = APL quote
269
+ { "rArr", 8658 }, // rightwards double arrow
270
+ { "radic", 8730 }, // square root = radical sign
271
+ { "rang", 9002 }, // right-pointing angle bracket = ket
272
+ { "raquo", 187 }, // right-pointing double angle quotation mark
273
+ { "rarr", 8594 }, // rightwards arrow, U+2192 ISOnum
274
+ { "rceil", 8969 }, // right ceiling, U+2309 ISOamsc
275
+ { "rdquo", 8221 }, // right double quotation mark
276
+ { "real", 8476 }, // blackletter capital R = real part symbol
277
+ { "reg", 174 }, // registered sign = registered trade mark sign
278
+ { "rfloor", 8971 }, // right floor, U+230B ISOamsc
279
+ { "rho", 961 }, // greek small letter rho, U+03C1 ISOgrk3
280
+ { "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070
281
+ { "rsaquo", 8250 }, // single right-pointing angle quotation mark
282
+ { "rsquo", 8217 }, // right single quotation mark
283
+ { "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW
284
+ { "scaron", 353 }, // - latin small letter s with caron
285
+ { "sdot", 8901 }, // dot operator, U+22C5 ISOamsb
286
+ { "sect", 167 }, // section sign, U+00A7 ISOnum
287
+ { "shy", 173 }, // soft hyphen = discretionary hyphen
288
+ { "sigma", 963 }, // greek small letter sigma
289
+ { "sigmaf", 962 }, // greek small letter final sigma
290
+ { "sim", 8764 }, // tilde operator = varies with = similar to
291
+ { "spades", 9824 }, // black spade suit, U+2660 ISOpub
292
+ { "sub", 8834 }, // subset of, U+2282 ISOtech
293
+ { "sube", 8838 }, // subset of or equal to, U+2286 ISOtech
294
+ { "sum", 8721 }, // n-ary summation, U+2211 ISOamsb
295
+ { "sup", 8835 }, // superset of, U+2283 ISOtech
296
+ { "sup1", 185 }, // superscript one = superscript digit one
297
+ { "sup2", 178 }, // superscript two = superscript digit two
298
+ { "sup3", 179 }, // superscript three = superscript digit three
299
+ { "supe", 8839 }, // superset of or equal to
300
+ { "szlig", 223 }, // latin small letter sharp s = ess-zed
301
+ { "tau", 964 }, // greek small letter tau, U+03C4 ISOgrk3
302
+ { "there4", 8756 }, // therefore, U+2234 ISOtech
303
+ { "theta", 952 }, // greek small letter theta
304
+ { "thetasym", 977 }, // greek small letter theta symbol
305
+ { "thinsp", 8201 }, // thin space, U+2009 ISOpub
306
+ { "thorn", 254 }, // latin small letter thorn
307
+ { "tilde", 732 }, // - small tilde, U+02DC ISOdia
308
+ { "times", 215 }, // multiplication sign, U+00D7 ISOnum
309
+ { "trade", 8482 }, // trade mark sign, U+2122 ISOnum
310
+ { "uArr", 8657 }, // upwards double arrow, U+21D1 ISOamsa
311
+ { "uacute", 250 }, // latin small letter u with acute
312
+ { "uarr", 8593 }, // upwards arrow, U+2191 ISOnum
313
+ { "ucirc", 251 }, // latin small letter u with circumflex
314
+ { "ugrave", 249 }, // latin small letter u with grave
315
+ { "uml", 168 }, // diaeresis = spacing diaeresis
316
+ { "upsih", 978 }, // greek upsilon with hook symbol
317
+ { "upsilon", 965 }, // greek small letter upsilon
318
+ { "uuml", 252 }, // latin small letter u with diaeresis
319
+ { "weierp", 8472 }, // script capital P = power set
320
+ { "xi", 958 }, // greek small letter xi, U+03BE ISOgrk3
321
+ { "yacute", 253 }, // latin small letter y with acute
322
+ { "yen", 165 }, // yen sign = yuan sign, U+00A5 ISOnum
323
+ { "yuml", 255 }, // latin small letter y with diaeresis
324
+ { "zeta", 950 }, // greek small letter zeta, U+03B6 ISOgrk3
325
+ { "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070
326
+ { "zwnj", 8204 }, // zero width non-joiner
327
+ { NULL, 0 },
328
+ };
329
+
330
+ static uint64_t
331
+ calc_hash(const char *key) {
332
+ uint64_t h = 0;
333
+
334
+ if (NULL != key) {
335
+ const uint8_t *k = (const uint8_t*)key;
336
+
337
+ for (; 0 != *k; k++) {
338
+ // narrow to most used range of 0x4D (77) in size
339
+ h = 77 * h + ((*k | 0x20) - 0x2D);
340
+ }
341
+ }
342
+ return h;
343
+ }
344
+
345
+ static Slot*
346
+ get_bucketp(uint64_t h) {
347
+ return entity_cache.buckets + (BUCKET_MASK & (h ^ (h << 5) ^ (h >> 7)));
348
+ }
349
+
350
+ static void
351
+ cache_set(Slot s) {
352
+ int64_t h = calc_hash(s->key);
353
+ Slot *bucket = get_bucketp(h);
354
+
355
+ s->hash = h;
356
+ s->next = *bucket;
357
+ *bucket = s;
358
+ }
359
+
360
+ static Slot
361
+ cache_get(const char *key) {
362
+ int64_t h = calc_hash(key);
363
+ Slot *bucket = get_bucketp(h);
364
+ Slot s;
365
+
366
+ for (s = *bucket; NULL != s; s = s->next) {
367
+ if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) {
368
+ return s;
369
+ }
370
+ }
371
+ return NULL;
372
+ }
373
+
374
+ static void
375
+ cache_init() {
376
+ Slot e = entities;
377
+
378
+ memset(&entity_cache, 0, sizeof(struct _cache));
379
+ for (; NULL != e->key; e++) {
380
+ cache_set(e);
381
+ }
382
+ inited = true;
383
+ }
384
+
385
+ char*
386
+ ox_entity_lookup(char *text, const char *key) {
387
+ Slot s = entities;
388
+
389
+ if (!inited) {
390
+ cache_init();
391
+ }
392
+ if (NULL == (s = cache_get(key))) {
393
+ return NULL;
394
+ }
395
+ return ox_ucs_to_utf8_chars(text, s->code);
396
+ }
@@ -9,5 +9,6 @@
9
9
  #include <stdint.h>
10
10
 
11
11
  extern char* ox_ucs_to_utf8_chars(char *text, uint64_t u);
12
+ extern char* ox_entity_lookup(char *text, const char *key);
12
13
 
13
14
  #endif /* OX_SPECIAL_H */
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Ox
3
3
  # Current version of the module.
4
- VERSION = '2.12.1'
4
+ VERSION = '2.13.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ox
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.12.1
4
+ version: 2.13.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Ohler
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-05 00:00:00.000000000 Z
11
+ date: 2020-01-30 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: "A fast XML parser and object serializer that uses only standard C lib.\n\nOptimized
14
14
  XML (Ox), as the name implies was written to provide speed optimized\nXML handling.