RubyGems - ox - Versions diffs - 2.12.1 → 2.13.1 - Mend

ox 2.12.1 → 2.13.1

Files changed (8) hide show

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 4d2959b76c94fd92a6234a9cf211f27263ac36da3e0b74bc724edc2f278585a4
-  data.tar.gz: eae845e3147144ee70d3b8f41603295c962c00a202282af8b0bd42cacde02ade
+  metadata.gz: 1ec1cf99d4e2684029f06dbf745c239ab00335d87017a4bd44fc792d908b9f33
+  data.tar.gz: 9f69f9506a7d83644c33b2c6658515fd0e1cfb8b4ce63e128fc8509a40fb31a0
 SHA512:
-  metadata.gz: fe53b32f1daae7e8444ce0efa476f3b69fee96b72b5f61dcda51aca3769bac24a2edbfc7aae914bcb073e845af7487d7004fec0b8c6408bee1b3515c6bff6315
-  data.tar.gz: ee673dbd381c0983cf592090c7bf01d99cbd23d9c5a2740892c9db688bf294824e4510ceff4bb31a0e340c13db8b78bb09badae1186c1929edd27ec59b8da9b2
+  metadata.gz: f296b41ddf11021ed71a4d61acd68a78741ef4cd0c7f23efa5be9ebef6254dbe60b749df0d4a9377d80d94d0ba46fff36fd283ae4111a58deea32ed0062a4cdc
+  data.tar.gz: c31bcbfc3271f92000c16e357a5aef34fbc105aa3afbc608d9a934f7d2c1c1f6d8af32de02250edf48032f5262bfd0f9b17316031a43f41e6ce5e10c8e1024a9

data/CHANGELOG.md CHANGED

@@ -4,6 +4,22 @@ All changes to the Ox gem are documented here. Releases follow semantic versioni
 ## [Unreleased]
+## [2.13.1] - 2020-01-30
+HTML Sequences
+### Added
+- All HTML 4 sequences are now supported.
+## [2.13.0] - 2020-01-25
+HTML Escape Sequences
+### Added
+- All HTML 4 escape sequences are now parsed.
 ## [2.12.1] - 2020-01-05
 Ruby 2.7.0

data/ext/ox/parse.c CHANGED

@@ -1000,11 +1000,13 @@ read_coded_chars(PInfo pi, char *text) {
     char	*b, buf[32];
     char	*end = buf + sizeof(buf) - 1;
     char	*s;
+    long	blen = 0;
     for (b = buf, s = pi->s; b < end; b++, s++) {
 	*b = *s;
 	if (';' == *s) {
 	    *(b + 1) = '\0';
+	    blen = b - buf;
 	    s++;
 	    break;
 	}
@@ -1047,30 +1049,20 @@ read_coded_chars(PInfo pi, char *text) {
 	    } else {
 		/*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
 		set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
-		return 0;
+		return NULL;
 	    }
 	    pi->s = s;
 	}
-    } else if (0 == strcasecmp(buf, "nbsp;")) {
-	pi->s = s;
-	*text++ = ' ';
-    } else if (0 == strcasecmp(buf, "lt;")) {
-	pi->s = s;
-	*text++ = '<';
-    } else if (0 == strcasecmp(buf, "gt;")) {
-	pi->s = s;
-	*text++ = '>';
-    } else if (0 == strcasecmp(buf, "amp;")) {
-	pi->s = s;
-	*text++ = '&';
-    } else if (0 == strcasecmp(buf, "quot;")) {
-	pi->s = s;
-	*text++ = '"';
-    } else if (0 == strcasecmp(buf, "apos;")) {
-	pi->s = s;
-	*text++ = '\'';
     } else {
-	*text++ = '&';
+	char	*t2;
+	buf[blen] = '\0';
+	if (NULL == (t2 = ox_entity_lookup(text, buf))) {
+	    *text++ = '&';
+	} else {
+	    text = t2;
+	    pi->s = s;
+	}
     }
     return text;
 }
@@ -1153,16 +1145,30 @@ collapse_special(PInfo pi, char *str) {
 		    *b++ = '&';
 		    continue;
 		} else {
-		    c = '?';
+		    char	key[16];
+		    char	*k = key;
+		    char	*kend = key + sizeof(key) - 1;
+		    *k++ = *s;
 		    while (';' != *s++) {
 			if ('\0' == *s) {
 			    set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
 			    return EDOM;
 			}
+			if (kend <= k) {
+			    k = key;
+			    break;
+			}
+			*k++ = *s;
 		    }
-		    s++;
-		    set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
-		    return 0;
+		    k--;
+		    *k = '\0';
+		    if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
+			set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
+			c = '?';
+			return 0;
+		    }
+		    continue;
 		}
 		*b++ = (char)c;
 	    }

data/ext/ox/sax.c CHANGED

@@ -1668,8 +1668,28 @@ ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
                 c = '\'';
                 s += 5;
             } else {
-		ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
-		c = '&';
+		char	key[16];
+		char	*k = key;
+		char	*kend = key + sizeof(key) - 1;
+		char	*bn;
+		char	*s2 = s;
+		for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
+		    if (kend <= k) {
+			k = key;
+			break;
+		    }
+		    *k = *s2;
+		}
+		*k = '\0';
+		if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
+		    ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
+		    c = '&';
+		} else {
+		    b = bn;
+		    s = s2 + 1;
+		    continue;
+		}
             }
             *b++ = (char)c;
 	    col++;

data/ext/ox/special.c CHANGED

@@ -3,6 +3,9 @@
  * All rights reserved.
  */
+#include <string.h>
+#include <stdbool.h>
 #include "special.h"
 /*
@@ -49,3 +52,345 @@ ox_ucs_to_utf8_chars(char *text, uint64_t u) {
     }
     return text;
 }
+#define BUCKET_SIZE	256	// number of bucket heads held by struct _cache
+#define BUCKET_MASK	255	// ANDed with the mixed hash in get_bucketp() to pick a bucket
+typedef struct _slot {	// one named entity; also a node in a bucket chain
+    const char		*key;	// entity name as listed in the entities table
+    uint64_t		code;	// value handed to ox_ucs_to_utf8_chars() on a hit
+    struct _slot	*next;	// next slot in the same bucket, NULL ends the chain
+    uint64_t		hash;	// calc_hash(key), filled in by cache_set()
+} *Slot;
+typedef struct _cache {	// bucket heads addressed through get_bucketp()
+    Slot		buckets[BUCKET_SIZE];
+} *Cache;
+static struct _cache	entity_cache;	// the one cache instance used by this file
+static bool		inited = false;	// set to true at the end of cache_init()
+// Named character entities such as &amp;. Holds the HTML 4 entity list plus
+// the XML predefined &apos;, which HTML 4 does not define. Each entry maps an
+// entity name (without '&' and ';') to its code point; the next and hash
+// members start out zeroed and are filled in by cache_set(). The list ends
+// with a NULL key entry.
+static struct _slot	entities[] = {
+    { "AElig", 198 },	// latin capital letter AE
+    { "Aacute", 193 },	// latin capital letter A with acute
+    { "Acirc", 194 },	// latin capital letter A with circumflex
+    { "Agrave", 192 },	// latin capital letter A with grave
+    { "Alpha", 913 },	// greek capital letter alpha, U+0391
+    { "Aring", 197 },	// latin capital letter A with ring above
+    { "Atilde", 195 },	// latin capital letter A with tilde
+    { "Auml", 196 },	// latin capital letter A with diaeresis
+    { "Beta", 914 },	// greek capital letter beta, U+0392
+    { "Ccedil", 199 },	// latin capital letter C with cedilla
+    { "Chi", 935 },	// greek capital letter chi, U+03A7
+    { "Dagger", 8225 },	// double dagger, U+2021 ISOpub
+    { "Delta", 916 },	// greek capital letter delta
+    { "ETH", 208 },	// latin capital letter ETH, U+00D0 ISOlat1
+    { "Eacute", 201 },	// latin capital letter E with acute
+    { "Ecirc", 202 },	// latin capital letter E with circumflex
+    { "Egrave", 200 },	// latin capital letter E with grave
+    { "Epsilon", 917 },	// greek capital letter epsilon, U+0395
+    { "Eta", 919 },	// greek capital letter eta, U+0397
+    { "Euml", 203 },	// latin capital letter E with diaeresis
+    { "Gamma", 915 },	// greek capital letter gamma
+    { "Iacute", 205 },	// latin capital letter I with acute
+    { "Icirc", 206 },	// latin capital letter I with circumflex
+    { "Igrave", 204 },	// latin capital letter I with grave
+    { "Iota", 921 },	// greek capital letter iota, U+0399
+    { "Iuml", 207 },	// latin capital letter I with diaeresis
+    { "Kappa", 922 },	// greek capital letter kappa, U+039A
+    { "Lambda", 923 },	// greek capital letter lambda
+    { "Mu", 924 },	// greek capital letter mu, U+039C
+    { "Ntilde", 209 },	// latin capital letter N with tilde
+    { "Nu", 925 },	// greek capital letter nu, U+039D
+    { "OElig", 338 },	// - latin capital ligature OE
+    { "Oacute", 211 },	// latin capital letter O with acute
+    { "Ocirc", 212 },	// latin capital letter O with circumflex
+    { "Ograve", 210 },	// latin capital letter O with grave
+    { "Omega", 937 },	// greek capital letter omega
+    { "Omicron", 927 },	// greek capital letter omicron, U+039F
+    { "Oslash", 216 },	// latin capital letter O with stroke
+    { "Otilde", 213 },	// latin capital letter O with tilde
+    { "Ouml", 214 },	// latin capital letter O with diaeresis
+    { "Phi", 934 },	// greek capital letter phi
+    { "Pi", 928 },	// greek capital letter pi, U+03A0 ISOgrk3
+    { "Prime", 8243 },	// double prime = seconds = inches
+    { "Psi", 936 },	// greek capital letter psi
+    { "Rho", 929 },	// greek capital letter rho, U+03A1
+    { "Scaron", 352 },	// - latin capital letter S with caron
+    { "Sigma", 931 },	// greek capital letter sigma
+    { "THORN", 222 },	// latin capital letter THORN
+    { "Tau", 932 },	// greek capital letter tau, U+03A4
+    { "Theta", 920 },	// greek capital letter theta
+    { "Uacute", 218 },	// latin capital letter U with acute
+    { "Ucirc", 219 },	// latin capital letter U with circumflex
+    { "Ugrave", 217 },	// latin capital letter U with grave
+    { "Upsilon", 933 },	// greek capital letter upsilon
+    { "Uuml", 220 },	// latin capital letter U with diaeresis
+    { "Xi", 926 },	// greek capital letter xi, U+039E ISOgrk3
+    { "Yacute", 221 },	// latin capital letter Y with acute
+    { "Yuml", 376 },	// - latin capital letter Y with diaeresis
+    { "Zeta", 918 },	// greek capital letter zeta, U+0396
+    { "aacute", 225 },	// latin small letter a with acute
+    { "acirc", 226 },	// latin small letter a with circumflex
+    { "acute", 180 },	// acute accent = spacing acute
+    { "aelig", 230 },	// latin small letter ae
+    { "agrave", 224 },	// latin small letter a with grave
+    { "alefsym", 8501 },	// alef symbol = first transfinite cardinal
+    { "alpha", 945 },	// greek small letter alpha
+    { "amp", 38 },	// -- ampersand, U+0026 ISOnum
+    { "and", 8743 },	// logical and = wedge, U+2227 ISOtech
+    { "ang", 8736 },	// angle, U+2220 ISOamso
+    { "apos", 39 },	// apostrophe, U+0027 (XML predefined, not in HTML 4)
+    { "aring", 229 },	// latin small letter a with ring above
+    { "asymp", 8776 },	// almost equal to = asymptotic to
+    { "atilde", 227 },	// latin small letter a with tilde
+    { "auml", 228 },	// latin small letter a with diaeresis
+    { "bdquo", 8222 },	// double low-9 quotation mark, U+201E NEW
+    { "beta", 946 },	// greek small letter beta, U+03B2 ISOgrk3
+    { "brvbar", 166 },	// broken bar = broken vertical bar
+    { "bull", 8226 },	// bullet = black small circle
+    { "cap", 8745 },	// intersection = cap, U+2229 ISOtech
+    { "ccedil", 231 },	// latin small letter c with cedilla
+    { "cedil", 184 },	// cedilla = spacing cedilla, U+00B8 ISOdia
+    { "cent", 162 },	// cent sign, U+00A2 ISOnum
+    { "chi", 967 },	// greek small letter chi, U+03C7 ISOgrk3
+    { "circ", 710 },	// - modifier letter circumflex accent
+    { "clubs", 9827 },	// black club suit = shamrock
+    { "cong", 8773 },	// approximately equal to, U+2245 ISOtech
+    { "copy", 169 },	// copyright sign, U+00A9 ISOnum
+    { "crarr", 8629 },	// downwards arrow with corner leftwards
+    { "cup", 8746 },	// union = cup, U+222A ISOtech
+    { "curren", 164 },	// currency sign, U+00A4 ISOnum
+    { "dArr", 8659 },	// downwards double arrow, U+21D3 ISOamsa
+    { "dagger", 8224 },	// dagger, U+2020 ISOpub
+    { "darr", 8595 },	// downwards arrow, U+2193 ISOnum
+    { "deg", 176 },	// degree sign, U+00B0 ISOnum
+    { "delta", 948 },	// greek small letter delta
+    { "diams", 9830 },	// black diamond suit, U+2666 ISOpub
+    { "divide", 247 },	// division sign, U+00F7 ISOnum
+    { "eacute", 233 },	// latin small letter e with acute
+    { "ecirc", 234 },	// latin small letter e with circumflex
+    { "egrave", 232 },	// latin small letter e with grave
+    { "empty", 8709 },	// empty set = null set = diameter
+    { "emsp", 8195 },	// em space, U+2003 ISOpub
+    { "ensp", 8194 },	// en space, U+2002 ISOpub
+    { "epsilon", 949 },	// greek small letter epsilon
+    { "equiv", 8801 },	// identical to, U+2261 ISOtech
+    { "eta", 951 },	// greek small letter eta, U+03B7 ISOgrk3
+    { "eth", 240 },	// latin small letter eth, U+00F0 ISOlat1
+    { "euml", 235 },	// latin small letter e with diaeresis
+    { "euro", 8364 },	// - euro sign, U+20AC NEW
+    { "exist", 8707 },	// there exists, U+2203 ISOtech
+    { "fnof", 402 },	// latin small f with hook = function
+    { "forall", 8704 },	// for all, U+2200 ISOtech
+    { "frac12", 189 },	// vulgar fraction one half
+    { "frac14", 188 },	// vulgar fraction one quarter
+    { "frac34", 190 },	// vulgar fraction three quarters
+    { "frasl", 8260 },	// fraction slash, U+2044 NEW
+    { "gamma", 947 },	// greek small letter gamma
+    { "ge", 8805 },	// greater-than or equal to
+    { "gt", 62 },	// -- greater-than sign, U+003E ISOnum
+    { "hArr", 8660 },	// left right double arrow
+    { "harr", 8596 },	// left right arrow, U+2194 ISOamsa
+    { "hearts", 9829 },	// black heart suit = valentine
+    { "hellip", 8230 },	// horizontal ellipsis = three dot leader
+    { "iacute", 237 },	// latin small letter i with acute
+    { "icirc", 238 },	// latin small letter i with circumflex
+    { "iexcl", 161 },	// inverted exclamation mark, U+00A1 ISOnum
+    { "igrave", 236 },	// latin small letter i with grave
+    { "image", 8465 },	// blackletter capital I = imaginary part
+    { "infin", 8734 },	// infinity, U+221E ISOtech
+    { "int", 8747 },	// integral, U+222B ISOtech
+    { "iota", 953 },	// greek small letter iota, U+03B9 ISOgrk3
+    { "iquest", 191 },	// inverted question mark
+    { "isin", 8712 },	// element of, U+2208 ISOtech
+    { "iuml", 239 },	// latin small letter i with diaeresis
+    { "kappa", 954 },	// greek small letter kappa
+    { "lArr", 8656 },	// leftwards double arrow, U+21D0 ISOtech
+    { "lambda", 955 },	// greek small letter lambda
+    { "lang", 9001 },	// left-pointing angle bracket = bra
+    { "laquo", 171 },	// left-pointing double angle quotation mark
+    { "larr", 8592 },	// leftwards arrow, U+2190 ISOnum
+    { "lceil", 8968 },	// left ceiling = apl upstile
+    { "ldquo", 8220 },	// left double quotation mark
+    { "le", 8804 },	// less-than or equal to, U+2264 ISOtech
+    { "lfloor", 8970 },	// left floor = apl downstile
+    { "lowast", 8727 },	// asterisk operator, U+2217 ISOtech
+    { "loz", 9674 },	// lozenge, U+25CA ISOpub
+    { "lrm", 8206 },	// left-to-right mark, U+200E NEW RFC 2070
+    { "lsaquo", 8249 },	// single left-pointing angle quotation mark
+    { "lsquo", 8216 },	// left single quotation mark
+    { "lt", 60 },	// -- less-than sign, U+003C ISOnum
+    { "macr", 175 },	// macron = spacing macron = overline
+    { "mdash", 8212 },	// em dash, U+2014 ISOpub
+    { "micro", 181 },	// micro sign, U+00B5 ISOnum
+    { "middot", 183 },	// middle dot = Georgian comma
+    { "minus", 8722 },	// minus sign, U+2212 ISOtech
+    { "mu", 956 },	// greek small letter mu, U+03BC ISOgrk3
+    { "nabla", 8711 },	// nabla = backward difference
+    { "nbsp", 160 },	// no-break space = non-breaking space
+    { "ndash", 8211 },	// en dash, U+2013 ISOpub
+    { "ne", 8800 },	// not equal to, U+2260 ISOtech
+    { "ni", 8715 },	// contains as member, U+220B ISOtech
+    { "not", 172 },	// not sign, U+00AC ISOnum
+    { "notin", 8713 },	// not an element of, U+2209 ISOtech
+    { "nsub", 8836 },	// not a subset of, U+2284 ISOamsn
+    { "ntilde", 241 },	// latin small letter n with tilde
+    { "nu", 957 },	// greek small letter nu, U+03BD ISOgrk3
+    { "oacute", 243 },	// latin small letter o with acute
+    { "ocirc", 244 },	// latin small letter o with circumflex
+    { "oelig", 339 },	// - latin small ligature oe, U+0153 ISOlat2
+    { "ograve", 242 },	// latin small letter o with grave
+    { "oline", 8254 },	// overline = spacing overscore
+    { "omega", 969 },	// greek small letter omega
+    { "omicron", 959 },	// greek small letter omicron, U+03BF NEW
+    { "oplus", 8853 },	// circled plus = direct sum
+    { "or", 8744 },	// logical or = vee, U+2228 ISOtech
+    { "ordf", 170 },	// feminine ordinal indicator, U+00AA ISOnum
+    { "ordm", 186 },	// masculine ordinal indicator
+    { "oslash", 248 },	// latin small letter o with stroke
+    { "otilde", 245 },	// latin small letter o with tilde
+    { "otimes", 8855 },	// circled times = vector product
+    { "ouml", 246 },	// latin small letter o with diaeresis
+    { "para", 182 },	// pilcrow sign = paragraph sign
+    { "part", 8706 },	// partial differential, U+2202 ISOtech
+    { "permil", 8240 },	// per mille sign, U+2030 ISOtech
+    { "perp", 8869 },	// up tack = orthogonal to = perpendicular
+    { "phi", 966 },	// greek small letter phi, U+03C6 ISOgrk3
+    { "pi", 960 },	// greek small letter pi, U+03C0 ISOgrk3
+    { "piv", 982 },	// greek pi symbol, U+03D6 ISOgrk3
+    { "plusmn", 177 },	// plus-minus sign = plus-or-minus sign
+    { "pound", 163 },	// pound sign, U+00A3 ISOnum
+    { "prime", 8242 },	// prime = minutes = feet, U+2032 ISOtech
+    { "prod", 8719 },	// n-ary product = product sign
+    { "prop", 8733 },	// proportional to, U+221D ISOtech
+    { "psi", 968 },	// greek small letter psi, U+03C8 ISOgrk3
+    { "quot", 34 },	// -- quotation mark = APL quote
+    { "rArr", 8658 },	// rightwards double arrow
+    { "radic", 8730 },	// square root = radical sign
+    { "rang", 9002 },	// right-pointing angle bracket = ket
+    { "raquo", 187 },	// right-pointing double angle quotation mark
+    { "rarr", 8594 },	// rightwards arrow, U+2192 ISOnum
+    { "rceil", 8969 },	// right ceiling, U+2309 ISOamsc
+    { "rdquo", 8221 },	// right double quotation mark
+    { "real", 8476 },	// blackletter capital R = real part symbol
+    { "reg", 174 },	// registered sign = registered trade mark sign
+    { "rfloor", 8971 },	// right floor, U+230B ISOamsc
+    { "rho", 961 },	// greek small letter rho, U+03C1 ISOgrk3
+    { "rlm", 8207 },	// right-to-left mark, U+200F NEW RFC 2070
+    { "rsaquo", 8250 },	// single right-pointing angle quotation mark
+    { "rsquo", 8217 },	// right single quotation mark
+    { "sbquo", 8218 },	// single low-9 quotation mark, U+201A NEW
+    { "scaron", 353 },	// - latin small letter s with caron
+    { "sdot", 8901 },	// dot operator, U+22C5 ISOamsb
+    { "sect", 167 },	// section sign, U+00A7 ISOnum
+    { "shy", 173 },	// soft hyphen = discretionary hyphen
+    { "sigma", 963 },	// greek small letter sigma
+    { "sigmaf", 962 },	// greek small letter final sigma
+    { "sim", 8764 },	// tilde operator = varies with = similar to
+    { "spades", 9824 },	// black spade suit, U+2660 ISOpub
+    { "sub", 8834 },	// subset of, U+2282 ISOtech
+    { "sube", 8838 },	// subset of or equal to, U+2286 ISOtech
+    { "sum", 8721 },	// n-ary sumation, U+2211 ISOamsb
+    { "sup", 8835 },	// superset of, U+2283 ISOtech
+    { "sup1", 185 },	// superscript one = superscript digit one
+    { "sup2", 178 },	// superscript two = superscript digit two
+    { "sup3", 179 },	// superscript three = superscript digit three
+    { "supe", 8839 },	// superset of or equal to
+    { "szlig", 223 },	// latin small letter sharp s = ess-zed
+    { "tau", 964 },	// greek small letter tau, U+03C4 ISOgrk3
+    { "there4", 8756 },	// therefore, U+2234 ISOtech
+    { "theta", 952 },	// greek small letter theta
+    { "thetasym", 977 },	// greek small letter theta symbol
+    { "thinsp", 8201 },	// thin space, U+2009 ISOpub
+    { "thorn", 254 },	// latin small letter thorn
+    { "tilde", 732 },	// - small tilde, U+02DC ISOdia
+    { "times", 215 },	// multiplication sign, U+00D7 ISOnum
+    { "trade", 8482 },	// trade mark sign, U+2122 ISOnum
+    { "uArr", 8657 },	// upwards double arrow, U+21D1 ISOamsa
+    { "uacute", 250 },	// latin small letter u with acute
+    { "uarr", 8593 },	// upwards arrow, U+2191 ISOnum
+    { "ucirc", 251 },	// latin small letter u with circumflex
+    { "ugrave", 249 },	// latin small letter u with grave
+    { "uml", 168 },	// diaeresis = spacing diaeresis
+    { "upsih", 978 },	// greek upsilon with hook symbol
+    { "upsilon", 965 },	// greek small letter upsilon
+    { "uuml", 252 },	// latin small letter u with diaeresis
+    { "weierp", 8472 },	// script capital P = power set
+    { "xi", 958 },	// greek small letter xi, U+03BE ISOgrk3
+    { "yacute", 253 },	// latin small letter y with acute
+    { "yen", 165 },	// yen sign = yuan sign, U+00A5 ISOnum
+    { "yuml", 255 },	// latin small letter y with diaeresis
+    { "zeta", 950 },	// greek small letter zeta, U+03B6 ISOgrk3
+    { "zwj", 8205 },	// zero width joiner, U+200D NEW RFC 2070
+    { "zwnj", 8204 },	// zero width non-joiner
+    { NULL, 0 },
+};
+// Fold key into a 64-bit hash. Each byte has bit 0x20 set (which maps ASCII
+// upper-case letters onto lower-case) and is shifted down by 0x2D before it
+// is mixed in with a multiplier of 77, so keys that differ only in letter
+// case hash to the same value. A NULL or empty key hashes to 0.
+static uint64_t
+calc_hash(const char *key) {
+    const uint8_t	*cp = (const uint8_t*)key;
+    uint64_t		sum = 0;
+    if (NULL == key) {
+	return 0;
+    }
+    while (0 != *cp) {
+	sum = 77 * sum + ((*cp | 0x20) - 0x2D);
+	cp++;
+    }
+    return sum;
+}
+// Map a key hash onto its bucket in entity_cache. The hash is XORed with
+// two shifted copies of itself and masked with BUCKET_MASK to form the
+// index. Returns the address of the bucket head so the caller can read it
+// or replace it.
+static Slot*
+get_bucketp(uint64_t h) {
+    uint64_t	mixed = h ^ (h << 5) ^ (h >> 7);
+    return &entity_cache.buckets[BUCKET_MASK & mixed];
+}
+// Link slot s into the entity cache: store the hash of its key in the slot
+// and push the slot onto the front of the matching bucket chain.
+static void
+cache_set(Slot s) {
+    int64_t	hash = calc_hash(s->key);
+    Slot	*head = get_bucketp(hash);
+    s->next = *head;
+    s->hash = hash;
+    *head = s;
+}
+// Find the slot for entity name key, or return NULL when there is none.
+//
+// calc_hash() ignores letter case, so names that differ only in case
+// ("Alpha" and "alpha", "Dagger" and "dagger") share a hash and a bucket
+// chain. An exact, case-sensitive match is therefore preferred. Only when
+// no exact match exists is the first case-insensitive match in the chain
+// returned, which keeps spellings such as "AMP" or "Nbsp" working.
+static Slot
+cache_get(const char *key) {
+    uint64_t	h = calc_hash(key);
+    Slot	*bucket = get_bucketp(h);
+    Slot	folded = NULL;
+    Slot	s;
+    if (NULL == key) {
+	return NULL;
+    }
+    for (s = *bucket; NULL != s; s = s->next) {
+	if (h != s->hash) {
+	    continue;
+	}
+	if (0 == strcmp(s->key, key)) {
+	    return s;
+	}
+	// Remember the first case-insensitive hit but keep scanning in case
+	// an exact match sits further down the chain.
+	if (NULL == folded && 0 == strcasecmp(s->key, key)) {
+	    folded = s;
+	}
+    }
+    return folded;
+}
+// Clear every bucket in entity_cache, add each entry of the entities table
+// up to its NULL key terminator, then mark the cache as initialized.
+static void
+cache_init() {
+    Slot	slot;
+    memset(&entity_cache, 0, sizeof(entity_cache));
+    for (slot = entities; NULL != slot->key; slot++) {
+	cache_set(slot);
+    }
+    inited = true;
+}
+// Look up the named entity key and write its character to text as UTF-8.
+// The entity cache is built on the first call. Returns NULL when key is not
+// a known entity; otherwise returns whatever ox_ucs_to_utf8_chars() returns
+// for text and the entity's code, which the callers use as their new write
+// position.
+char*
+ox_entity_lookup(char *text, const char *key) {
+    Slot	found;
+    if (!inited) {
+	cache_init();
+    }
+    found = cache_get(key);
+    if (NULL == found) {
+	return NULL;
+    }
+    return ox_ucs_to_utf8_chars(text, found->code);
+}

data/ext/ox/special.h CHANGED

@@ -9,5 +9,6 @@
 #include <stdint.h>
 extern char*	ox_ucs_to_utf8_chars(char *text, uint64_t u);
+extern char*	ox_entity_lookup(char *text, const char *key);
 #endif /* OX_SPECIAL_H */

data/lib/ox/version.rb CHANGED

@@ -1,5 +1,5 @@
 module Ox
   # Current version of the module.
-  VERSION = '2.12.1'
+  VERSION = '2.13.1'
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: ox
 version: !ruby/object:Gem::Version
-  version: 2.12.1
+  version: 2.13.1
 platform: ruby
 authors:
 - Peter Ohler
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-01-05 00:00:00.000000000 Z
+date: 2020-01-30 00:00:00.000000000 Z
 dependencies: []
 description: "A fast XML parser and object serializer that uses only standard C lib.\n\nOptimized
   XML (Ox), as the name implies was written to provide speed optimized\nXML handling.