ox 2.12.1 → 2.13.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4d2959b76c94fd92a6234a9cf211f27263ac36da3e0b74bc724edc2f278585a4
4
- data.tar.gz: eae845e3147144ee70d3b8f41603295c962c00a202282af8b0bd42cacde02ade
3
+ metadata.gz: 1ec1cf99d4e2684029f06dbf745c239ab00335d87017a4bd44fc792d908b9f33
4
+ data.tar.gz: 9f69f9506a7d83644c33b2c6658515fd0e1cfb8b4ce63e128fc8509a40fb31a0
5
5
  SHA512:
6
- metadata.gz: fe53b32f1daae7e8444ce0efa476f3b69fee96b72b5f61dcda51aca3769bac24a2edbfc7aae914bcb073e845af7487d7004fec0b8c6408bee1b3515c6bff6315
7
- data.tar.gz: ee673dbd381c0983cf592090c7bf01d99cbd23d9c5a2740892c9db688bf294824e4510ceff4bb31a0e340c13db8b78bb09badae1186c1929edd27ec59b8da9b2
6
+ metadata.gz: f296b41ddf11021ed71a4d61acd68a78741ef4cd0c7f23efa5be9ebef6254dbe60b749df0d4a9377d80d94d0ba46fff36fd283ae4111a58deea32ed0062a4cdc
7
+ data.tar.gz: c31bcbfc3271f92000c16e357a5aef34fbc105aa3afbc608d9a934f7d2c1c1f6d8af32de02250edf48032f5262bfd0f9b17316031a43f41e6ce5e10c8e1024a9
@@ -4,6 +4,22 @@ All changes to the Ox gem are documented here. Releases follow semantic versioni
4
4
 
5
5
  ## [Unreleased]
6
6
 
7
+ ## [2.13.1] - 2020-01-30
8
+
9
+ HTML Sequences
10
+
11
+ ### Added
12
+
13
+ - All HTML 4 sequences are now supported.
14
+
15
+ ## [2.13.0] - 2020-01-25
16
+
17
+ HTML Escape Sequences
18
+
19
+ ### Added
20
+
21
+ - All HTML 4 escape sequences are now parsed.
22
+
7
23
  ## [2.12.1] - 2020-01-05
8
24
 
9
25
  Ruby 2.7.0
@@ -1000,11 +1000,13 @@ read_coded_chars(PInfo pi, char *text) {
1000
1000
  char *b, buf[32];
1001
1001
  char *end = buf + sizeof(buf) - 1;
1002
1002
  char *s;
1003
+ long blen = 0;
1003
1004
 
1004
1005
  for (b = buf, s = pi->s; b < end; b++, s++) {
1005
1006
  *b = *s;
1006
1007
  if (';' == *s) {
1007
1008
  *(b + 1) = '\0';
1009
+ blen = b - buf;
1008
1010
  s++;
1009
1011
  break;
1010
1012
  }
@@ -1047,30 +1049,20 @@ read_coded_chars(PInfo pi, char *text) {
1047
1049
  } else {
1048
1050
  /*set_error(&pi->err, "Invalid encoding, need UTF-8 or UTF-16 encoding to parse &#nnnn; character sequences.", pi->str, pi->s); */
1049
1051
  set_error(&pi->err, "Invalid encoding, need UTF-8 encoding to parse &#nnnn; character sequences.", pi->str, pi->s);
1050
- return 0;
1052
+ return NULL;
1051
1053
  }
1052
1054
  pi->s = s;
1053
1055
  }
1054
- } else if (0 == strcasecmp(buf, "nbsp;")) {
1055
- pi->s = s;
1056
- *text++ = ' ';
1057
- } else if (0 == strcasecmp(buf, "lt;")) {
1058
- pi->s = s;
1059
- *text++ = '<';
1060
- } else if (0 == strcasecmp(buf, "gt;")) {
1061
- pi->s = s;
1062
- *text++ = '>';
1063
- } else if (0 == strcasecmp(buf, "amp;")) {
1064
- pi->s = s;
1065
- *text++ = '&';
1066
- } else if (0 == strcasecmp(buf, "quot;")) {
1067
- pi->s = s;
1068
- *text++ = '"';
1069
- } else if (0 == strcasecmp(buf, "apos;")) {
1070
- pi->s = s;
1071
- *text++ = '\'';
1072
1056
  } else {
1073
- *text++ = '&';
1057
+ char *t2;
1058
+
1059
+ buf[blen] = '\0';
1060
+ if (NULL == (t2 = ox_entity_lookup(text, buf))) {
1061
+ *text++ = '&';
1062
+ } else {
1063
+ text = t2;
1064
+ pi->s = s;
1065
+ }
1074
1066
  }
1075
1067
  return text;
1076
1068
  }
@@ -1153,16 +1145,30 @@ collapse_special(PInfo pi, char *str) {
1153
1145
  *b++ = '&';
1154
1146
  continue;
1155
1147
  } else {
1156
- c = '?';
1148
+ char key[16];
1149
+ char *k = key;
1150
+ char *kend = key + sizeof(key) - 1;
1151
+
1152
+ *k++ = *s;
1157
1153
  while (';' != *s++) {
1158
1154
  if ('\0' == *s) {
1159
1155
  set_error(&pi->err, "Invalid format, special character does not end with a semicolon", pi->str, pi->s);
1160
1156
  return EDOM;
1161
1157
  }
1158
+ if (kend <= k) {
1159
+ k = key;
1160
+ break;
1161
+ }
1162
+ *k++ = *s;
1162
1163
  }
1163
- s++;
1164
- set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1165
- return 0;
1164
+ k--;
1165
+ *k = '\0';
1166
+ if ('\0' == *key || NULL == (b = ox_entity_lookup(b, key))) {
1167
+ set_error(&pi->err, "Invalid format, invalid special character sequence", pi->str, pi->s);
1168
+ c = '?';
1169
+ return 0;
1170
+ }
1171
+ continue;
1166
1172
  }
1167
1173
  *b++ = (char)c;
1168
1174
  }
@@ -1668,8 +1668,28 @@ ox_sax_collapse_special(SaxDrive dr, char *str, long pos, long line, long col) {
1668
1668
  c = '\'';
1669
1669
  s += 5;
1670
1670
  } else {
1671
- ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
1672
- c = '&';
1671
+ char key[16];
1672
+ char *k = key;
1673
+ char *kend = key + sizeof(key) - 1;
1674
+ char *bn;
1675
+ char *s2 = s;
1676
+
1677
+ for (; ';' != *s2 && '\0' != *s2; s2++, k++) {
1678
+ if (kend <= k) {
1679
+ k = key;
1680
+ break;
1681
+ }
1682
+ *k = *s2;
1683
+ }
1684
+ *k = '\0';
1685
+ if ('\0' == *key || NULL == (bn = ox_entity_lookup(b, key))) {
1686
+ ox_sax_drive_error_at(dr, INVALID_FORMAT "Invalid special character sequence", pos, line, col);
1687
+ c = '&';
1688
+ } else {
1689
+ b = bn;
1690
+ s = s2 + 1;
1691
+ continue;
1692
+ }
1673
1693
  }
1674
1694
  *b++ = (char)c;
1675
1695
  col++;
@@ -3,6 +3,9 @@
3
3
  * All rights reserved.
4
4
  */
5
5
 
6
+ #include <string.h>
7
+ #include <stdbool.h>
8
+
6
9
  #include "special.h"
7
10
 
8
11
  /*
@@ -49,3 +52,345 @@ ox_ucs_to_utf8_chars(char *text, uint64_t u) {
49
52
  }
50
53
  return text;
51
54
  }
55
+
56
+ #define BUCKET_SIZE 256 // number of bucket heads in struct _cache
57
+ #define BUCKET_MASK 255 // BUCKET_SIZE - 1; get_bucketp() ANDs with this to keep the index in range
58
+
59
+ typedef struct _slot { // one entities[] entry; also serves as a node in a bucket chain
60
+ const char *key; // entity name as spelled in the table, e.g. "amp" (no '&' or ';')
61
+ uint64_t code; // code point handed to ox_ucs_to_utf8_chars() by ox_entity_lookup()
62
+ struct _slot *next; // next slot in the same bucket; assigned by cache_set()
63
+ uint64_t hash; // calc_hash(key), stored by cache_set() and compared first in cache_get()
64
+ } *Slot;
65
+
66
+ typedef struct _cache { // fixed-size chained hash table over the entities[] slots
67
+ Slot buckets[BUCKET_SIZE]; // chain heads; NULL means the bucket is empty
68
+ } *Cache;
69
+
70
+ static struct _cache entity_cache; // the single lookup table, filled by cache_init()
71
+ static bool inited = false; // set to true by cache_init(); tested by ox_entity_lookup()
72
+
73
+ // HTML entities such as &amp;. This is a complete list from the HTML 4 spec.
74
+ static struct _slot entities[] = {
75
+ { "AElig", 198 }, // latin capital letter AE
76
+ { "Aacute", 193 }, // latin capital letter A with acute
77
+ { "Acirc", 194 }, // latin capital letter A with circumflex
78
+ { "Agrave", 192 }, // latin capital letter A with grave
79
+ { "Alpha", 913 }, // greek capital letter alpha, U+0391
80
+ { "Aring", 197 }, // latin capital letter A with ring above
81
+ { "Atilde", 195 }, // latin capital letter A with tilde
82
+ { "Auml", 196 }, // latin capital letter A with diaeresis
83
+ { "Beta", 914 }, // greek capital letter beta, U+0392
84
+ { "Ccedil", 199 }, // latin capital letter C with cedilla
85
+ { "Chi", 935 }, // greek capital letter chi, U+03A7
86
+ { "Dagger", 8225 }, // double dagger, U+2021 ISOpub
87
+ { "Delta", 916 }, // greek capital letter delta
88
+ { "ETH", 208 }, // latin capital letter ETH, U+00D0 ISOlat1
89
+ { "Eacute", 201 }, // latin capital letter E with acute
90
+ { "Ecirc", 202 }, // latin capital letter E with circumflex
91
+ { "Egrave", 200 }, // latin capital letter E with grave
92
+ { "Epsilon", 917 }, // greek capital letter epsilon, U+0395
93
+ { "Eta", 919 }, // greek capital letter eta, U+0397
94
+ { "Euml", 203 }, // latin capital letter E with diaeresis
95
+ { "Gamma", 915 }, // greek capital letter gamma
96
+ { "Iacute", 205 }, // latin capital letter I with acute
97
+ { "Icirc", 206 }, // latin capital letter I with circumflex
98
+ { "Igrave", 204 }, // latin capital letter I with grave
99
+ { "Iota", 921 }, // greek capital letter iota, U+0399
100
+ { "Iuml", 207 }, // latin capital letter I with diaeresis
101
+ { "Kappa", 922 }, // greek capital letter kappa, U+039A
102
+ { "Lambda", 923 }, // greek capital letter lambda
103
+ { "Mu", 924 }, // greek capital letter mu, U+039C
104
+ { "Ntilde", 209 }, // latin capital letter N with tilde
105
+ { "Nu", 925 }, // greek capital letter nu, U+039D
106
+ { "OElig", 338 }, // - latin capital ligature OE
107
+ { "Oacute", 211 }, // latin capital letter O with acute
108
+ { "Ocirc", 212 }, // latin capital letter O with circumflex
109
+ { "Ograve", 210 }, // latin capital letter O with grave
110
+ { "Omega", 937 }, // greek capital letter omega
111
+ { "Omicron", 927 }, // greek capital letter omicron, U+039F
112
+ { "Oslash", 216 }, // latin capital letter O with stroke
113
+ { "Otilde", 213 }, // latin capital letter O with tilde
114
+ { "Ouml", 214 }, // latin capital letter O with diaeresis
115
+ { "Phi", 934 }, // greek capital letter phi
116
+ { "Pi", 928 }, // greek capital letter pi, U+03A0 ISOgrk3
117
+ { "Prime", 8243 }, // double prime = seconds = inches
118
+ { "Psi", 936 }, // greek capital letter psi
119
+ { "Rho", 929 }, // greek capital letter rho, U+03A1
120
+ { "Scaron", 352 }, // - latin capital letter S with caron
121
+ { "Sigma", 931 }, // greek capital letter sigma
122
+ { "THORN", 222 }, // latin capital letter THORN
123
+ { "Tau", 932 }, // greek capital letter tau, U+03A4
124
+ { "Theta", 920 }, // greek capital letter theta
125
+ { "Uacute", 218 }, // latin capital letter U with acute
126
+ { "Ucirc", 219 }, // latin capital letter U with circumflex
127
+ { "Ugrave", 217 }, // latin capital letter U with grave
128
+ { "Upsilon", 933 }, // greek capital letter upsilon
129
+ { "Uuml", 220 }, // latin capital letter U with diaeresis
130
+ { "Xi", 926 }, // greek capital letter xi, U+039E ISOgrk3
131
+ { "Yacute", 221 }, // latin capital letter Y with acute
132
+ { "Yuml", 376 }, // - latin capital letter Y with diaeresis
133
+ { "Zeta", 918 }, // greek capital letter zeta, U+0396
134
+ { "aacute", 225 }, // latin small letter a with acute
135
+ { "acirc", 226 }, // latin small letter a with circumflex
136
+ { "acute", 180 }, // acute accent = spacing acute
137
+ { "aelig", 230 }, // latin small letter ae
138
+ { "agrave", 224 }, // latin small letter a with grave
139
+ { "alefsym", 8501 }, // alef symbol = first transfinite cardinal
140
+ { "alpha", 945 }, // greek small letter alpha
141
+ { "amp", 38 }, // -- ampersand, U+0026 ISOnum
142
+ { "and", 8743 }, // logical and = wedge, U+2227 ISOtech
143
+ { "ang", 8736 }, // angle, U+2220 ISOamso
144
+ { "aring", 229 }, // latin small letter a with ring above
145
+ { "asymp", 8776 }, // almost equal to = asymptotic to
146
+ { "atilde", 227 }, // latin small letter a with tilde
147
+ { "auml", 228 }, // latin small letter a with diaeresis
148
+ { "bdquo", 8222 }, // double low-9 quotation mark, U+201E NEW
149
+ { "beta", 946 }, // greek small letter beta, U+03B2 ISOgrk3
150
+ { "brvbar", 166 }, // broken bar = broken vertical bar
151
+ { "bull", 8226 }, // bullet = black small circle
152
+ { "cap", 8745 }, // intersection = cap, U+2229 ISOtech
153
+ { "ccedil", 231 }, // latin small letter c with cedilla
154
+ { "cedil", 184 }, // cedilla = spacing cedilla, U+00B8 ISOdia
155
+ { "cent", 162 }, // cent sign, U+00A2 ISOnum
156
+ { "chi", 967 }, // greek small letter chi, U+03C7 ISOgrk3
157
+ { "circ", 710 }, // - modifier letter circumflex accent
158
+ { "clubs", 9827 }, // black club suit = shamrock
159
+ { "cong", 8773 }, // approximately equal to, U+2245 ISOtech
160
+ { "copy", 169 }, // copyright sign, U+00A9 ISOnum
161
+ { "crarr", 8629 }, // downwards arrow with corner leftwards
162
+ { "cup", 8746 }, // union = cup, U+222A ISOtech
163
+ { "curren", 164 }, // currency sign, U+00A4 ISOnum
164
+ { "dArr", 8659 }, // downwards double arrow, U+21D3 ISOamsa
165
+ { "dagger", 8224 }, // dagger, U+2020 ISOpub
166
+ { "darr", 8595 }, // downwards arrow, U+2193 ISOnum
167
+ { "deg", 176 }, // degree sign, U+00B0 ISOnum
168
+ { "delta", 948 }, // greek small letter delta
169
+ { "diams", 9830 }, // black diamond suit, U+2666 ISOpub
170
+ { "divide", 247 }, // division sign, U+00F7 ISOnum
171
+ { "eacute", 233 }, // latin small letter e with acute
172
+ { "ecirc", 234 }, // latin small letter e with circumflex
173
+ { "egrave", 232 }, // latin small letter e with grave
174
+ { "empty", 8709 }, // empty set = null set = diameter
175
+ { "emsp", 8195 }, // em space, U+2003 ISOpub
176
+ { "ensp", 8194 }, // en space, U+2002 ISOpub
177
+ { "epsilon", 949 }, // greek small letter epsilon
178
+ { "equiv", 8801 }, // identical to, U+2261 ISOtech
179
+ { "eta", 951 }, // greek small letter eta, U+03B7 ISOgrk3
180
+ { "eth", 240 }, // latin small letter eth, U+00F0 ISOlat1
181
+ { "euml", 235 }, // latin small letter e with diaeresis
182
+ { "euro", 8364 }, // - euro sign, U+20AC NEW
183
+ { "exist", 8707 }, // there exists, U+2203 ISOtech
184
+ { "fnof", 402 }, // latin small f with hook = function
185
+ { "forall", 8704 }, // for all, U+2200 ISOtech
186
+ { "frac12", 189 }, // vulgar fraction one half
187
+ { "frac14", 188 }, // vulgar fraction one quarter
188
+ { "frac34", 190 }, // vulgar fraction three quarters
189
+ { "frasl", 8260 }, // fraction slash, U+2044 NEW
190
+ { "gamma", 947 }, // greek small letter gamma
191
+ { "ge", 8805 }, // greater-than or equal to
192
+ { "gt", 62 }, // -- greater-than sign, U+003E ISOnum
193
+ { "hArr", 8660 }, // left right double arrow
194
+ { "harr", 8596 }, // left right arrow, U+2194 ISOamsa
195
+ { "hearts", 9829 }, // black heart suit = valentine
196
+ { "hellip", 8230 }, // horizontal ellipsis = three dot leader
197
+ { "iacute", 237 }, // latin small letter i with acute
198
+ { "icirc", 238 }, // latin small letter i with circumflex
199
+ { "iexcl", 161 }, // inverted exclamation mark, U+00A1 ISOnum
200
+ { "igrave", 236 }, // latin small letter i with grave
201
+ { "image", 8465 }, // blackletter capital I = imaginary part
202
+ { "infin", 8734 }, // infinity, U+221E ISOtech
203
+ { "int", 8747 }, // integral, U+222B ISOtech
204
+ { "iota", 953 }, // greek small letter iota, U+03B9 ISOgrk3
205
+ { "iquest", 191 }, // inverted question mark
206
+ { "isin", 8712 }, // element of, U+2208 ISOtech
207
+ { "iuml", 239 }, // latin small letter i with diaeresis
208
+ { "kappa", 954 }, // greek small letter kappa
209
+ { "lArr", 8656 }, // leftwards double arrow, U+21D0 ISOtech
210
+ { "lambda", 955 }, // greek small letter lambda
211
+ { "lang", 9001 }, // left-pointing angle bracket = bra
212
+ { "laquo", 171 }, // left-pointing double angle quotation mark
213
+ { "larr", 8592 }, // leftwards arrow, U+2190 ISOnum
214
+ { "lceil", 8968 }, // left ceiling = apl upstile
215
+ { "ldquo", 8220 }, // left double quotation mark
216
+ { "le", 8804 }, // less-than or equal to, U+2264 ISOtech
217
+ { "lfloor", 8970 }, // left floor = apl downstile
218
+ { "lowast", 8727 }, // asterisk operator, U+2217 ISOtech
219
+ { "loz", 9674 }, // lozenge, U+25CA ISOpub
220
+ { "lrm", 8206 }, // left-to-right mark, U+200E NEW RFC 2070
221
+ { "lsaquo", 8249 }, // single left-pointing angle quotation mark
222
+ { "lsquo", 8216 }, // left single quotation mark
223
+ { "lt", 60 }, // -- less-than sign, U+003C ISOnum
224
+ { "macr", 175 }, // macron = spacing macron = overline
225
+ { "mdash", 8212 }, // em dash, U+2014 ISOpub
226
+ { "micro", 181 }, // micro sign, U+00B5 ISOnum
227
+ { "middot", 183 }, // middle dot = Georgian comma
228
+ { "minus", 8722 }, // minus sign, U+2212 ISOtech
229
+ { "mu", 956 }, // greek small letter mu, U+03BC ISOgrk3
230
+ { "nabla", 8711 }, // nabla = backward difference
231
+ { "nbsp", 160 }, // no-break space = non-breaking space
232
+ { "ndash", 8211 }, // en dash, U+2013 ISOpub
233
+ { "ne", 8800 }, // not equal to, U+2260 ISOtech
234
+ { "ni", 8715 }, // contains as member, U+220B ISOtech
235
+ { "not", 172 }, // not sign, U+00AC ISOnum
236
+ { "notin", 8713 }, // not an element of, U+2209 ISOtech
237
+ { "nsub", 8836 }, // not a subset of, U+2284 ISOamsn
238
+ { "ntilde", 241 }, // latin small letter n with tilde
239
+ { "nu", 957 }, // greek small letter nu, U+03BD ISOgrk3
240
+ { "oacute", 243 }, // latin small letter o with acute
241
+ { "ocirc", 244 }, // latin small letter o with circumflex
242
+ { "oelig", 339 }, // - latin small ligature oe, U+0153 ISOlat2
243
+ { "ograve", 242 }, // latin small letter o with grave
244
+ { "oline", 8254 }, // overline = spacing overscore
245
+ { "omega", 969 }, // greek small letter omega
246
+ { "omicron", 959 }, // greek small letter omicron, U+03BF NEW
247
+ { "oplus", 8853 }, // circled plus = direct sum
248
+ { "or", 8744 }, // logical or = vee, U+2228 ISOtech
249
+ { "ordf", 170 }, // feminine ordinal indicator, U+00AA ISOnum
250
+ { "ordm", 186 }, // masculine ordinal indicator
251
+ { "oslash", 248 }, // latin small letter o with stroke
252
+ { "otilde", 245 }, // latin small letter o with tilde
253
+ { "otimes", 8855 }, // circled times = vector product
254
+ { "ouml", 246 }, // latin small letter o with diaeresis
255
+ { "para", 182 }, // pilcrow sign = paragraph sign
256
+ { "part", 8706 }, // partial differential, U+2202 ISOtech
257
+ { "permil", 8240 }, // per mille sign, U+2030 ISOtech
258
+ { "perp", 8869 }, // up tack = orthogonal to = perpendicular
259
+ { "phi", 966 }, // greek small letter phi, U+03C6 ISOgrk3
260
+ { "pi", 960 }, // greek small letter pi, U+03C0 ISOgrk3
261
+ { "piv", 982 }, // greek pi symbol, U+03D6 ISOgrk3
262
+ { "plusmn", 177 }, // plus-minus sign = plus-or-minus sign
263
+ { "pound", 163 }, // pound sign, U+00A3 ISOnum
264
+ { "prime", 8242 }, // prime = minutes = feet, U+2032 ISOtech
265
+ { "prod", 8719 }, // n-ary product = product sign
266
+ { "prop", 8733 }, // proportional to, U+221D ISOtech
267
+ { "psi", 968 }, // greek small letter psi, U+03C8 ISOgrk3
268
+ { "quot", 34 }, // -- quotation mark = APL quote
269
+ { "rArr", 8658 }, // rightwards double arrow
270
+ { "radic", 8730 }, // square root = radical sign
271
+ { "rang", 9002 }, // right-pointing angle bracket = ket
272
+ { "raquo", 187 }, // right-pointing double angle quotation mark
273
+ { "rarr", 8594 }, // rightwards arrow, U+2192 ISOnum
274
+ { "rceil", 8969 }, // right ceiling, U+2309 ISOamsc
275
+ { "rdquo", 8221 }, // right double quotation mark
276
+ { "real", 8476 }, // blackletter capital R = real part symbol
277
+ { "reg", 174 }, // registered sign = registered trade mark sign
278
+ { "rfloor", 8971 }, // right floor, U+230B ISOamsc
279
+ { "rho", 961 }, // greek small letter rho, U+03C1 ISOgrk3
280
+ { "rlm", 8207 }, // right-to-left mark, U+200F NEW RFC 2070
281
+ { "rsaquo", 8250 }, // single right-pointing angle quotation mark
282
+ { "rsquo", 8217 }, // right single quotation mark
283
+ { "sbquo", 8218 }, // single low-9 quotation mark, U+201A NEW
284
+ { "scaron", 353 }, // - latin small letter s with caron
285
+ { "sdot", 8901 }, // dot operator, U+22C5 ISOamsb
286
+ { "sect", 167 }, // section sign, U+00A7 ISOnum
287
+ { "shy", 173 }, // soft hyphen = discretionary hyphen
288
+ { "sigma", 963 }, // greek small letter sigma
289
+ { "sigmaf", 962 }, // greek small letter final sigma
290
+ { "sim", 8764 }, // tilde operator = varies with = similar to
291
+ { "spades", 9824 }, // black spade suit, U+2660 ISOpub
292
+ { "sub", 8834 }, // subset of, U+2282 ISOtech
293
+ { "sube", 8838 }, // subset of or equal to, U+2286 ISOtech
294
+ { "sum", 8721 }, // n-ary summation, U+2211 ISOamsb
295
+ { "sup", 8835 }, // superset of, U+2283 ISOtech
296
+ { "sup1", 185 }, // superscript one = superscript digit one
297
+ { "sup2", 178 }, // superscript two = superscript digit two
298
+ { "sup3", 179 }, // superscript three = superscript digit three
299
+ { "supe", 8839 }, // superset of or equal to
300
+ { "szlig", 223 }, // latin small letter sharp s = ess-zed
301
+ { "tau", 964 }, // greek small letter tau, U+03C4 ISOgrk3
302
+ { "there4", 8756 }, // therefore, U+2234 ISOtech
303
+ { "theta", 952 }, // greek small letter theta
304
+ { "thetasym", 977 }, // greek small letter theta symbol
305
+ { "thinsp", 8201 }, // thin space, U+2009 ISOpub
306
+ { "thorn", 254 }, // latin small letter thorn
307
+ { "tilde", 732 }, // - small tilde, U+02DC ISOdia
308
+ { "times", 215 }, // multiplication sign, U+00D7 ISOnum
309
+ { "trade", 8482 }, // trade mark sign, U+2122 ISOnum
310
+ { "uArr", 8657 }, // upwards double arrow, U+21D1 ISOamsa
311
+ { "uacute", 250 }, // latin small letter u with acute
312
+ { "uarr", 8593 }, // upwards arrow, U+2191 ISOnum
313
+ { "ucirc", 251 }, // latin small letter u with circumflex
314
+ { "ugrave", 249 }, // latin small letter u with grave
315
+ { "uml", 168 }, // diaeresis = spacing diaeresis
316
+ { "upsih", 978 }, // greek upsilon with hook symbol
317
+ { "upsilon", 965 }, // greek small letter upsilon
318
+ { "uuml", 252 }, // latin small letter u with diaeresis
319
+ { "weierp", 8472 }, // script capital P = power set
320
+ { "xi", 958 }, // greek small letter xi, U+03BE ISOgrk3
321
+ { "yacute", 253 }, // latin small letter y with acute
322
+ { "yen", 165 }, // yen sign = yuan sign, U+00A5 ISOnum
323
+ { "yuml", 255 }, // latin small letter y with diaeresis
324
+ { "zeta", 950 }, // greek small letter zeta, U+03B6 ISOgrk3
325
+ { "zwj", 8205 }, // zero width joiner, U+200D NEW RFC 2070
326
+ { "zwnj", 8204 }, // zero width non-joiner
327
+ { NULL, 0 },
328
+ };
329
+
330
+ static uint64_t
331
+ calc_hash(const char *key) {
  // Computes the hash used to pick a bucket in the entity cache.
  // Every byte is OR-ed with 0x20 before it is mixed in, so names that differ
  // only in ASCII letter case (e.g. "Dagger" and "dagger") hash to the same
  // value. Returns 0 when key is NULL or the empty string.
332
+ uint64_t h = 0;
333
+
334
+ if (NULL != key) {
335
+ const uint8_t *k = (const uint8_t*)key;
336
+
337
+ for (; 0 != *k; k++) {
338
+ // fold case with | 0x20, then subtract 0x2D so bytes up to 'z' (0x7A) land in 0..0x4D (77), matching the multiplier; bytes below 0x2D give a negative int term that wraps when added to the unsigned h
339
+ h = 77 * h + ((*k | 0x20) - 0x2D);
340
+ }
341
+ }
342
+ return h;
343
+ }
344
+
345
+ static Slot*
346
+ get_bucketp(uint64_t h) {
  // Returns the address of the bucket head for hash h. The hash is XOR-ed with
  // two shifted copies of itself and then masked with BUCKET_MASK, so the
  // resulting index always falls inside entity_cache.buckets.
347
+ return entity_cache.buckets + (BUCKET_MASK & (h ^ (h << 5) ^ (h >> 7)));
348
+ }
349
+
350
+ static void
351
+ cache_set(Slot s) {
  // Links slot s into the entity cache and records its hash in s->hash.
  // The slot becomes the new head of its bucket chain, so a slot inserted
  // later is visited before earlier slots that share the bucket.
  // NOTE(review): h is int64_t although calc_hash() returns uint64_t and
  // s->hash is uint64_t; the value round-trips through a signed type.
352
+ int64_t h = calc_hash(s->key);
353
+ Slot *bucket = get_bucketp(h);
354
+
355
+ s->hash = h;
356
+ s->next = *bucket; // push at the front of the chain
357
+ *bucket = s;
358
+ }
359
+
360
+ static Slot
361
+ cache_get(const char *key) {
  // Finds the slot for entity name key, or returns NULL when there is none.
  // Walks the chain of the bucket selected by calc_hash(key) and returns the
  // first slot whose stored hash equals h and whose name matches key under
  // strcasecmp().
  // NOTE(review): calc_hash() folds letter case, the comparison below ignores
  // case, and cache_set() inserts at the chain head. For table names that
  // differ only by case ("Dagger"/"dagger", "Prime"/"prime", "Alpha"/"alpha",
  // ...) both spellings therefore resolve to whichever entry was inserted
  // last, which is the lowercase one given the order of entities[]. Trying an
  // exact strcmp() match before the case-insensitive one would tell them apart.
362
+ int64_t h = calc_hash(key);
363
+ Slot *bucket = get_bucketp(h);
364
+ Slot s;
365
+
366
+ for (s = *bucket; NULL != s; s = s->next) {
367
+ if (h == (int64_t)s->hash && 0 == strcasecmp(s->key, key)) { // hash compare first, string compare only on a hash match
368
+ return s;
369
+ }
370
+ }
371
+ return NULL;
372
+ }
373
+
374
+ static void
375
+ cache_init() {
  // Builds the entity cache: zeroes entity_cache, inserts every entities[]
  // entry in table order via cache_set(), then sets inited to true.
  // NOTE(review): the empty parentheses leave the parameter list unspecified
  // before C23; (void) is the explicit form.
376
+ Slot e = entities;
377
+
378
+ memset(&entity_cache, 0, sizeof(struct _cache));
379
+ for (; NULL != e->key; e++) { // the { NULL, 0 } entry terminates the table
380
+ cache_set(e);
381
+ }
382
+ inited = true;
383
+ }
384
+
385
+ char*
386
+ ox_entity_lookup(char *text, const char *key) {
  // Translates a named entity into bytes written at text.
  //   text - destination passed through to ox_ucs_to_utf8_chars()
  //   key  - entity name as spelled in entities[] (e.g. "amp")
  // Returns NULL when cache_get() finds no entry for key; otherwise returns
  // whatever ox_ucs_to_utf8_chars(text, code) returns (presumably the position
  // just past the bytes it wrote - confirm against that function).
  // The cache is built on the first call.
  // NOTE(review): inited is a plain static flag tested without any locking
  // here - confirm callers are serialized.
387
+ Slot s = entities; // this initial value is overwritten by cache_get() before s is read
388
+
389
+ if (!inited) {
390
+ cache_init();
391
+ }
392
+ if (NULL == (s = cache_get(key))) {
393
+ return NULL;
394
+ }
395
+ return ox_ucs_to_utf8_chars(text, s->code);
396
+ }
@@ -9,5 +9,6 @@
9
9
  #include <stdint.h>
10
10
 
11
11
  extern char* ox_ucs_to_utf8_chars(char *text, uint64_t u);
12
+ extern char* ox_entity_lookup(char *text, const char *key);
12
13
 
13
14
  #endif /* OX_SPECIAL_H */
@@ -1,5 +1,5 @@
1
1
 
2
2
  module Ox
3
3
  # Current version of the module.
4
- VERSION = '2.12.1'
4
+ VERSION = '2.13.1'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: ox
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.12.1
4
+ version: 2.13.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Peter Ohler
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-01-05 00:00:00.000000000 Z
11
+ date: 2020-01-30 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description: "A fast XML parser and object serializer that uses only standard C lib.\n\nOptimized
14
14
  XML (Ox), as the name implies was written to provide speed optimized\nXML handling.