hpricot 0.8.3-i386-mswin32

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. data/CHANGELOG +104 -0
  2. data/COPYING +18 -0
  3. data/README.md +276 -0
  4. data/Rakefile +234 -0
  5. data/ext/fast_xs/FastXsService.java +1123 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +210 -0
  8. data/ext/hpricot_scan/HpricotCss.java +850 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2099 -0
  10. data/ext/hpricot_scan/extconf.rb +9 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3511 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +120 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +7039 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1161 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +896 -0
  19. data/extras/hpricot.png +0 -0
  20. data/lib/fast_xs.rb +1 -0
  21. data/lib/fast_xs/1.8/fast_xs.so +0 -0
  22. data/lib/fast_xs/1.9/fast_xs.so +0 -0
  23. data/lib/hpricot.rb +26 -0
  24. data/lib/hpricot/blankslate.rb +63 -0
  25. data/lib/hpricot/builder.rb +216 -0
  26. data/lib/hpricot/elements.rb +514 -0
  27. data/lib/hpricot/htmlinfo.rb +691 -0
  28. data/lib/hpricot/inspect.rb +103 -0
  29. data/lib/hpricot/modules.rb +40 -0
  30. data/lib/hpricot/parse.rb +38 -0
  31. data/lib/hpricot/tag.rb +219 -0
  32. data/lib/hpricot/tags.rb +164 -0
  33. data/lib/hpricot/traverse.rb +839 -0
  34. data/lib/hpricot/xchar.rb +94 -0
  35. data/lib/hpricot_scan.rb +1 -0
  36. data/lib/hpricot_scan/1.8/hpricot_scan.so +0 -0
  37. data/lib/hpricot_scan/1.9/hpricot_scan.so +0 -0
  38. data/test/files/basic.xhtml +17 -0
  39. data/test/files/boingboing.html +2266 -0
  40. data/test/files/cy0.html +3653 -0
  41. data/test/files/immob.html +400 -0
  42. data/test/files/pace_application.html +1320 -0
  43. data/test/files/tenderlove.html +16 -0
  44. data/test/files/uswebgen.html +220 -0
  45. data/test/files/utf8.html +1054 -0
  46. data/test/files/week9.html +1723 -0
  47. data/test/files/why.xml +19 -0
  48. data/test/load_files.rb +7 -0
  49. data/test/nokogiri-bench.rb +64 -0
  50. data/test/test_alter.rb +96 -0
  51. data/test/test_builder.rb +37 -0
  52. data/test/test_parser.rb +457 -0
  53. data/test/test_paths.rb +25 -0
  54. data/test/test_preserved.rb +88 -0
  55. data/test/test_xml.rb +28 -0
  56. metadata +128 -0
@@ -0,0 +1,1123 @@
1
+
2
+ import java.io.IOException;
3
+ import java.io.StringWriter;
4
+ import java.io.Writer;
5
+ import java.util.HashMap;
6
+ import java.util.Map;
7
+ import java.util.TreeMap;
8
+ import org.jruby.Ruby;
9
+ import org.jruby.RubyModule;
10
+ import org.jruby.runtime.CallbackFactory;
11
+ import org.jruby.runtime.builtin.IRubyObject;
12
+ import org.jruby.runtime.load.BasicLibraryService;
13
+
14
+ public class FastXsService implements BasicLibraryService {
15
+
16
+ public boolean basicLoad(final Ruby runtime) throws IOException {
17
+ RubyModule string = runtime.getModule("String");
18
+ CallbackFactory fact = runtime.callbackFactory(FastXsService.class);
19
+ string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs"));
20
+ return true;
21
+ }
22
+
23
+ public static IRubyObject fast_xs(IRubyObject recv) {
24
+ String string = recv.convertToString().getUnicodeValue();
25
+ StringWriter writer = new StringWriter ((int)(string.length() * 1.5));
26
+ try {
27
+ Entities.FAST_XS.escape(writer, string);
28
+ return recv.getRuntime().newString(writer.toString());
29
+ } catch (IOException e) {
30
+ throw recv.getRuntime().newIOErrorFromException(e);
31
+ }
32
+ }
33
+ }
34
+
35
+ // From Apache commons-lang,
36
+ // http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup
37
+ /*
38
+ * Licensed to the Apache Software Foundation (ASF) under one or more
39
+ * contributor license agreements. See the NOTICE file distributed with
40
+ * this work for additional information regarding copyright ownership.
41
+ * The ASF licenses this file to You under the Apache License, Version 2.0
42
+ * (the "License"); you may not use this file except in compliance with
43
+ * the License. You may obtain a copy of the License at
44
+ *
45
+ * http://www.apache.org/licenses/LICENSE-2.0
46
+ *
47
+ * Unless required by applicable law or agreed to in writing, software
48
+ * distributed under the License is distributed on an "AS IS" BASIS,
49
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
50
+ * See the License for the specific language governing permissions and
51
+ * limitations under the License.
52
+ */
53
+
54
+ /**
55
+ * <p>
56
+ * Provides HTML and XML entity utilities.
57
+ * </p>
58
+ *
59
+ * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
60
+ * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
61
+ * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
62
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
63
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
64
+ *
65
+ * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
66
+ * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
67
+ * @since 2.0
68
+ * @version $Id$
69
+ */
70
+ class Entities {
71
+
72
+ private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
73
+ {"amp", "38"}, // & - ampersand
74
+ {"lt", "60"}, // < - less-than
75
+ {"gt", "62"}, // > - greater-than
76
+ };
77
+
78
+ private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
79
+ };
80
+
81
+ // package scoped for testing
82
+ static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
83
+ {"iexcl", "161"}, // inverted exclamation mark
84
+ {"cent", "162"}, // cent sign
85
+ {"pound", "163"}, // pound sign
86
+ {"curren", "164"}, // currency sign
87
+ {"yen", "165"}, // yen sign = yuan sign
88
+ {"brvbar", "166"}, // broken bar = broken vertical bar
89
+ {"sect", "167"}, // section sign
90
+ {"uml", "168"}, // diaeresis = spacing diaeresis
91
+ {"copy", "169"}, // © - copyright sign
92
+ {"ordf", "170"}, // feminine ordinal indicator
93
+ {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
94
+ {"not", "172"}, // not sign
95
+ {"shy", "173"}, // soft hyphen = discretionary hyphen
96
+ {"reg", "174"}, // ® - registered trademark sign
97
+ {"macr", "175"}, // macron = spacing macron = overline = APL overbar
98
+ {"deg", "176"}, // degree sign
99
+ {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
100
+ {"sup2", "178"}, // superscript two = superscript digit two = squared
101
+ {"sup3", "179"}, // superscript three = superscript digit three = cubed
102
+ {"acute", "180"}, // acute accent = spacing acute
103
+ {"micro", "181"}, // micro sign
104
+ {"para", "182"}, // pilcrow sign = paragraph sign
105
+ {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
106
+ {"cedil", "184"}, // cedilla = spacing cedilla
107
+ {"sup1", "185"}, // superscript one = superscript digit one
108
+ {"ordm", "186"}, // masculine ordinal indicator
109
+ {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
110
+ {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
111
+ {"frac12", "189"}, // vulgar fraction one half = fraction one half
112
+ {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
113
+ {"iquest", "191"}, // inverted question mark = turned question mark
114
+ {"Agrave", "192"}, // À - uppercase A, grave accent
115
+ {"Aacute", "193"}, // Á - uppercase A, acute accent
116
+ {"Acirc", "194"}, // Â - uppercase A, circumflex accent
117
+ {"Atilde", "195"}, // Ã - uppercase A, tilde
118
+ {"Auml", "196"}, // Ä - uppercase A, umlaut
119
+ {"Aring", "197"}, // Å - uppercase A, ring
120
+ {"AElig", "198"}, // Æ - uppercase AE
121
+ {"Ccedil", "199"}, // Ç - uppercase C, cedilla
122
+ {"Egrave", "200"}, // È - uppercase E, grave accent
123
+ {"Eacute", "201"}, // É - uppercase E, acute accent
124
+ {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
125
+ {"Euml", "203"}, // Ë - uppercase E, umlaut
126
+ {"Igrave", "204"}, // Ì - uppercase I, grave accent
127
+ {"Iacute", "205"}, // Í - uppercase I, acute accent
128
+ {"Icirc", "206"}, // Î - uppercase I, circumflex accent
129
+ {"Iuml", "207"}, // Ï - uppercase I, umlaut
130
+ {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
131
+ {"Ntilde", "209"}, // Ñ - uppercase N, tilde
132
+ {"Ograve", "210"}, // Ò - uppercase O, grave accent
133
+ {"Oacute", "211"}, // Ó - uppercase O, acute accent
134
+ {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
135
+ {"Otilde", "213"}, // Õ - uppercase O, tilde
136
+ {"Ouml", "214"}, // Ö - uppercase O, umlaut
137
+ {"times", "215"}, // multiplication sign
138
+ {"Oslash", "216"}, // Ø - uppercase O, slash
139
+ {"Ugrave", "217"}, // Ù - uppercase U, grave accent
140
+ {"Uacute", "218"}, // Ú - uppercase U, acute accent
141
+ {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
142
+ {"Uuml", "220"}, // Ü - uppercase U, umlaut
143
+ {"Yacute", "221"}, // Ý - uppercase Y, acute accent
144
+ {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
145
+ {"szlig", "223"}, // ß - lowercase sharps, German
146
+ {"agrave", "224"}, // à - lowercase a, grave accent
147
+ {"aacute", "225"}, // á - lowercase a, acute accent
148
+ {"acirc", "226"}, // â - lowercase a, circumflex accent
149
+ {"atilde", "227"}, // ã - lowercase a, tilde
150
+ {"auml", "228"}, // ä - lowercase a, umlaut
151
+ {"aring", "229"}, // å - lowercase a, ring
152
+ {"aelig", "230"}, // æ - lowercase ae
153
+ {"ccedil", "231"}, // ç - lowercase c, cedilla
154
+ {"egrave", "232"}, // è - lowercase e, grave accent
155
+ {"eacute", "233"}, // é - lowercase e, acute accent
156
+ {"ecirc", "234"}, // ê - lowercase e, circumflex accent
157
+ {"euml", "235"}, // ë - lowercase e, umlaut
158
+ {"igrave", "236"}, // ì - lowercase i, grave accent
159
+ {"iacute", "237"}, // í - lowercase i, acute accent
160
+ {"icirc", "238"}, // î - lowercase i, circumflex accent
161
+ {"iuml", "239"}, // ï - lowercase i, umlaut
162
+ {"eth", "240"}, // ð - lowercase eth, Icelandic
163
+ {"ntilde", "241"}, // ñ - lowercase n, tilde
164
+ {"ograve", "242"}, // ò - lowercase o, grave accent
165
+ {"oacute", "243"}, // ó - lowercase o, acute accent
166
+ {"ocirc", "244"}, // ô - lowercase o, circumflex accent
167
+ {"otilde", "245"}, // õ - lowercase o, tilde
168
+ {"ouml", "246"}, // ö - lowercase o, umlaut
169
+ {"divide", "247"}, // division sign
170
+ {"oslash", "248"}, // ø - lowercase o, slash
171
+ {"ugrave", "249"}, // ù - lowercase u, grave accent
172
+ {"uacute", "250"}, // ú - lowercase u, acute accent
173
+ {"ucirc", "251"}, // û - lowercase u, circumflex accent
174
+ {"uuml", "252"}, // ü - lowercase u, umlaut
175
+ {"yacute", "253"}, // ý - lowercase y, acute accent
176
+ {"thorn", "254"}, // þ - lowercase thorn, Icelandic
177
+ {"yuml", "255"}, // ÿ - lowercase y, umlaut
178
+ };
179
+
180
+ // http://www.w3.org/TR/REC-html40/sgml/entities.html
181
+ // package scoped for testing
182
+ static final String[][] HTML40_ARRAY = {
183
+ // <!-- Latin Extended-B -->
184
+ {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
185
+ // <!-- Greek -->
186
+ {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
187
+ {"Beta", "914"}, // greek capital letter beta, U+0392 -->
188
+ {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
189
+ {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
190
+ {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
191
+ {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
192
+ {"Eta", "919"}, // greek capital letter eta, U+0397 -->
193
+ {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
194
+ {"Iota", "921"}, // greek capital letter iota, U+0399 -->
195
+ {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
196
+ {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
197
+ {"Mu", "924"}, // greek capital letter mu, U+039C -->
198
+ {"Nu", "925"}, // greek capital letter nu, U+039D -->
199
+ {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
200
+ {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
201
+ {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
202
+ {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
203
+ // <!-- there is no Sigmaf, and no U+03A2 character either -->
204
+ {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
205
+ {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
206
+ {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
207
+ {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
208
+ {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
209
+ {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
210
+ {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
211
+ {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
212
+ {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
213
+ {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
214
+ {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
215
+ {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
216
+ {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
217
+ {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
218
+ {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
219
+ {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
220
+ {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
221
+ {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
222
+ {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
223
+ {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
224
+ {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
225
+ {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
226
+ {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
227
+ {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
228
+ {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
229
+ {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
230
+ {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
231
+ {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
232
+ {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
233
+ {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
234
+ {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
235
+ {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
236
+ {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
237
+ {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
238
+ {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
239
+ // <!-- General Punctuation -->
240
+ {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
241
+ // <!-- bullet is NOT the same as bullet operator, U+2219 -->
242
+ {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
243
+ {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
244
+ {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
245
+ {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
246
+ {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
247
+ // <!-- Letterlike Symbols -->
248
+ {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
249
+ {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
250
+ {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
251
+ {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
252
+ {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
253
+ // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
254
+ // same glyph could be used to depict both characters -->
255
+ // <!-- Arrows -->
256
+ {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
257
+ {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
258
+ {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
259
+ {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
260
+ {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
261
+ {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
262
+ {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
263
+ // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
264
+ // arrow but also does not have any other character for that function.
265
+ // So ? lArr can be used for 'is implied by' as ISOtech suggests -->
266
+ {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
267
+ {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
268
+ // <!-- ISO 10646 does not say this is the 'implies' character but does not
269
+ // have another character with this function so ?rArr can be used for
270
+ // 'implies' as ISOtech suggests -->
271
+ {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
272
+ {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
273
+ // <!-- Mathematical Operators -->
274
+ {"forall", "8704"}, // for all, U+2200 ISOtech -->
275
+ {"part", "8706"}, // partial differential, U+2202 ISOtech -->
276
+ {"exist", "8707"}, // there exists, U+2203 ISOtech -->
277
+ {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
278
+ {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
279
+ {"isin", "8712"}, // element of, U+2208 ISOtech -->
280
+ {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
281
+ {"ni", "8715"}, // contains as member, U+220B ISOtech -->
282
+ // <!-- should there be a more memorable name than 'ni'? -->
283
+ {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
284
+ // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
285
+ // though the same glyph might be used for both -->
286
+ {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
287
+ // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
288
+ // though the same glyph might be used for both -->
289
+ {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
290
+ {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
291
+ {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
292
+ {"prop", "8733"}, // proportional to, U+221D ISOtech -->
293
+ {"infin", "8734"}, // infinity, U+221E ISOtech -->
294
+ {"ang", "8736"}, // angle, U+2220 ISOamso -->
295
+ {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
296
+ {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
297
+ {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
298
+ {"cup", "8746"}, // union = cup, U+222A ISOtech -->
299
+ {"int", "8747"}, // integral, U+222B ISOtech -->
300
+ {"there4", "8756"}, // therefore, U+2234 ISOtech -->
301
+ {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
302
+ // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
303
+ // the same glyph might be used to represent both -->
304
+ {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
305
+ {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
306
+ {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
307
+ {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
308
+ {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
309
+ {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
310
+ {"sub", "8834"}, // subset of, U+2282 ISOtech -->
311
+ {"sup", "8835"}, // superset of, U+2283 ISOtech -->
312
+ // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
313
+ // Symbol font encoding and is not included. Should it be, for symmetry?
314
+ // It is in ISOamsn -->
315
+ // <!ENTITY nsub "8836"> not a subset of, U+2284 ISOamsn -- NOTE: commented out here, so "nsub" is not part of this table
316
+ {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
317
+ {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
318
+ {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
319
+ {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
320
+ {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
321
+ {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
322
+ // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
323
+ // <!-- Miscellaneous Technical -->
324
+ {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
325
+ {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
326
+ {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
327
+ {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
328
+ {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
329
+ // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
330
+ // mark' -->
331
+ {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
332
+ // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
333
+ // 'single right-pointing angle quotation mark' -->
334
+ // <!-- Geometric Shapes -->
335
+ {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
336
+ // <!-- Miscellaneous Symbols -->
337
+ {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
338
+ // <!-- black here seems to mean filled as opposed to hollow -->
339
+ {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
340
+ {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
341
+ {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
342
+
343
+ // <!-- Latin Extended-A -->
344
+ {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
345
+ {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
346
+ // <!-- ligature is a misnomer, this is a separate character in some languages -->
347
+ {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
348
+ {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
349
+ {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
350
+ // <!-- Spacing Modifier Letters -->
351
+ {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
352
+ {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
353
+ // <!-- General Punctuation -->
354
+ {"ensp", "8194"}, // en space, U+2002 ISOpub -->
355
+ {"emsp", "8195"}, // em space, U+2003 ISOpub -->
356
+ {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
357
+ {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
358
+ {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
359
+ {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
360
+ {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
361
+ {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
362
+ {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
363
+ {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
364
+ {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
365
+ {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
366
+ {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
367
+ {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
368
+ {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
369
+ {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
370
+ {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
371
+ {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
372
+ {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
373
+ // <!-- lsaquo is proposed but not yet ISO standardized -->
374
+ {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
375
+ // <!-- rsaquo is proposed but not yet ISO standardized -->
376
+ {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
377
+ };
378
+
379
+ /**
380
+ * <p>
381
+ * The set of entities supported by standard XML.
382
+ * </p>
383
+ */
384
+ public static final Entities XML;
385
+
386
+ /**
387
+ * <p>
388
+ * The set of entities supported by HTML 3.2.
389
+ * </p>
390
+ */
391
+ public static final Entities HTML32;
392
+
393
+ /**
394
+ * <p>
395
+ * The set of entities supported by HTML 4.0.
396
+ * </p>
397
+ */
398
+ public static final Entities HTML40;
399
+
400
+ /**
401
+ * <p>
402
+ * The set of entities supported by the Ruby fast_xs extension.
403
+ * </p>
404
+ */
405
+ public static final Entities FAST_XS;
406
+
407
// Builds the predefined entity sets. Each constant is assigned a fresh, empty
// instance first and is then populated in place, in declaration order.
static {
    // XML: the basic entities plus the XML-only apostrophe.
    XML = new Entities();
    XML.addEntities(BASIC_ARRAY);
    XML.addEntities(APOS_ARRAY);

    // HTML 3.2: the basic entities plus the ISO 8859-1 table.
    HTML32 = new Entities();
    HTML32.addEntities(BASIC_ARRAY);
    HTML32.addEntities(ISO8859_1_ARRAY);

    // HTML 4.0: everything registered by fillWithHtml40Entities.
    HTML40 = new Entities();
    fillWithHtml40Entities(HTML40);

    // fast_xs: only the basic entities.
    FAST_XS = new Entities();
    FAST_XS.addEntities(BASIC_ARRAY);
}
428
+
429
+ /**
430
+ * <p>
431
+ * Fills the specified entities instance with HTML 40 entities.
432
+ * </p>
433
+ *
434
+ * @param entities
435
+ * the instance to be filled.
436
+ */
437
+ static void fillWithHtml40Entities(Entities entities) {
438
+ entities.addEntities(BASIC_ARRAY);
439
+ entities.addEntities(ISO8859_1_ARRAY);
440
+ entities.addEntities(HTML40_ARRAY);
441
+ }
442
+
443
/**
 * Two-way mapping between entity names and their numeric values.
 */
interface EntityMap {
    /**
     * <p>
     * Registers an entity under the given name and numeric value.
     * </p>
     *
     * @param name
     *            the entity name
     * @param value
     *            the entity value
     */
    void add(String name, int value);

    /**
     * <p>
     * Looks up the entity name registered for a numeric value.
     * </p>
     *
     * @param value
     *            the value to locate
     * @return entity name associated with the specified value
     */
    String name(int value);

    /**
     * <p>
     * Looks up the numeric value registered for an entity name.
     * </p>
     *
     * @param name
     *            the name to locate
     * @return entity value associated with the specified name
     */
    int value(String name);
}
478
+
479
+ // Very limited IntHashMap - it only supports get() and put()
480
+ private static class IntHashMap {
481
+ private transient Entry table[];
482
+
483
+ private transient int count;
484
+
485
+
486
+ private int threshold;
487
+
488
+ private final float loadFactor;
489
+
490
+ private static class Entry {
491
+ final int hash;
492
+ final int key;
493
+ Object value;
494
+ Entry next;
495
+
496
+ protected Entry(int hash, int key, Object value, Entry next) {
497
+ this.hash = hash;
498
+ this.key = key;
499
+ this.value = value;
500
+ this.next = next;
501
+ }
502
+ }
503
+
504
+ public IntHashMap() {
505
+ this.loadFactor = 0.75f;
506
+ table = new Entry[20];
507
+ threshold = (int) (20 * 0.75f);
508
+ }
509
+
510
+ public Object get(int key) {
511
+ Entry tab[] = table;
512
+ int hash = key;
513
+ int index = (hash & 0x7FFFFFFF) % tab.length;
514
+ for (Entry e = tab[index]; e != null; e = e.next) {
515
+ if (e.hash == hash) {
516
+ return e.value;
517
+ }
518
+ }
519
+ return null;
520
+ }
521
+
522
+ protected void rehash() {
523
+ int oldCapacity = table.length;
524
+ Entry oldMap[] = table;
525
+
526
+ int newCapacity = oldCapacity * 2 + 1;
527
+ Entry newMap[] = new Entry[newCapacity];
528
+
529
+ threshold = (int) (newCapacity * loadFactor);
530
+ table = newMap;
531
+
532
+ for (int i = oldCapacity; i-- > 0;) {
533
+ for (Entry old = oldMap[i]; old != null;) {
534
+ Entry e = old;
535
+ old = old.next;
536
+
537
+ int index = (e.hash & 0x7FFFFFFF) % newCapacity;
538
+ e.next = newMap[index];
539
+ newMap[index] = e;
540
+ }
541
+ }
542
+ }
543
+
544
+ public Object put(int key, Object value) {
545
+ // Makes sure the key is not already in the hashtable.
546
+ Entry tab[] = table;
547
+ int hash = key;
548
+ int index = (hash & 0x7FFFFFFF) % tab.length;
549
+ for (Entry e = tab[index]; e != null; e = e.next) {
550
+ if (e.hash == hash) {
551
+ Object old = e.value;
552
+ e.value = value;
553
+ return old;
554
+ }
555
+ }
556
+
557
+ if (count >= threshold) {
558
+ // Rehash the table if the threshold is exceeded
559
+ rehash();
560
+
561
+ tab = table;
562
+ index = (hash & 0x7FFFFFFF) % tab.length;
563
+ }
564
+
565
+ // Creates the new entry.
566
+ Entry e = new Entry(hash, key, value, tab[index]);
567
+ tab[index] = e;
568
+ count++;
569
+ return null;
570
+ }
571
+ }
572
+
573
+ static class PrimitiveEntityMap implements EntityMap {
574
+ private Map mapNameToValue = new HashMap();
575
+
576
+ private IntHashMap mapValueToName = new IntHashMap();
577
+
578
+ /**
579
+ * {@inheritDoc}
580
+ */
581
+ public void add(String name, int value) {
582
+ mapNameToValue.put(name, new Integer(value));
583
+ mapValueToName.put(value, name);
584
+ }
585
+
586
+ /**
587
+ * {@inheritDoc}
588
+ */
589
+ public String name(int value) {
590
+ return (String) mapValueToName.get(value);
591
+ }
592
+
593
+ /**
594
+ * {@inheritDoc}
595
+ */
596
+ public int value(String name) {
597
+ Object value = mapNameToValue.get(name);
598
+ if (value == null) {
599
+ return -1;
600
+ }
601
+ return ((Integer) value).intValue();
602
+ }
603
+ }
604
+
605
+ static abstract class MapIntMap implements Entities.EntityMap {
606
+ protected Map mapNameToValue;
607
+
608
+ protected Map mapValueToName;
609
+
610
+ /**
611
+ * {@inheritDoc}
612
+ */
613
+ public void add(String name, int value) {
614
+ mapNameToValue.put(name, new Integer(value));
615
+ mapValueToName.put(new Integer(value), name);
616
+ }
617
+
618
+ /**
619
+ * {@inheritDoc}
620
+ */
621
+ public String name(int value) {
622
+ return (String) mapValueToName.get(new Integer(value));
623
+ }
624
+
625
+ /**
626
+ * {@inheritDoc}
627
+ */
628
+ public int value(String name) {
629
+ Object value = mapNameToValue.get(name);
630
+ if (value == null) {
631
+ return -1;
632
+ }
633
+ return ((Integer) value).intValue();
634
+ }
635
+ }
636
+
637
+ static class HashEntityMap extends MapIntMap {
638
+ /**
639
+ * Constructs a new instance of <code>HashEntityMap</code>.
640
+ */
641
+ public HashEntityMap() {
642
+ mapNameToValue = new HashMap();
643
+ mapValueToName = new HashMap();
644
+ }
645
+ }
646
+
647
+ static class TreeEntityMap extends MapIntMap {
648
+ /**
649
+ * Constructs a new instance of <code>TreeEntityMap</code>.
650
+ */
651
+ public TreeEntityMap() {
652
+ mapNameToValue = new TreeMap();
653
+ mapValueToName = new TreeMap();
654
+ }
655
+ }
656
+
657
+ static class LookupEntityMap extends PrimitiveEntityMap {
658
+ private String[] lookupTable;
659
+
660
+ private int LOOKUP_TABLE_SIZE = 256;
661
+
662
+ /**
663
+ * {@inheritDoc}
664
+ */
665
+ public String name(int value) {
666
+ if (value < LOOKUP_TABLE_SIZE) {
667
+ return lookupTable()[value];
668
+ }
669
+ return super.name(value);
670
+ }
671
+
672
+ /**
673
+ * <p>
674
+ * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
675
+ * </p>
676
+ *
677
+ * @return the lookup table
678
+ */
679
+ private String[] lookupTable() {
680
+ if (lookupTable == null) {
681
+ createLookupTable();
682
+ }
683
+ return lookupTable;
684
+ }
685
+
686
+ /**
687
+ * <p>
688
+ * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
689
+ * </p>
690
+ */
691
+ private void createLookupTable() {
692
+ lookupTable = new String[LOOKUP_TABLE_SIZE];
693
+ for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
694
+ lookupTable[i] = super.name(i);
695
+ }
696
+ }
697
+ }
698
+
699
+ static class ArrayEntityMap implements EntityMap {
700
+ protected int growBy = 100;
701
+
702
+ protected int size = 0;
703
+
704
+ protected String[] names;
705
+
706
+ protected int[] values;
707
+
708
+ /**
709
+ * Constructs a new instance of <code>ArrayEntityMap</code>.
710
+ */
711
+ public ArrayEntityMap() {
712
+ names = new String[growBy];
713
+ values = new int[growBy];
714
+ }
715
+
716
+ /**
717
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
718
+ * grow.
719
+ *
720
+ * @param growBy
721
+ * array will be initialized to and will grow by this amount
722
+ */
723
+ public ArrayEntityMap(int growBy) {
724
+ this.growBy = growBy;
725
+ names = new String[growBy];
726
+ values = new int[growBy];
727
+ }
728
+
729
+ /**
730
+ * {@inheritDoc}
731
+ */
732
+ public void add(String name, int value) {
733
+ ensureCapacity(size + 1);
734
+ names[size] = name;
735
+ values[size] = value;
736
+ size++;
737
+ }
738
+
739
+ /**
740
+ * Verifies the capacity of the entity array, adjusting the size if necessary.
741
+ *
742
+ * @param capacity
743
+ * size the array should be
744
+ */
745
+ protected void ensureCapacity(int capacity) {
746
+ if (capacity > names.length) {
747
+ int newSize = Math.max(capacity, size + growBy);
748
+ String[] newNames = new String[newSize];
749
+ System.arraycopy(names, 0, newNames, 0, size);
750
+ names = newNames;
751
+ int[] newValues = new int[newSize];
752
+ System.arraycopy(values, 0, newValues, 0, size);
753
+ values = newValues;
754
+ }
755
+ }
756
+
757
+ /**
758
+ * {@inheritDoc}
759
+ */
760
+ public String name(int value) {
761
+ for (int i = 0; i < size; ++i) {
762
+ if (values[i] == value) {
763
+ return names[i];
764
+ }
765
+ }
766
+ return null;
767
+ }
768
+
769
+ /**
770
+ * {@inheritDoc}
771
+ */
772
+ public int value(String name) {
773
+ for (int i = 0; i < size; ++i) {
774
+ if (names[i].equals(name)) {
775
+ return values[i];
776
+ }
777
+ }
778
+ return -1;
779
+ }
780
+ }
781
+
782
+ static class BinaryEntityMap extends ArrayEntityMap {
783
+
784
+ /**
785
+ * Constructs a new instance of <code>BinaryEntityMap</code>.
786
+ */
787
+ public BinaryEntityMap() {
788
+ super();
789
+ }
790
+
791
+ /**
792
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
793
+ * should grow.
794
+ *
795
+ * @param growBy
796
+ * array will be initialized to and will grow by this amount
797
+ */
798
+ public BinaryEntityMap(int growBy) {
799
+ super(growBy);
800
+ }
801
+
802
+ /**
803
+ * Performs a binary search of the entity array for the specified key. This method is based on code in
804
+ * {@link java.util.Arrays}.
805
+ *
806
+ * @param key
807
+ * the key to be found
808
+ * @return the index of the entity array matching the specified key
809
+ */
810
+ private int binarySearch(int key) {
811
+ int low = 0;
812
+ int high = size - 1;
813
+
814
+ while (low <= high) {
815
+ int mid = (low + high) >> 1;
816
+ int midVal = values[mid];
817
+
818
+ if (midVal < key) {
819
+ low = mid + 1;
820
+ } else if (midVal > key) {
821
+ high = mid - 1;
822
+ } else {
823
+ return mid; // key found
824
+ }
825
+ }
826
+ return -(low + 1); // key not found.
827
+ }
828
+
829
+ /**
830
+ * {@inheritDoc}
831
+ */
832
+ public void add(String name, int value) {
833
+ ensureCapacity(size + 1);
834
+ int insertAt = binarySearch(value);
835
+ if (insertAt > 0) {
836
+ return; // note: this means you can't insert the same value twice
837
+ }
838
+ insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
839
+ System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
840
+ values[insertAt] = value;
841
+ System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
842
+ names[insertAt] = name;
843
+ size++;
844
+ }
845
+
846
+ /**
847
+ * {@inheritDoc}
848
+ */
849
+ public String name(int value) {
850
+ int index = binarySearch(value);
851
+ if (index < 0) {
852
+ return null;
853
+ }
854
+ return names[index];
855
+ }
856
+ }
857
+
858
+ // package scoped for testing
859
+ EntityMap map = new Entities.LookupEntityMap();
860
+
861
+ /**
862
+ * <p>
863
+ * Adds entities to this entity.
864
+ * </p>
865
+ *
866
+ * @param entityArray
867
+ * array of entities to be added
868
+ */
869
+ public void addEntities(String[][] entityArray) {
870
+ for (int i = 0; i < entityArray.length; ++i) {
871
+ addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
872
+ }
873
+ }
874
+
875
+ /**
876
+ * <p>
877
+ * Add an entity to this entity.
878
+ * </p>
879
+ *
880
+ * @param name
881
+ * name of the entity
882
+ * @param value
883
+ * vale of the entity
884
+ */
885
+ public void addEntity(String name, int value) {
886
+ map.add(name, value);
887
+ }
888
+
889
+ /**
890
+ * <p>
891
+ * Returns the name of the entity identified by the specified value.
892
+ * </p>
893
+ *
894
+ * @param value
895
+ * the value to locate
896
+ * @return entity name associated with the specified value
897
+ */
898
+ public String entityName(int value) {
899
+ return map.name(value);
900
+ }
901
+
902
+ /**
903
+ * <p>
904
+ * Returns the value of the entity identified by the specified name.
905
+ * </p>
906
+ *
907
+ * @param name
908
+ * the name to locate
909
+ * @return entity value associated with the specified name
910
+ */
911
+ public int entityValue(String name) {
912
+ return map.value(name);
913
+ }
914
+
915
+ /**
916
+ * <p>
917
+ * Escapes the characters in a <code>String</code>.
918
+ * </p>
919
+ *
920
+ * <p>
921
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), escape(&quot;\u00A1&quot;) will return
922
+ * &quot;&amp;foo;&quot;
923
+ * </p>
924
+ *
925
+ * @param str
926
+ * The <code>String</code> to escape.
927
+ * @return A new escaped <code>String</code>.
928
+ */
929
+ public String escape(String str) {
930
+ StringWriter stringWriter = createStringWriter(str);
931
+ try {
932
+ this.escape(stringWriter, str);
933
+ } catch (IOException e) {
934
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
935
+ // throw IOExceptions.
936
+ throw new RuntimeException(e);
937
+ }
938
+ return stringWriter.toString();
939
+ }
940
+
941
+ /**
942
+ * <p>
943
+ * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
944
+ * passed.
945
+ * </p>
946
+ *
947
+ * @param writer
948
+ * The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
949
+ * @param str
950
+ * The <code>String</code> to escape. Assumed to be a non-null value.
951
+ * @throws IOException
952
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
953
+ * methods.
954
+ *
955
+ * @see #escape(String)
956
+ * @see Writer
957
+ */
958
+ public void escape(Writer writer, String str) throws IOException {
959
+ int len = str.length();
960
+ for (int i = 0; i < len; i++) {
961
+ char c = str.charAt(i);
962
+ String entityName = this.entityName(c);
963
+ if (entityName == null) {
964
+ if (c > 0x7F) {
965
+ writer.write("&#");
966
+ writer.write(Integer.toString(c, 10));
967
+ writer.write(';');
968
+ } else {
969
+ writer.write(c);
970
+ }
971
+ } else {
972
+ writer.write('&');
973
+ writer.write(entityName);
974
+ writer.write(';');
975
+ }
976
+ }
977
+ }
978
+
979
+ /**
980
+ * <p>
981
+ * Unescapes the entities in a <code>String</code>.
982
+ * </p>
983
+ *
984
+ * <p>
985
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), unescape(&quot;&amp;foo;&quot;) will return
986
+ * &quot;\u00A1&quot;
987
+ * </p>
988
+ *
989
+ * @param str
990
+ * The <code>String</code> to escape.
991
+ * @return A new escaped <code>String</code>.
992
+ */
993
+ public String unescape(String str) {
994
+ int firstAmp = str.indexOf('&');
995
+ if (firstAmp < 0) {
996
+ return str;
997
+ } else {
998
+ StringWriter stringWriter = createStringWriter(str);
999
+ try {
1000
+ this.doUnescape(stringWriter, str, firstAmp);
1001
+ } catch (IOException e) {
1002
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
1003
+ // do not throw IOExceptions.
1004
+ throw new RuntimeException(e);
1005
+ }
1006
+ return stringWriter.toString();
1007
+ }
1008
+ }
1009
+
1010
+ /**
1011
+ * Make the StringWriter 10% larger than the source String to avoid growing the writer
1012
+ *
1013
+ * @param str The source string
1014
+ * @return A newly created StringWriter
1015
+ */
1016
+ private StringWriter createStringWriter(String str) {
1017
+ return new StringWriter((int) (str.length() + (str.length() * 0.1)));
1018
+ }
1019
+
1020
+ /**
1021
+ * <p>
1022
+ * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
1023
+ * <code>Writer</code> passed.
1024
+ * </p>
1025
+ *
1026
+ * @param writer
1027
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
1028
+ * @param str
1029
+ * The source <code>String</code> to unescape; assumed to be non-null.
1030
+ * @throws IOException
1031
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
1032
+ * methods.
1033
+ *
1034
+ * @see #escape(String)
1035
+ * @see Writer
1036
+ */
1037
+ public void unescape(Writer writer, String str) throws IOException {
1038
+ int firstAmp = str.indexOf('&');
1039
+ if (firstAmp < 0) {
1040
+ writer.write(str);
1041
+ return;
1042
+ } else {
1043
+ doUnescape(writer, str, firstAmp);
1044
+ }
1045
+ }
1046
+
1047
+ /**
1048
+ * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
1049
+ *
1050
+ * @param writer
1051
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
1052
+ * @param str
1053
+ * The source <code>String</code> to unescape; assumed to be non-null.
1054
+ * @param firstAmp
1055
+ * The <code>int</code> index of the first ampersand in the source String.
1056
+ * @throws IOException
1057
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
1058
+ * methods.
1059
+ */
1060
+ private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
1061
+ writer.write(str, 0, firstAmp);
1062
+ int len = str.length();
1063
+ for (int i = firstAmp; i < len; i++) {
1064
+ char c = str.charAt(i);
1065
+ if (c == '&') {
1066
+ int nextIdx = i + 1;
1067
+ int semiColonIdx = str.indexOf(';', nextIdx);
1068
+ if (semiColonIdx == -1) {
1069
+ writer.write(c);
1070
+ continue;
1071
+ }
1072
+ int amphersandIdx = str.indexOf('&', i + 1);
1073
+ if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
1074
+ // Then the text looks like &...&...;
1075
+ writer.write(c);
1076
+ continue;
1077
+ }
1078
+ String entityContent = str.substring(nextIdx, semiColonIdx);
1079
+ int entityValue = -1;
1080
+ int entityContentLen = entityContent.length();
1081
+ if (entityContentLen > 0) {
1082
+ if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
1083
+ // hexidecimal)
1084
+ if (entityContentLen > 1) {
1085
+ char isHexChar = entityContent.charAt(1);
1086
+ try {
1087
+ switch (isHexChar) {
1088
+ case 'X' :
1089
+ case 'x' : {
1090
+ entityValue = Integer.parseInt(entityContent.substring(2), 16);
1091
+ break;
1092
+ }
1093
+ default : {
1094
+ entityValue = Integer.parseInt(entityContent.substring(1), 10);
1095
+ }
1096
+ }
1097
+ if (entityValue > 0xFFFF) {
1098
+ entityValue = -1;
1099
+ }
1100
+ } catch (NumberFormatException e) {
1101
+ entityValue = -1;
1102
+ }
1103
+ }
1104
+ } else { // escaped value content is an entity name
1105
+ entityValue = this.entityValue(entityContent);
1106
+ }
1107
+ }
1108
+
1109
+ if (entityValue == -1) {
1110
+ writer.write('&');
1111
+ writer.write(entityContent);
1112
+ writer.write(';');
1113
+ } else {
1114
+ writer.write(entityValue);
1115
+ }
1116
+ i = semiColonIdx; // move index up to the semi-colon
1117
+ } else {
1118
+ writer.write(c);
1119
+ }
1120
+ }
1121
+ }
1122
+
1123
+ }