hpricot 0.8.2-java

Sign up to get free protection for your applications and to get access to all the features.
Files changed (52) hide show
  1. data/CHANGELOG +88 -0
  2. data/COPYING +18 -0
  3. data/README +275 -0
  4. data/Rakefile +272 -0
  5. data/ext/fast_xs/FastXsService.java +1030 -0
  6. data/ext/fast_xs/extconf.rb +4 -0
  7. data/ext/fast_xs/fast_xs.c +201 -0
  8. data/ext/hpricot_scan/HpricotCss.java +831 -0
  9. data/ext/hpricot_scan/HpricotScanService.java +2086 -0
  10. data/ext/hpricot_scan/extconf.rb +6 -0
  11. data/ext/hpricot_scan/hpricot_common.rl +76 -0
  12. data/ext/hpricot_scan/hpricot_css.c +3503 -0
  13. data/ext/hpricot_scan/hpricot_css.java.rl +155 -0
  14. data/ext/hpricot_scan/hpricot_css.rl +115 -0
  15. data/ext/hpricot_scan/hpricot_scan.c +6927 -0
  16. data/ext/hpricot_scan/hpricot_scan.h +79 -0
  17. data/ext/hpricot_scan/hpricot_scan.java.rl +1152 -0
  18. data/ext/hpricot_scan/hpricot_scan.rl +788 -0
  19. data/extras/mingw-rbconfig.rb +176 -0
  20. data/lib/fast_xs.jar +0 -0
  21. data/lib/hpricot.rb +26 -0
  22. data/lib/hpricot/blankslate.rb +63 -0
  23. data/lib/hpricot/builder.rb +216 -0
  24. data/lib/hpricot/elements.rb +510 -0
  25. data/lib/hpricot/htmlinfo.rb +691 -0
  26. data/lib/hpricot/inspect.rb +103 -0
  27. data/lib/hpricot/modules.rb +40 -0
  28. data/lib/hpricot/parse.rb +38 -0
  29. data/lib/hpricot/tag.rb +219 -0
  30. data/lib/hpricot/tags.rb +164 -0
  31. data/lib/hpricot/traverse.rb +839 -0
  32. data/lib/hpricot/xchar.rb +94 -0
  33. data/lib/hpricot_scan.jar +0 -0
  34. data/test/files/basic.xhtml +17 -0
  35. data/test/files/boingboing.html +2266 -0
  36. data/test/files/cy0.html +3653 -0
  37. data/test/files/immob.html +400 -0
  38. data/test/files/pace_application.html +1320 -0
  39. data/test/files/tenderlove.html +16 -0
  40. data/test/files/uswebgen.html +220 -0
  41. data/test/files/utf8.html +1054 -0
  42. data/test/files/week9.html +1723 -0
  43. data/test/files/why.xml +19 -0
  44. data/test/load_files.rb +7 -0
  45. data/test/nokogiri-bench.rb +64 -0
  46. data/test/test_alter.rb +96 -0
  47. data/test/test_builder.rb +37 -0
  48. data/test/test_parser.rb +428 -0
  49. data/test/test_paths.rb +25 -0
  50. data/test/test_preserved.rb +88 -0
  51. data/test/test_xml.rb +28 -0
  52. metadata +112 -0
@@ -0,0 +1,1030 @@
1
+
2
+ import java.io.IOException;
3
+ import java.io.StringWriter;
4
+ import java.io.Writer;
5
+ import java.util.HashMap;
6
+ import java.util.Map;
7
+ import java.util.TreeMap;
8
+ import org.jruby.Ruby;
9
+ import org.jruby.RubyModule;
10
+ import org.jruby.runtime.CallbackFactory;
11
+ import org.jruby.runtime.builtin.IRubyObject;
12
+ import org.jruby.runtime.load.BasicLibraryService;
13
+ import org.jruby.util.collections.IntHashMap;
14
+
15
+ public class FastXsService implements BasicLibraryService {
16
+
17
+ public boolean basicLoad(final Ruby runtime) throws IOException {
18
+ RubyModule string = runtime.getModule("String");
19
+ CallbackFactory fact = runtime.callbackFactory(FastXsService.class);
20
+ string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs"));
21
+ return true;
22
+ }
23
+
24
+ public static IRubyObject fast_xs(IRubyObject recv) {
25
+ String string = recv.convertToString().getUnicodeValue();
26
+ StringWriter writer = new StringWriter ((int)(string.length() * 1.5));
27
+ try {
28
+ Entities.FAST_XS.escape(writer, string);
29
+ return recv.getRuntime().newString(writer.toString());
30
+ } catch (IOException e) {
31
+ throw recv.getRuntime().newIOErrorFromException(e);
32
+ }
33
+ }
34
+ }
35
+
36
+ // From Apache commons-lang,
37
+ // http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup
38
+ /*
39
+ * Licensed to the Apache Software Foundation (ASF) under one or more
40
+ * contributor license agreements. See the NOTICE file distributed with
41
+ * this work for additional information regarding copyright ownership.
42
+ * The ASF licenses this file to You under the Apache License, Version 2.0
43
+ * (the "License"); you may not use this file except in compliance with
44
+ * the License. You may obtain a copy of the License at
45
+ *
46
+ * http://www.apache.org/licenses/LICENSE-2.0
47
+ *
48
+ * Unless required by applicable law or agreed to in writing, software
49
+ * distributed under the License is distributed on an "AS IS" BASIS,
50
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
51
+ * See the License for the specific language governing permissions and
52
+ * limitations under the License.
53
+ */
54
+
55
+ /**
56
+ * <p>
57
+ * Provides HTML and XML entity utilities.
58
+ * </p>
59
+ *
60
+ * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
61
+ * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
62
+ * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
63
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
64
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
65
+ *
66
+ * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
67
+ * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
68
+ * @since 2.0
69
+ * @version $Id$
70
+ */
71
+ class Entities {
72
+
73
+ private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
74
+ {"amp", "38"}, // & - ampersand
75
+ {"lt", "60"}, // < - less-than
76
+ {"gt", "62"}, // > - greater-than
77
+ };
78
+
79
+ private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
80
+ };
81
+
82
+ // package scoped for testing
83
+ static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
84
+ {"iexcl", "161"}, // inverted exclamation mark
85
+ {"cent", "162"}, // cent sign
86
+ {"pound", "163"}, // pound sign
87
+ {"curren", "164"}, // currency sign
88
+ {"yen", "165"}, // yen sign = yuan sign
89
+ {"brvbar", "166"}, // broken bar = broken vertical bar
90
+ {"sect", "167"}, // section sign
91
+ {"uml", "168"}, // diaeresis = spacing diaeresis
92
+ {"copy", "169"}, // © - copyright sign
93
+ {"ordf", "170"}, // feminine ordinal indicator
94
+ {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
95
+ {"not", "172"}, // not sign
96
+ {"shy", "173"}, // soft hyphen = discretionary hyphen
97
+ {"reg", "174"}, // ® - registered trademark sign
98
+ {"macr", "175"}, // macron = spacing macron = overline = APL overbar
99
+ {"deg", "176"}, // degree sign
100
+ {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
101
+ {"sup2", "178"}, // superscript two = superscript digit two = squared
102
+ {"sup3", "179"}, // superscript three = superscript digit three = cubed
103
+ {"acute", "180"}, // acute accent = spacing acute
104
+ {"micro", "181"}, // micro sign
105
+ {"para", "182"}, // pilcrow sign = paragraph sign
106
+ {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
107
+ {"cedil", "184"}, // cedilla = spacing cedilla
108
+ {"sup1", "185"}, // superscript one = superscript digit one
109
+ {"ordm", "186"}, // masculine ordinal indicator
110
+ {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
111
+ {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
112
+ {"frac12", "189"}, // vulgar fraction one half = fraction one half
113
+ {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
114
+ {"iquest", "191"}, // inverted question mark = turned question mark
115
+ {"Agrave", "192"}, // À - uppercase A, grave accent
116
+ {"Aacute", "193"}, // Á - uppercase A, acute accent
117
+ {"Acirc", "194"}, // Â - uppercase A, circumflex accent
118
+ {"Atilde", "195"}, // Ã - uppercase A, tilde
119
+ {"Auml", "196"}, // Ä - uppercase A, umlaut
120
+ {"Aring", "197"}, // Å - uppercase A, ring
121
+ {"AElig", "198"}, // Æ - uppercase AE
122
+ {"Ccedil", "199"}, // Ç - uppercase C, cedilla
123
+ {"Egrave", "200"}, // È - uppercase E, grave accent
124
+ {"Eacute", "201"}, // É - uppercase E, acute accent
125
+ {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
126
+ {"Euml", "203"}, // Ë - uppercase E, umlaut
127
+ {"Igrave", "204"}, // Ì - uppercase I, grave accent
128
+ {"Iacute", "205"}, // Í - uppercase I, acute accent
129
+ {"Icirc", "206"}, // Î - uppercase I, circumflex accent
130
+ {"Iuml", "207"}, // Ï - uppercase I, umlaut
131
+ {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
132
+ {"Ntilde", "209"}, // Ñ - uppercase N, tilde
133
+ {"Ograve", "210"}, // Ò - uppercase O, grave accent
134
+ {"Oacute", "211"}, // Ó - uppercase O, acute accent
135
+ {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
136
+ {"Otilde", "213"}, // Õ - uppercase O, tilde
137
+ {"Ouml", "214"}, // Ö - uppercase O, umlaut
138
+ {"times", "215"}, // multiplication sign
139
+ {"Oslash", "216"}, // Ø - uppercase O, slash
140
+ {"Ugrave", "217"}, // Ù - uppercase U, grave accent
141
+ {"Uacute", "218"}, // Ú - uppercase U, acute accent
142
+ {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
143
+ {"Uuml", "220"}, // Ü - uppercase U, umlaut
144
+ {"Yacute", "221"}, // Ý - uppercase Y, acute accent
145
+ {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
146
+ {"szlig", "223"}, // ß - lowercase sharps, German
147
+ {"agrave", "224"}, // à - lowercase a, grave accent
148
+ {"aacute", "225"}, // á - lowercase a, acute accent
149
+ {"acirc", "226"}, // â - lowercase a, circumflex accent
150
+ {"atilde", "227"}, // ã - lowercase a, tilde
151
+ {"auml", "228"}, // ä - lowercase a, umlaut
152
+ {"aring", "229"}, // å - lowercase a, ring
153
+ {"aelig", "230"}, // æ - lowercase ae
154
+ {"ccedil", "231"}, // ç - lowercase c, cedilla
155
+ {"egrave", "232"}, // è - lowercase e, grave accent
156
+ {"eacute", "233"}, // é - lowercase e, acute accent
157
+ {"ecirc", "234"}, // ê - lowercase e, circumflex accent
158
+ {"euml", "235"}, // ë - lowercase e, umlaut
159
+ {"igrave", "236"}, // ì - lowercase i, grave accent
160
+ {"iacute", "237"}, // í - lowercase i, acute accent
161
+ {"icirc", "238"}, // î - lowercase i, circumflex accent
162
+ {"iuml", "239"}, // ï - lowercase i, umlaut
163
+ {"eth", "240"}, // ð - lowercase eth, Icelandic
164
+ {"ntilde", "241"}, // ñ - lowercase n, tilde
165
+ {"ograve", "242"}, // ò - lowercase o, grave accent
166
+ {"oacute", "243"}, // ó - lowercase o, acute accent
167
+ {"ocirc", "244"}, // ô - lowercase o, circumflex accent
168
+ {"otilde", "245"}, // õ - lowercase o, tilde
169
+ {"ouml", "246"}, // ö - lowercase o, umlaut
170
+ {"divide", "247"}, // division sign
171
+ {"oslash", "248"}, // ø - lowercase o, slash
172
+ {"ugrave", "249"}, // ù - lowercase u, grave accent
173
+ {"uacute", "250"}, // ú - lowercase u, acute accent
174
+ {"ucirc", "251"}, // û - lowercase u, circumflex accent
175
+ {"uuml", "252"}, // ü - lowercase u, umlaut
176
+ {"yacute", "253"}, // ý - lowercase y, acute accent
177
+ {"thorn", "254"}, // þ - lowercase thorn, Icelandic
178
+ {"yuml", "255"}, // ÿ - lowercase y, umlaut
179
+ };
180
+
181
+ // http://www.w3.org/TR/REC-html40/sgml/entities.html
182
+ // package scoped for testing
183
+ static final String[][] HTML40_ARRAY = {
184
+ // <!-- Latin Extended-B -->
185
+ {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
186
+ // <!-- Greek -->
187
+ {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
188
+ {"Beta", "914"}, // greek capital letter beta, U+0392 -->
189
+ {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
190
+ {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
191
+ {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
192
+ {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
193
+ {"Eta", "919"}, // greek capital letter eta, U+0397 -->
194
+ {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
195
+ {"Iota", "921"}, // greek capital letter iota, U+0399 -->
196
+ {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
197
+ {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
198
+ {"Mu", "924"}, // greek capital letter mu, U+039C -->
199
+ {"Nu", "925"}, // greek capital letter nu, U+039D -->
200
+ {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
201
+ {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
202
+ {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
203
+ {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
204
+ // <!-- there is no Sigmaf, and no U+03A2 character either -->
205
+ {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
206
+ {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
207
+ {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
208
+ {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
209
+ {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
210
+ {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
211
+ {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
212
+ {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
213
+ {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
214
+ {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
215
+ {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
216
+ {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
217
+ {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
218
+ {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
219
+ {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
220
+ {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
221
+ {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
222
+ {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
223
+ {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
224
+ {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
225
+ {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
226
+ {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
227
+ {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
228
+ {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
229
+ {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
230
+ {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
231
+ {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
232
+ {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
233
+ {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
234
+ {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
235
+ {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
236
+ {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
237
+ {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
238
+ {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
239
+ {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
240
+ // <!-- General Punctuation -->
241
+ {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
242
+ // <!-- bullet is NOT the same as bullet operator, U+2219 -->
243
+ {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
244
+ {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
245
+ {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
246
+ {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
247
+ {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
248
+ // <!-- Letterlike Symbols -->
249
+ {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
250
+ {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
251
+ {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
252
+ {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
253
+ {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
254
+ // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
255
+ // same glyph could be used to depict both characters -->
256
+ // <!-- Arrows -->
257
+ {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
258
+ {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
259
+ {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
260
+ {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
261
+ {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
262
+ {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
263
+ {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
264
+ // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
265
+ // arrow but also does not have any other character for that function.
266
+ // So ? lArr canbe used for 'is implied by' as ISOtech suggests -->
267
+ {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
268
+ {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
269
+ // <!-- ISO 10646 does not say this is the 'implies' character but does not
270
+ // have another character with this function so ?rArr can be used for
271
+ // 'implies' as ISOtech suggests -->
272
+ {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
273
+ {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
274
+ // <!-- Mathematical Operators -->
275
+ {"forall", "8704"}, // for all, U+2200 ISOtech -->
276
+ {"part", "8706"}, // partial differential, U+2202 ISOtech -->
277
+ {"exist", "8707"}, // there exists, U+2203 ISOtech -->
278
+ {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
279
+ {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
280
+ {"isin", "8712"}, // element of, U+2208 ISOtech -->
281
+ {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
282
+ {"ni", "8715"}, // contains as member, U+220B ISOtech -->
283
+ // <!-- should there be a more memorable name than 'ni'? -->
284
+ {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
285
+ // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
286
+ // though the same glyph might be used for both -->
287
+ {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
288
+ // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
289
+ // though the same glyph might be used for both -->
290
+ {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
291
+ {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
292
+ {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
293
+ {"prop", "8733"}, // proportional to, U+221D ISOtech -->
294
+ {"infin", "8734"}, // infinity, U+221E ISOtech -->
295
+ {"ang", "8736"}, // angle, U+2220 ISOamso -->
296
+ {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
297
+ {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
298
+ {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
299
+ {"cup", "8746"}, // union = cup, U+222A ISOtech -->
300
+ {"int", "8747"}, // integral, U+222B ISOtech -->
301
+ {"there4", "8756"}, // therefore, U+2234 ISOtech -->
302
+ {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
303
+ // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
304
+ // the same glyph might be used to represent both -->
305
+ {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
306
+ {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
307
+ {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
308
+ {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
309
+ {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
310
+ {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
311
+ {"sub", "8834"}, // subset of, U+2282 ISOtech -->
312
+ {"sup", "8835"}, // superset of, U+2283 ISOtech -->
313
+ // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
314
+ // Symbol font encoding and is not included. Should it be, for symmetry?
315
+ // It is in ISOamsn --> <!ENTITY nsub", "8836"},
316
+ // not a subset of, U+2284 ISOamsn -->
317
+ {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
318
+ {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
319
+ {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
320
+ {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
321
+ {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
322
+ {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
323
+ // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
324
+ // <!-- Miscellaneous Technical -->
325
+ {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
326
+ {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
327
+ {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
328
+ {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
329
+ {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
330
+ // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
331
+ // mark' -->
332
+ {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
333
+ // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
334
+ // 'single right-pointing angle quotation mark' -->
335
+ // <!-- Geometric Shapes -->
336
+ {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
337
+ // <!-- Miscellaneous Symbols -->
338
+ {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
339
+ // <!-- black here seems to mean filled as opposed to hollow -->
340
+ {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
341
+ {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
342
+ {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
343
+
344
+ // <!-- Latin Extended-A -->
345
+ {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
346
+ {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
347
+ // <!-- ligature is a misnomer, this is a separate character in some languages -->
348
+ {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
349
+ {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
350
+ {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
351
+ // <!-- Spacing Modifier Letters -->
352
+ {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
353
+ {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
354
+ // <!-- General Punctuation -->
355
+ {"ensp", "8194"}, // en space, U+2002 ISOpub -->
356
+ {"emsp", "8195"}, // em space, U+2003 ISOpub -->
357
+ {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
358
+ {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
359
+ {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
360
+ {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
361
+ {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
362
+ {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
363
+ {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
364
+ {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
365
+ {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
366
+ {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
367
+ {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
368
+ {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
369
+ {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
370
+ {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
371
+ {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
372
+ {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
373
+ {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
374
+ // <!-- lsaquo is proposed but not yet ISO standardized -->
375
+ {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
376
+ // <!-- rsaquo is proposed but not yet ISO standardized -->
377
+ {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
378
+ };
379
+
380
+ /**
381
+ * <p>
382
+ * The set of entities supported by standard XML.
383
+ * </p>
384
+ */
385
+ public static final Entities XML;
386
+
387
+ /**
388
+ * <p>
389
+ * The set of entities supported by HTML 3.2.
390
+ * </p>
391
+ */
392
+ public static final Entities HTML32;
393
+
394
+ /**
395
+ * <p>
396
+ * The set of entities supported by HTML 4.0.
397
+ * </p>
398
+ */
399
+ public static final Entities HTML40;
400
+
401
+ /**
402
+ * <p>
403
+ * The set of entities supported by the Ruby fast_xs extension.
404
+ * </p>
405
+ */
406
+ public static final Entities FAST_XS;
407
+
408
+ static {
409
+ XML = new Entities();
410
+ XML.addEntities(BASIC_ARRAY);
411
+ XML.addEntities(APOS_ARRAY);
412
+ }
413
+
414
+ static {
415
+ HTML32 = new Entities();
416
+ HTML32.addEntities(BASIC_ARRAY);
417
+ HTML32.addEntities(ISO8859_1_ARRAY);
418
+ }
419
+
420
+ static {
421
+ HTML40 = new Entities();
422
+ fillWithHtml40Entities(HTML40);
423
+ }
424
+
425
+ static {
426
+ FAST_XS = new Entities();
427
+ FAST_XS.addEntities(BASIC_ARRAY);
428
+ }
429
+
430
+ /**
431
+ * <p>
432
+ * Fills the specified entities instance with HTML 40 entities.
433
+ * </p>
434
+ *
435
+ * @param entities
436
+ * the instance to be filled.
437
+ */
438
+ static void fillWithHtml40Entities(Entities entities) {
439
+ entities.addEntities(BASIC_ARRAY);
440
+ entities.addEntities(ISO8859_1_ARRAY);
441
+ entities.addEntities(HTML40_ARRAY);
442
+ }
443
+
444
+ static interface EntityMap {
445
+ /**
446
+ * <p>
447
+ * Add an entry to this entity map.
448
+ * </p>
449
+ *
450
+ * @param name
451
+ * the entity name
452
+ * @param value
453
+ * the entity value
454
+ */
455
+ void add(String name, int value);
456
+
457
+ /**
458
+ * <p>
459
+ * Returns the name of the entity identified by the specified value.
460
+ * </p>
461
+ *
462
+ * @param value
463
+ * the value to locate
464
+ * @return entity name associated with the specified value
465
+ */
466
+ String name(int value);
467
+
468
+ /**
469
+ * <p>
470
+ * Returns the value of the entity identified by the specified name.
471
+ * </p>
472
+ *
473
+ * @param name
474
+ * the name to locate
475
+ * @return entity value associated with the specified name
476
+ */
477
+ int value(String name);
478
+ }
479
+
480
+ static class PrimitiveEntityMap implements EntityMap {
481
+ private Map mapNameToValue = new HashMap();
482
+
483
+ private IntHashMap mapValueToName = new IntHashMap();
484
+
485
+ /**
486
+ * {@inheritDoc}
487
+ */
488
+ public void add(String name, int value) {
489
+ mapNameToValue.put(name, new Integer(value));
490
+ mapValueToName.put(value, name);
491
+ }
492
+
493
+ /**
494
+ * {@inheritDoc}
495
+ */
496
+ public String name(int value) {
497
+ return (String) mapValueToName.get(value);
498
+ }
499
+
500
+ /**
501
+ * {@inheritDoc}
502
+ */
503
+ public int value(String name) {
504
+ Object value = mapNameToValue.get(name);
505
+ if (value == null) {
506
+ return -1;
507
+ }
508
+ return ((Integer) value).intValue();
509
+ }
510
+ }
511
+
512
+ static abstract class MapIntMap implements Entities.EntityMap {
513
+ protected Map mapNameToValue;
514
+
515
+ protected Map mapValueToName;
516
+
517
+ /**
518
+ * {@inheritDoc}
519
+ */
520
+ public void add(String name, int value) {
521
+ mapNameToValue.put(name, new Integer(value));
522
+ mapValueToName.put(new Integer(value), name);
523
+ }
524
+
525
+ /**
526
+ * {@inheritDoc}
527
+ */
528
+ public String name(int value) {
529
+ return (String) mapValueToName.get(new Integer(value));
530
+ }
531
+
532
+ /**
533
+ * {@inheritDoc}
534
+ */
535
+ public int value(String name) {
536
+ Object value = mapNameToValue.get(name);
537
+ if (value == null) {
538
+ return -1;
539
+ }
540
+ return ((Integer) value).intValue();
541
+ }
542
+ }
543
+
544
+ static class HashEntityMap extends MapIntMap {
545
+ /**
546
+ * Constructs a new instance of <code>HashEntityMap</code>.
547
+ */
548
+ public HashEntityMap() {
549
+ mapNameToValue = new HashMap();
550
+ mapValueToName = new HashMap();
551
+ }
552
+ }
553
+
554
+ static class TreeEntityMap extends MapIntMap {
555
+ /**
556
+ * Constructs a new instance of <code>TreeEntityMap</code>.
557
+ */
558
+ public TreeEntityMap() {
559
+ mapNameToValue = new TreeMap();
560
+ mapValueToName = new TreeMap();
561
+ }
562
+ }
563
+
564
+ static class LookupEntityMap extends PrimitiveEntityMap {
565
+ private String[] lookupTable;
566
+
567
+ private int LOOKUP_TABLE_SIZE = 256;
568
+
569
+ /**
570
+ * {@inheritDoc}
571
+ */
572
+ public String name(int value) {
573
+ if (value < LOOKUP_TABLE_SIZE) {
574
+ return lookupTable()[value];
575
+ }
576
+ return super.name(value);
577
+ }
578
+
579
+ /**
580
+ * <p>
581
+ * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
582
+ * </p>
583
+ *
584
+ * @return the lookup table
585
+ */
586
+ private String[] lookupTable() {
587
+ if (lookupTable == null) {
588
+ createLookupTable();
589
+ }
590
+ return lookupTable;
591
+ }
592
+
593
+ /**
594
+ * <p>
595
+ * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
596
+ * </p>
597
+ */
598
+ private void createLookupTable() {
599
+ lookupTable = new String[LOOKUP_TABLE_SIZE];
600
+ for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
601
+ lookupTable[i] = super.name(i);
602
+ }
603
+ }
604
+ }
605
+
606
+ static class ArrayEntityMap implements EntityMap {
607
+ protected int growBy = 100;
608
+
609
+ protected int size = 0;
610
+
611
+ protected String[] names;
612
+
613
+ protected int[] values;
614
+
615
+ /**
616
+ * Constructs a new instance of <code>ArrayEntityMap</code>.
617
+ */
618
+ public ArrayEntityMap() {
619
+ names = new String[growBy];
620
+ values = new int[growBy];
621
+ }
622
+
623
+ /**
624
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
625
+ * grow.
626
+ *
627
+ * @param growBy
628
+ * array will be initialized to and will grow by this amount
629
+ */
630
+ public ArrayEntityMap(int growBy) {
631
+ this.growBy = growBy;
632
+ names = new String[growBy];
633
+ values = new int[growBy];
634
+ }
635
+
636
+ /**
637
+ * {@inheritDoc}
638
+ */
639
+ public void add(String name, int value) {
640
+ ensureCapacity(size + 1);
641
+ names[size] = name;
642
+ values[size] = value;
643
+ size++;
644
+ }
645
+
646
+ /**
647
+ * Verifies the capacity of the entity array, adjusting the size if necessary.
648
+ *
649
+ * @param capacity
650
+ * size the array should be
651
+ */
652
+ protected void ensureCapacity(int capacity) {
653
+ if (capacity > names.length) {
654
+ int newSize = Math.max(capacity, size + growBy);
655
+ String[] newNames = new String[newSize];
656
+ System.arraycopy(names, 0, newNames, 0, size);
657
+ names = newNames;
658
+ int[] newValues = new int[newSize];
659
+ System.arraycopy(values, 0, newValues, 0, size);
660
+ values = newValues;
661
+ }
662
+ }
663
+
664
+ /**
665
+ * {@inheritDoc}
666
+ */
667
+ public String name(int value) {
668
+ for (int i = 0; i < size; ++i) {
669
+ if (values[i] == value) {
670
+ return names[i];
671
+ }
672
+ }
673
+ return null;
674
+ }
675
+
676
+ /**
677
+ * {@inheritDoc}
678
+ */
679
+ public int value(String name) {
680
+ for (int i = 0; i < size; ++i) {
681
+ if (names[i].equals(name)) {
682
+ return values[i];
683
+ }
684
+ }
685
+ return -1;
686
+ }
687
+ }
688
+
689
+ static class BinaryEntityMap extends ArrayEntityMap {
690
+
691
+ /**
692
+ * Constructs a new instance of <code>BinaryEntityMap</code>.
693
+ */
694
+ public BinaryEntityMap() {
695
+ super();
696
+ }
697
+
698
+ /**
699
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
700
+ * should grow.
701
+ *
702
+ * @param growBy
703
+ * array will be initialized to and will grow by this amount
704
+ */
705
+ public BinaryEntityMap(int growBy) {
706
+ super(growBy);
707
+ }
708
+
709
+ /**
710
+ * Performs a binary search of the entity array for the specified key. This method is based on code in
711
+ * {@link java.util.Arrays}.
712
+ *
713
+ * @param key
714
+ * the key to be found
715
+ * @return the index of the entity array matching the specified key
716
+ */
717
+ private int binarySearch(int key) {
718
+ int low = 0;
719
+ int high = size - 1;
720
+
721
+ while (low <= high) {
722
+ int mid = (low + high) >> 1;
723
+ int midVal = values[mid];
724
+
725
+ if (midVal < key) {
726
+ low = mid + 1;
727
+ } else if (midVal > key) {
728
+ high = mid - 1;
729
+ } else {
730
+ return mid; // key found
731
+ }
732
+ }
733
+ return -(low + 1); // key not found.
734
+ }
735
+
736
+ /**
737
+ * {@inheritDoc}
738
+ */
739
+ public void add(String name, int value) {
740
+ ensureCapacity(size + 1);
741
+ int insertAt = binarySearch(value);
742
+ if (insertAt > 0) {
743
+ return; // note: this means you can't insert the same value twice
744
+ }
745
+ insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
746
+ System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
747
+ values[insertAt] = value;
748
+ System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
749
+ names[insertAt] = name;
750
+ size++;
751
+ }
752
+
753
+ /**
754
+ * {@inheritDoc}
755
+ */
756
+ public String name(int value) {
757
+ int index = binarySearch(value);
758
+ if (index < 0) {
759
+ return null;
760
+ }
761
+ return names[index];
762
+ }
763
+ }
764
+
765
+ // Entity store that addEntity, entityName and entityValue delegate to; package scoped for testing.
765
+ EntityMap map = new Entities.LookupEntityMap();
767
+
768
+ /**
769
+ * <p>
770
+ * Adds entities to this entity.
771
+ * </p>
772
+ *
773
+ * @param entityArray
774
+ * array of entities to be added
775
+ */
776
+ public void addEntities(String[][] entityArray) {
777
+ for (int i = 0; i < entityArray.length; ++i) {
778
+ addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
779
+ }
780
+ }
781
+
782
+ /**
783
+ * <p>
784
+ * Add an entity to this entity.
785
+ * </p>
786
+ *
787
+ * @param name
788
+ * name of the entity
789
+ * @param value
790
+ * vale of the entity
791
+ */
792
+ public void addEntity(String name, int value) {
793
+ map.add(name, value);
794
+ }
795
+
796
+ /**
797
+ * <p>
798
+ * Returns the name of the entity identified by the specified value.
799
+ * </p>
800
+ *
801
+ * @param value
802
+ * the value to locate
803
+ * @return entity name associated with the specified value
804
+ */
805
+ public String entityName(int value) {
806
+ return map.name(value);
807
+ }
808
+
809
+ /**
810
+ * <p>
811
+ * Returns the value of the entity identified by the specified name.
812
+ * </p>
813
+ *
814
+ * @param name
815
+ * the name to locate
816
+ * @return entity value associated with the specified name
817
+ */
818
+ public int entityValue(String name) {
819
+ return map.value(name);
820
+ }
821
+
822
+ /**
823
+ * <p>
824
+ * Escapes the characters in a <code>String</code>.
825
+ * </p>
826
+ *
827
+ * <p>
828
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), escape(&quot;\u00A1&quot;) will return
829
+ * &quot;&amp;foo;&quot;
830
+ * </p>
831
+ *
832
+ * @param str
833
+ * The <code>String</code> to escape.
834
+ * @return A new escaped <code>String</code>.
835
+ */
836
+ public String escape(String str) {
837
+ StringWriter stringWriter = createStringWriter(str);
838
+ try {
839
+ this.escape(stringWriter, str);
840
+ } catch (IOException e) {
841
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
842
+ // throw IOExceptions.
843
+ throw new RuntimeException(e);
844
+ }
845
+ return stringWriter.toString();
846
+ }
847
+
848
+ /**
849
+ * <p>
850
+ * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
851
+ * passed.
852
+ * </p>
853
+ *
854
+ * @param writer
855
+ * The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
856
+ * @param str
857
+ * The <code>String</code> to escape. Assumed to be a non-null value.
858
+ * @throws IOException
859
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
860
+ * methods.
861
+ *
862
+ * @see #escape(String)
863
+ * @see Writer
864
+ */
865
+ public void escape(Writer writer, String str) throws IOException {
866
+ int len = str.length();
867
+ for (int i = 0; i < len; i++) {
868
+ char c = str.charAt(i);
869
+ String entityName = this.entityName(c);
870
+ if (entityName == null) {
871
+ if (c > 0x7F) {
872
+ writer.write("&#");
873
+ writer.write(Integer.toString(c, 10));
874
+ writer.write(';');
875
+ } else {
876
+ writer.write(c);
877
+ }
878
+ } else {
879
+ writer.write('&');
880
+ writer.write(entityName);
881
+ writer.write(';');
882
+ }
883
+ }
884
+ }
885
+
886
+ /**
887
+ * <p>
888
+ * Unescapes the entities in a <code>String</code>.
889
+ * </p>
890
+ *
891
+ * <p>
892
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), unescape(&quot;&amp;foo;&quot;) will return
893
+ * &quot;\u00A1&quot;
894
+ * </p>
895
+ *
896
+ * @param str
897
+ * The <code>String</code> to escape.
898
+ * @return A new escaped <code>String</code>.
899
+ */
900
+ public String unescape(String str) {
901
+ int firstAmp = str.indexOf('&');
902
+ if (firstAmp < 0) {
903
+ return str;
904
+ } else {
905
+ StringWriter stringWriter = createStringWriter(str);
906
+ try {
907
+ this.doUnescape(stringWriter, str, firstAmp);
908
+ } catch (IOException e) {
909
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
910
+ // do not throw IOExceptions.
911
+ throw new RuntimeException(e);
912
+ }
913
+ return stringWriter.toString();
914
+ }
915
+ }
916
+
917
+ /**
918
+ * Make the StringWriter 10% larger than the source String to avoid growing the writer
919
+ *
920
+ * @param str The source string
921
+ * @return A newly created StringWriter
922
+ */
923
+ private StringWriter createStringWriter(String str) {
924
+ return new StringWriter((int) (str.length() + (str.length() * 0.1)));
925
+ }
926
+
927
+ /**
928
+ * <p>
929
+ * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
930
+ * <code>Writer</code> passed.
931
+ * </p>
932
+ *
933
+ * @param writer
934
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
935
+ * @param str
936
+ * The source <code>String</code> to unescape; assumed to be non-null.
937
+ * @throws IOException
938
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
939
+ * methods.
940
+ *
941
+ * @see #escape(String)
942
+ * @see Writer
943
+ */
944
+ public void unescape(Writer writer, String str) throws IOException {
945
+ int firstAmp = str.indexOf('&');
946
+ if (firstAmp < 0) {
947
+ writer.write(str);
948
+ return;
949
+ } else {
950
+ doUnescape(writer, str, firstAmp);
951
+ }
952
+ }
953
+
954
+ /**
955
+ * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
956
+ *
957
+ * @param writer
958
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
959
+ * @param str
960
+ * The source <code>String</code> to unescape; assumed to be non-null.
961
+ * @param firstAmp
962
+ * The <code>int</code> index of the first ampersand in the source String.
963
+ * @throws IOException
964
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
965
+ * methods.
966
+ */
967
+ private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
968
+ writer.write(str, 0, firstAmp);
969
+ int len = str.length();
970
+ for (int i = firstAmp; i < len; i++) {
971
+ char c = str.charAt(i);
972
+ if (c == '&') {
973
+ int nextIdx = i + 1;
974
+ int semiColonIdx = str.indexOf(';', nextIdx);
975
+ if (semiColonIdx == -1) {
976
+ writer.write(c);
977
+ continue;
978
+ }
979
+ int amphersandIdx = str.indexOf('&', i + 1);
980
+ if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
981
+ // Then the text looks like &...&...;
982
+ writer.write(c);
983
+ continue;
984
+ }
985
+ String entityContent = str.substring(nextIdx, semiColonIdx);
986
+ int entityValue = -1;
987
+ int entityContentLen = entityContent.length();
988
+ if (entityContentLen > 0) {
989
+ if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
990
+ // hexidecimal)
991
+ if (entityContentLen > 1) {
992
+ char isHexChar = entityContent.charAt(1);
993
+ try {
994
+ switch (isHexChar) {
995
+ case 'X' :
996
+ case 'x' : {
997
+ entityValue = Integer.parseInt(entityContent.substring(2), 16);
998
+ break;
999
+ }
1000
+ default : {
1001
+ entityValue = Integer.parseInt(entityContent.substring(1), 10);
1002
+ }
1003
+ }
1004
+ if (entityValue > 0xFFFF) {
1005
+ entityValue = -1;
1006
+ }
1007
+ } catch (NumberFormatException e) {
1008
+ entityValue = -1;
1009
+ }
1010
+ }
1011
+ } else { // escaped value content is an entity name
1012
+ entityValue = this.entityValue(entityContent);
1013
+ }
1014
+ }
1015
+
1016
+ if (entityValue == -1) {
1017
+ writer.write('&');
1018
+ writer.write(entityContent);
1019
+ writer.write(';');
1020
+ } else {
1021
+ writer.write(entityValue);
1022
+ }
1023
+ i = semiColonIdx; // move index up to the semi-colon
1024
+ } else {
1025
+ writer.write(c);
1026
+ }
1027
+ }
1028
+ }
1029
+
1030
+ }