hpricot 0.6-jruby → 0.6.161-jruby
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +70 -46
- data/ext/fast_xs/FastXsService.java +1018 -0
- data/ext/fast_xs/extconf.rb +4 -0
- data/ext/fast_xs/fast_xs.c +194 -0
- data/ext/hpricot_scan/HpricotScanService.java +951 -760
- data/ext/hpricot_scan/hpricot_scan.c +1984 -1974
- data/ext/hpricot_scan/hpricot_scan.java.java +1300 -0
- data/ext/hpricot_scan/hpricot_scan.java.rl +6 -2
- data/ext/hpricot_scan/hpricot_scan.rl +10 -4
- data/lib/hpricot/builder.rb +20 -5
- data/lib/hpricot/elements.rb +6 -6
- data/lib/hpricot/parse.rb +4 -3
- data/lib/hpricot/tag.rb +6 -2
- data/lib/hpricot/traverse.rb +17 -6
- data/lib/universal-java1.5/fast_xs.jar +0 -0
- data/lib/universal-java1.5/hpricot_scan.jar +0 -0
- data/test/test_alter.rb +12 -0
- data/test/test_builder.rb +13 -0
- data/test/test_parser.rb +7 -0
- data/test/test_paths.rb +9 -0
- metadata +61 -56
- data/lib/i686-linux/hpricot_scan.jar +0 -0
data/Rakefile
CHANGED
@@ -6,19 +6,22 @@ require 'rake/testtask'
|
|
6
6
|
require 'fileutils'
|
7
7
|
include FileUtils
|
8
8
|
|
9
|
+
RbConfig = Config unless defined?(RbConfig)
|
10
|
+
|
9
11
|
NAME = "hpricot"
|
10
12
|
REV = `svn info`[/Revision: (\d+)/, 1] rescue nil
|
11
13
|
VERS = ENV['VERSION'] || "0.6" + (REV ? ".#{REV}" : "")
|
12
14
|
PKG = "#{NAME}-#{VERS}"
|
13
|
-
BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
|
15
|
+
BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp,class}"
|
14
16
|
ARCHLIB = "lib/#{::Config::CONFIG['arch']}"
|
15
|
-
CLEAN.include ["ext/hpricot_scan/#{BIN}", "lib/**/#{BIN}",
|
16
|
-
'
|
17
|
+
CLEAN.include ["ext/hpricot_scan/#{BIN}", "ext/fast_xs/#{BIN}", "lib/**/#{BIN}", ARCHLIB,
|
18
|
+
'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
|
19
|
+
'**/.*.sw?', '*.gem', '.config', 'pkg']
|
17
20
|
RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README', '--inline-source']
|
18
21
|
PKG_FILES = %w(CHANGELOG COPYING README Rakefile) +
|
19
22
|
Dir.glob("{bin,doc,test,lib,extras}/**/*") +
|
20
23
|
Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
|
21
|
-
%w[ext/hpricot_scan/hpricot_scan.c] # needed because it's generated later
|
24
|
+
%w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/HpricotScanService.java] # needed because it's generated later
|
22
25
|
SPEC =
|
23
26
|
Gem::Specification.new do |s|
|
24
27
|
s.name = NAME
|
@@ -66,24 +69,40 @@ Rake::GemPackageTask.new(SPEC) do |p|
|
|
66
69
|
p.gem_spec = SPEC
|
67
70
|
end
|
68
71
|
|
69
|
-
|
70
|
-
ext = "ext
|
71
|
-
ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
|
72
|
-
ext_files = FileList[
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
]
|
72
|
+
# Defines the native build rules for each C extension. For every extension
# name this declares:
#   * a task named after the extension that depends on its Makefile and on
#     the compiled shared object,
#   * a file rule that generates the Makefile by running extconf.rb,
#   * a file rule that runs make (nmake when RUBY_PLATFORM matches win32)
#     and copies the shared object into ARCHLIB.
['hpricot_scan', 'fast_xs'].each do |extension|
  ext = "ext/#{extension}"
  # RbConfig is the supported name for the build configuration; the old
  # top-level Config constant is deprecated and absent from newer Rubies.
  ext_so = "#{ext}/#{extension}.#{RbConfig::CONFIG['DLEXT']}"
  ext_files = FileList[
    "#{ext}/*.c",
    "#{ext}/*.h",
    "#{ext}/*.rl",
    "#{ext}/extconf.rb",
    "#{ext}/Makefile",
    "lib"
  ]

  desc "Builds just the #{extension} extension"
  task extension.to_sym => ["#{ext}/Makefile", ext_so ]

  # The Makefile is regenerated whenever extconf.rb changes.
  file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
    Dir.chdir(ext) do ruby "extconf.rb" end
  end

  # Build inside the extension directory, then publish the binary to ARCHLIB.
  file ext_so => ext_files do
    Dir.chdir(ext) do
      sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make')
    end
    mkdir_p ARCHLIB
    cp ext_so, ARCHLIB
  end
end
|
80
99
|
|
81
100
|
# The "lib" task: when it runs, its action declares a directory rule for "lib".
task("lib") { directory "lib" }
|
84
103
|
|
85
104
|
desc "Compiles the Ruby extension"
|
86
|
-
task :compile => [:hpricot_scan] do
|
105
|
+
task :compile => [:hpricot_scan, :fast_xs] do
|
87
106
|
if Dir.glob(File.join(ARCHLIB,"hpricot_scan.*")).length == 0
|
88
107
|
STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
|
89
108
|
STDERR.puts "Gem actually failed to build. Your system is"
|
@@ -94,21 +113,6 @@ task :compile => [:hpricot_scan] do
|
|
94
113
|
end
|
95
114
|
task :hpricot_scan => [:ragel]
|
96
115
|
|
97
|
-
desc "Builds just the #{extension} extension"
|
98
|
-
task extension.to_sym => ["#{ext}/Makefile", ext_so ]
|
99
|
-
|
100
|
-
file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
|
101
|
-
Dir.chdir(ext) do ruby "extconf.rb" end
|
102
|
-
end
|
103
|
-
|
104
|
-
file ext_so => ext_files do
|
105
|
-
Dir.chdir(ext) do
|
106
|
-
sh(PLATFORM =~ /win32/ ? 'nmake' : 'make')
|
107
|
-
end
|
108
|
-
mkdir_p ARCHLIB
|
109
|
-
cp ext_so, ARCHLIB
|
110
|
-
end
|
111
|
-
|
112
116
|
desc "returns the ragel version"
|
113
117
|
task :ragel_version do
|
114
118
|
@ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
|
@@ -127,8 +131,8 @@ end
|
|
127
131
|
### Win32 Packages ###
|
128
132
|
|
129
133
|
Win32Spec = SPEC.dup
|
130
|
-
Win32Spec.platform = Gem::Platform::
|
131
|
-
Win32Spec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.so"]
|
134
|
+
Win32Spec.platform = Gem::Platform::CURRENT
|
135
|
+
Win32Spec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.so", "#{ARCHLIB}/fast_xs.so"]
|
132
136
|
Win32Spec.extensions = []
|
133
137
|
|
134
138
|
WIN32_PKG_DIR = "#{PKG}-mswin32"
|
@@ -160,19 +164,44 @@ CLEAN.include WIN32_PKG_DIR
|
|
160
164
|
|
161
165
|
### JRuby Packages ###
|
162
166
|
|
163
|
-
|
164
|
-
|
165
|
-
|
167
|
+
# Builds the "-cp <classpath>" command-line argument used when compiling the
# Java extensions.
#
# The classpath is looked up in this order:
#   1. the java.class.path system property, when 'java' can be required
#      (i.e. already running in a JRuby JVM),
#   2. the JRUBY_PARENT_CLASSPATH environment variable,
#   3. every jar matched by $JRUBY_HOME/lib/*.jar, joined with
#      File::PATH_SEPARATOR.
# Empty environment values are treated as if they were not set.
#
# Returns "-cp <classpath>" or, when no non-empty classpath was found, "".
# NOTE(review): the classpath is not shell-quoted, so entries containing
# spaces will still be split by the shell.
def java_classpath_arg
  classpath = begin
    require 'java'
    # Already running in a JRuby JVM
    Java::java.lang.System.getProperty('java.class.path')
  rescue LoadError
    parent_classpath = ENV['JRUBY_PARENT_CLASSPATH'].to_s
    jruby_home = ENV['JRUBY_HOME'].to_s
    if !parent_classpath.empty?
      parent_classpath
    elsif !jruby_home.empty?
      FileList["#{jruby_home}/lib/*.jar"].join(File::PATH_SEPARATOR)
    end
  end
  # A dangling "-cp" would make javac consume the following argument (the
  # source file name) as the classpath, so emit nothing for an empty value.
  classpath.to_s.empty? ? "" : "-cp #{classpath}"
end
|
178
|
+
|
179
|
+
# Compiles +filename+ with javac (source and target level 1.4, plus the
# argument returned by java_classpath_arg), then runs jar to create
# +jarname+ from the *.class files. Both commands go through +sh+, so they
# run relative to the current working directory.
def compile_java(filename, jarname)
  javac_command = %{javac -source 1.4 -target 1.4 #{java_classpath_arg} #{filename}}
  sh javac_command

  jar_command = %{jar cf #{jarname} *.class}
  sh jar_command
end
|
167
183
|
|
168
|
-
desc "Compiles the JRuby extension"
|
169
184
|
task :hpricot_scan_java => [:ragel_java] do
|
170
|
-
Dir.chdir
|
185
|
+
Dir.chdir "ext/hpricot_scan" do
|
186
|
+
compile_java("HpricotScanService.java", "hpricot_scan.jar")
|
187
|
+
end
|
188
|
+
end
|
189
|
+
|
190
|
+
# Compiles FastXsService.java from within ext/fast_xs and packages it as
# fast_xs.jar in that same directory.
task :fast_xs_java do
  Dir.chdir("ext/fast_xs") { compile_java("FastXsService.java", "fast_xs.jar") }
end
|
195
|
+
|
196
|
+
desc "Compiles the JRuby extensions"
task :hpricot_java => [:hpricot_scan_java, :fast_xs_java] do
  # Make sure the architecture-specific lib directory exists, then move each
  # freshly built jar out of its ext/ directory into it.
  mkdir_p ARCHLIB
  ['hpricot_scan', 'fast_xs'].each do |name|
    mv "ext/#{name}/#{name}.jar", ARCHLIB
  end
end
|
172
201
|
|
173
202
|
JRubySpec = SPEC.dup
|
174
203
|
JRubySpec.platform = 'jruby'
|
175
|
-
JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar"]
|
204
|
+
JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar", "#{ARCHLIB}/fast_xs.jar"]
|
176
205
|
JRubySpec.extensions = []
|
177
206
|
|
178
207
|
JRUBY_PKG_DIR = "#{PKG}-jruby"
|
@@ -183,15 +212,10 @@ file JRUBY_PKG_DIR => [:ragel_java, :package] do
|
|
183
212
|
mv PKG, JRUBY_PKG_DIR
|
184
213
|
end
|
185
214
|
|
186
|
-
desc "Cross-compile the hpricot_scan extension for JRuby"
|
187
|
-
file "hpricot_scan_jruby" => [JRUBY_PKG_DIR] do
|
188
|
-
Dir.chdir("#{JRUBY_PKG_DIR}/ext/hpricot_scan", &compile_java)
|
189
|
-
mv "#{JRUBY_PKG_DIR}/ext/hpricot_scan/hpricot_scan.jar", "#{JRUBY_PKG_DIR}/#{ARCHLIB}"
|
190
|
-
end
|
191
|
-
|
192
215
|
desc "Build the RubyGems package for JRuby"
|
193
|
-
task :package_jruby =>
|
216
|
+
task :package_jruby => JRUBY_PKG_DIR do
|
194
217
|
Dir.chdir("#{JRUBY_PKG_DIR}") do
|
218
|
+
Rake::Task[:hpricot_java].invoke
|
195
219
|
Gem::Builder.new(JRubySpec).build
|
196
220
|
verbose(true) {
|
197
221
|
mv Dir["*.gem"].first, "../pkg/#{JRUBY_PKG_DIR}.gem"
|
@@ -0,0 +1,1018 @@
|
|
1
|
+
|
2
|
+
import java.io.IOException;
|
3
|
+
import java.io.StringWriter;
|
4
|
+
import java.io.Writer;
|
5
|
+
import java.util.HashMap;
|
6
|
+
import java.util.Map;
|
7
|
+
import java.util.TreeMap;
|
8
|
+
import org.jruby.Ruby;
|
9
|
+
import org.jruby.RubyModule;
|
10
|
+
import org.jruby.runtime.CallbackFactory;
|
11
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
12
|
+
import org.jruby.runtime.load.BasicLibraryService;
|
13
|
+
import org.jruby.util.collections.IntHashMap;
|
14
|
+
|
15
|
+
public class FastXsService implements BasicLibraryService {
|
16
|
+
|
17
|
+
public boolean basicLoad(final Ruby runtime) throws IOException {
|
18
|
+
RubyModule string = runtime.getModule("String");
|
19
|
+
CallbackFactory fact = runtime.callbackFactory(FastXsService.class);
|
20
|
+
string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs"));
|
21
|
+
return true;
|
22
|
+
}
|
23
|
+
|
24
|
+
public static IRubyObject fast_xs(IRubyObject recv) {
|
25
|
+
String string = recv.convertToString().getUnicodeValue();
|
26
|
+
StringWriter writer = new StringWriter ((int)(string.length() * 1.5));
|
27
|
+
try {
|
28
|
+
Entities.HTML40.escape(writer, string);
|
29
|
+
return recv.getRuntime().newString(writer.toString());
|
30
|
+
} catch (IOException e) {
|
31
|
+
throw recv.getRuntime().newIOErrorFromException(e);
|
32
|
+
}
|
33
|
+
}
|
34
|
+
}
|
35
|
+
|
36
|
+
// From Apache commons-lang,
|
37
|
+
// http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup
|
38
|
+
/*
|
39
|
+
* Licensed to the Apache Software Foundation (ASF) under one or more
|
40
|
+
* contributor license agreements. See the NOTICE file distributed with
|
41
|
+
* this work for additional information regarding copyright ownership.
|
42
|
+
* The ASF licenses this file to You under the Apache License, Version 2.0
|
43
|
+
* (the "License"); you may not use this file except in compliance with
|
44
|
+
* the License. You may obtain a copy of the License at
|
45
|
+
*
|
46
|
+
* http://www.apache.org/licenses/LICENSE-2.0
|
47
|
+
*
|
48
|
+
* Unless required by applicable law or agreed to in writing, software
|
49
|
+
* distributed under the License is distributed on an "AS IS" BASIS,
|
50
|
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
51
|
+
* See the License for the specific language governing permissions and
|
52
|
+
* limitations under the License.
|
53
|
+
*/
|
54
|
+
|
55
|
+
/**
|
56
|
+
* <p>
|
57
|
+
* Provides HTML and XML entity utilities.
|
58
|
+
* </p>
|
59
|
+
*
|
60
|
+
* @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
|
61
|
+
* @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
|
62
|
+
* @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
|
63
|
+
* @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
|
64
|
+
* @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
|
65
|
+
*
|
66
|
+
* @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
|
67
|
+
* @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
|
68
|
+
* @since 2.0
|
69
|
+
* @version $Id$
|
70
|
+
*/
|
71
|
+
class Entities {
|
72
|
+
|
73
|
+
private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
|
74
|
+
{"amp", "38"}, // & - ampersand
|
75
|
+
{"lt", "60"}, // < - less-than
|
76
|
+
{"gt", "62"}, // > - greater-than
|
77
|
+
};
|
78
|
+
|
79
|
+
private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
|
80
|
+
};
|
81
|
+
|
82
|
+
// package scoped for testing
|
83
|
+
static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
|
84
|
+
{"iexcl", "161"}, // inverted exclamation mark
|
85
|
+
{"cent", "162"}, // cent sign
|
86
|
+
{"pound", "163"}, // pound sign
|
87
|
+
{"curren", "164"}, // currency sign
|
88
|
+
{"yen", "165"}, // yen sign = yuan sign
|
89
|
+
{"brvbar", "166"}, // broken bar = broken vertical bar
|
90
|
+
{"sect", "167"}, // section sign
|
91
|
+
{"uml", "168"}, // diaeresis = spacing diaeresis
|
92
|
+
{"copy", "169"}, // © - copyright sign
|
93
|
+
{"ordf", "170"}, // feminine ordinal indicator
|
94
|
+
{"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
|
95
|
+
{"not", "172"}, // not sign
|
96
|
+
{"shy", "173"}, // soft hyphen = discretionary hyphen
|
97
|
+
{"reg", "174"}, // ® - registered trademark sign
|
98
|
+
{"macr", "175"}, // macron = spacing macron = overline = APL overbar
|
99
|
+
{"deg", "176"}, // degree sign
|
100
|
+
{"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
|
101
|
+
{"sup2", "178"}, // superscript two = superscript digit two = squared
|
102
|
+
{"sup3", "179"}, // superscript three = superscript digit three = cubed
|
103
|
+
{"acute", "180"}, // acute accent = spacing acute
|
104
|
+
{"micro", "181"}, // micro sign
|
105
|
+
{"para", "182"}, // pilcrow sign = paragraph sign
|
106
|
+
{"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
|
107
|
+
{"cedil", "184"}, // cedilla = spacing cedilla
|
108
|
+
{"sup1", "185"}, // superscript one = superscript digit one
|
109
|
+
{"ordm", "186"}, // masculine ordinal indicator
|
110
|
+
{"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
|
111
|
+
{"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
|
112
|
+
{"frac12", "189"}, // vulgar fraction one half = fraction one half
|
113
|
+
{"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
|
114
|
+
{"iquest", "191"}, // inverted question mark = turned question mark
|
115
|
+
{"Agrave", "192"}, // À - uppercase A, grave accent
|
116
|
+
{"Aacute", "193"}, // Á - uppercase A, acute accent
|
117
|
+
{"Acirc", "194"}, // Â - uppercase A, circumflex accent
|
118
|
+
{"Atilde", "195"}, // Ã - uppercase A, tilde
|
119
|
+
{"Auml", "196"}, // Ä - uppercase A, umlaut
|
120
|
+
{"Aring", "197"}, // Å - uppercase A, ring
|
121
|
+
{"AElig", "198"}, // Æ - uppercase AE
|
122
|
+
{"Ccedil", "199"}, // Ç - uppercase C, cedilla
|
123
|
+
{"Egrave", "200"}, // È - uppercase E, grave accent
|
124
|
+
{"Eacute", "201"}, // É - uppercase E, acute accent
|
125
|
+
{"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
|
126
|
+
{"Euml", "203"}, // Ë - uppercase E, umlaut
|
127
|
+
{"Igrave", "204"}, // Ì - uppercase I, grave accent
|
128
|
+
{"Iacute", "205"}, // Í - uppercase I, acute accent
|
129
|
+
{"Icirc", "206"}, // Î - uppercase I, circumflex accent
|
130
|
+
{"Iuml", "207"}, // Ï - uppercase I, umlaut
|
131
|
+
{"ETH", "208"}, // Ð - uppercase Eth, Icelandic
|
132
|
+
{"Ntilde", "209"}, // Ñ - uppercase N, tilde
|
133
|
+
{"Ograve", "210"}, // Ò - uppercase O, grave accent
|
134
|
+
{"Oacute", "211"}, // Ó - uppercase O, acute accent
|
135
|
+
{"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
|
136
|
+
{"Otilde", "213"}, // Õ - uppercase O, tilde
|
137
|
+
{"Ouml", "214"}, // Ö - uppercase O, umlaut
|
138
|
+
{"times", "215"}, // multiplication sign
|
139
|
+
{"Oslash", "216"}, // Ø - uppercase O, slash
|
140
|
+
{"Ugrave", "217"}, // Ù - uppercase U, grave accent
|
141
|
+
{"Uacute", "218"}, // Ú - uppercase U, acute accent
|
142
|
+
{"Ucirc", "219"}, // Û - uppercase U, circumflex accent
|
143
|
+
{"Uuml", "220"}, // Ü - uppercase U, umlaut
|
144
|
+
{"Yacute", "221"}, // Ý - uppercase Y, acute accent
|
145
|
+
{"THORN", "222"}, // Þ - uppercase THORN, Icelandic
|
146
|
+
{"szlig", "223"}, // ß - lowercase sharps, German
|
147
|
+
{"agrave", "224"}, // à - lowercase a, grave accent
|
148
|
+
{"aacute", "225"}, // á - lowercase a, acute accent
|
149
|
+
{"acirc", "226"}, // â - lowercase a, circumflex accent
|
150
|
+
{"atilde", "227"}, // ã - lowercase a, tilde
|
151
|
+
{"auml", "228"}, // ä - lowercase a, umlaut
|
152
|
+
{"aring", "229"}, // å - lowercase a, ring
|
153
|
+
{"aelig", "230"}, // æ - lowercase ae
|
154
|
+
{"ccedil", "231"}, // ç - lowercase c, cedilla
|
155
|
+
{"egrave", "232"}, // è - lowercase e, grave accent
|
156
|
+
{"eacute", "233"}, // é - lowercase e, acute accent
|
157
|
+
{"ecirc", "234"}, // ê - lowercase e, circumflex accent
|
158
|
+
{"euml", "235"}, // ë - lowercase e, umlaut
|
159
|
+
{"igrave", "236"}, // ì - lowercase i, grave accent
|
160
|
+
{"iacute", "237"}, // í - lowercase i, acute accent
|
161
|
+
{"icirc", "238"}, // î - lowercase i, circumflex accent
|
162
|
+
{"iuml", "239"}, // ï - lowercase i, umlaut
|
163
|
+
{"eth", "240"}, // ð - lowercase eth, Icelandic
|
164
|
+
{"ntilde", "241"}, // ñ - lowercase n, tilde
|
165
|
+
{"ograve", "242"}, // ò - lowercase o, grave accent
|
166
|
+
{"oacute", "243"}, // ó - lowercase o, acute accent
|
167
|
+
{"ocirc", "244"}, // ô - lowercase o, circumflex accent
|
168
|
+
{"otilde", "245"}, // õ - lowercase o, tilde
|
169
|
+
{"ouml", "246"}, // ö - lowercase o, umlaut
|
170
|
+
{"divide", "247"}, // division sign
|
171
|
+
{"oslash", "248"}, // ø - lowercase o, slash
|
172
|
+
{"ugrave", "249"}, // ù - lowercase u, grave accent
|
173
|
+
{"uacute", "250"}, // ú - lowercase u, acute accent
|
174
|
+
{"ucirc", "251"}, // û - lowercase u, circumflex accent
|
175
|
+
{"uuml", "252"}, // ü - lowercase u, umlaut
|
176
|
+
{"yacute", "253"}, // ý - lowercase y, acute accent
|
177
|
+
{"thorn", "254"}, // þ - lowercase thorn, Icelandic
|
178
|
+
{"yuml", "255"}, // ÿ - lowercase y, umlaut
|
179
|
+
};
|
180
|
+
|
181
|
+
// http://www.w3.org/TR/REC-html40/sgml/entities.html
|
182
|
+
// package scoped for testing
|
183
|
+
static final String[][] HTML40_ARRAY = {
|
184
|
+
// <!-- Latin Extended-B -->
|
185
|
+
{"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
|
186
|
+
// <!-- Greek -->
|
187
|
+
{"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
|
188
|
+
{"Beta", "914"}, // greek capital letter beta, U+0392 -->
|
189
|
+
{"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
|
190
|
+
{"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
|
191
|
+
{"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
|
192
|
+
{"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
|
193
|
+
{"Eta", "919"}, // greek capital letter eta, U+0397 -->
|
194
|
+
{"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
|
195
|
+
{"Iota", "921"}, // greek capital letter iota, U+0399 -->
|
196
|
+
{"Kappa", "922"}, // greek capital letter kappa, U+039A -->
|
197
|
+
{"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
|
198
|
+
{"Mu", "924"}, // greek capital letter mu, U+039C -->
|
199
|
+
{"Nu", "925"}, // greek capital letter nu, U+039D -->
|
200
|
+
{"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
|
201
|
+
{"Omicron", "927"}, // greek capital letter omicron, U+039F -->
|
202
|
+
{"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
|
203
|
+
{"Rho", "929"}, // greek capital letter rho, U+03A1 -->
|
204
|
+
// <!-- there is no Sigmaf, and no U+03A2 character either -->
|
205
|
+
{"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
|
206
|
+
{"Tau", "932"}, // greek capital letter tau, U+03A4 -->
|
207
|
+
{"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
|
208
|
+
{"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
|
209
|
+
{"Chi", "935"}, // greek capital letter chi, U+03A7 -->
|
210
|
+
{"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
|
211
|
+
{"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
|
212
|
+
{"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
|
213
|
+
{"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
|
214
|
+
{"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
|
215
|
+
{"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
|
216
|
+
{"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
|
217
|
+
{"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
|
218
|
+
{"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
|
219
|
+
{"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
|
220
|
+
{"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
|
221
|
+
{"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
|
222
|
+
{"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
|
223
|
+
{"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
|
224
|
+
{"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
|
225
|
+
{"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
|
226
|
+
{"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
|
227
|
+
{"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
|
228
|
+
{"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
|
229
|
+
{"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
|
230
|
+
{"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
|
231
|
+
{"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
|
232
|
+
{"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
|
233
|
+
{"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
|
234
|
+
{"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
|
235
|
+
{"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
|
236
|
+
{"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
|
237
|
+
{"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
|
238
|
+
{"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
|
239
|
+
{"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
|
240
|
+
// <!-- General Punctuation -->
|
241
|
+
{"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
|
242
|
+
// <!-- bullet is NOT the same as bullet operator, U+2219 -->
|
243
|
+
{"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
|
244
|
+
{"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
|
245
|
+
{"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
|
246
|
+
{"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
|
247
|
+
{"frasl", "8260"}, // fraction slash, U+2044 NEW -->
|
248
|
+
// <!-- Letterlike Symbols -->
|
249
|
+
{"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
|
250
|
+
{"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
|
251
|
+
{"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
|
252
|
+
{"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
|
253
|
+
{"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
|
254
|
+
// <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
|
255
|
+
// same glyph could be used to depict both characters -->
|
256
|
+
// <!-- Arrows -->
|
257
|
+
{"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
|
258
|
+
{"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
|
259
|
+
{"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
|
260
|
+
{"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
|
261
|
+
{"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
|
262
|
+
{"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
|
263
|
+
{"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
|
264
|
+
// <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
|
265
|
+
// arrow but also does not have any other character for that function.
|
266
|
+
// So ? lArr can be used for 'is implied by' as ISOtech suggests -->
|
267
|
+
{"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
|
268
|
+
{"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
|
269
|
+
// <!-- ISO 10646 does not say this is the 'implies' character but does not
|
270
|
+
// have another character with this function so ? rArr can be used for
|
271
|
+
// 'implies' as ISOtech suggests -->
|
272
|
+
{"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
|
273
|
+
{"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
|
274
|
+
// <!-- Mathematical Operators -->
|
275
|
+
{"forall", "8704"}, // for all, U+2200 ISOtech -->
|
276
|
+
{"part", "8706"}, // partial differential, U+2202 ISOtech -->
|
277
|
+
{"exist", "8707"}, // there exists, U+2203 ISOtech -->
|
278
|
+
{"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
|
279
|
+
{"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
|
280
|
+
{"isin", "8712"}, // element of, U+2208 ISOtech -->
|
281
|
+
{"notin", "8713"}, // not an element of, U+2209 ISOtech -->
|
282
|
+
{"ni", "8715"}, // contains as member, U+220B ISOtech -->
|
283
|
+
// <!-- should there be a more memorable name than 'ni'? -->
|
284
|
+
{"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
|
285
|
+
// <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
|
286
|
+
// though the same glyph might be used for both -->
|
287
|
+
{"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
|
288
|
+
// <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
|
289
|
+
// though the same glyph might be used for both -->
|
290
|
+
{"minus", "8722"}, // minus sign, U+2212 ISOtech -->
|
291
|
+
{"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
|
292
|
+
{"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
|
293
|
+
{"prop", "8733"}, // proportional to, U+221D ISOtech -->
|
294
|
+
{"infin", "8734"}, // infinity, U+221E ISOtech -->
|
295
|
+
{"ang", "8736"}, // angle, U+2220 ISOamso -->
|
296
|
+
{"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
|
297
|
+
{"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
|
298
|
+
{"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
|
299
|
+
{"cup", "8746"}, // union = cup, U+222A ISOtech -->
|
300
|
+
{"int", "8747"}, // integral, U+222B ISOtech -->
|
301
|
+
{"there4", "8756"}, // therefore, U+2234 ISOtech -->
|
302
|
+
{"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
|
303
|
+
// <!-- tilde operator is NOT the same character as the tilde, U+007E,although
|
304
|
+
// the same glyph might be used to represent both -->
|
305
|
+
{"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
|
306
|
+
{"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
|
307
|
+
{"ne", "8800"}, // not equal to, U+2260 ISOtech -->
|
308
|
+
{"equiv", "8801"}, // identical to, U+2261 ISOtech -->
|
309
|
+
{"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
|
310
|
+
{"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
|
311
|
+
{"sub", "8834"}, // subset of, U+2282 ISOtech -->
|
312
|
+
{"sup", "8835"}, // superset of, U+2283 ISOtech -->
|
313
|
+
// <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
|
314
|
+
// Symbol font encoding and is not included. Should it be, for symmetry?
|
315
|
+
// It is in ISOamsn --> <!ENTITY nsub", "8836"},
|
316
|
+
// not a subset of, U+2284 ISOamsn -->
|
317
|
+
{"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
|
318
|
+
{"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
|
319
|
+
{"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
|
320
|
+
{"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
|
321
|
+
{"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
|
322
|
+
{"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
|
323
|
+
// <!-- dot operator is NOT the same character as U+00B7 middle dot -->
|
324
|
+
// <!-- Miscellaneous Technical -->
|
325
|
+
{"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
|
326
|
+
{"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
|
327
|
+
{"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
|
328
|
+
{"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
|
329
|
+
{"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
|
330
|
+
// <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
|
331
|
+
// mark' -->
|
332
|
+
{"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
|
333
|
+
// <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
|
334
|
+
// 'single right-pointing angle quotation mark' -->
|
335
|
+
// <!-- Geometric Shapes -->
|
336
|
+
{"loz", "9674"}, // lozenge, U+25CA ISOpub -->
|
337
|
+
// <!-- Miscellaneous Symbols -->
|
338
|
+
{"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
|
339
|
+
// <!-- black here seems to mean filled as opposed to hollow -->
|
340
|
+
{"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
|
341
|
+
{"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
|
342
|
+
{"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
|
343
|
+
|
344
|
+
// <!-- Latin Extended-A -->
|
345
|
+
{"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
|
346
|
+
{"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
|
347
|
+
// <!-- ligature is a misnomer, this is a separate character in some languages -->
|
348
|
+
{"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
|
349
|
+
{"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
|
350
|
+
{"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
|
351
|
+
// <!-- Spacing Modifier Letters -->
|
352
|
+
{"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
|
353
|
+
{"tilde", "732"}, // small tilde, U+02DC ISOdia -->
|
354
|
+
// <!-- General Punctuation -->
|
355
|
+
{"ensp", "8194"}, // en space, U+2002 ISOpub -->
|
356
|
+
{"emsp", "8195"}, // em space, U+2003 ISOpub -->
|
357
|
+
{"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
|
358
|
+
{"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
|
359
|
+
{"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
|
360
|
+
{"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
|
361
|
+
{"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
|
362
|
+
{"ndash", "8211"}, // en dash, U+2013 ISOpub -->
|
363
|
+
{"mdash", "8212"}, // em dash, U+2014 ISOpub -->
|
364
|
+
{"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
|
365
|
+
{"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
|
366
|
+
{"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
|
367
|
+
{"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
|
368
|
+
{"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
|
369
|
+
{"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
|
370
|
+
{"dagger", "8224"}, // dagger, U+2020 ISOpub -->
|
371
|
+
{"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
|
372
|
+
{"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
|
373
|
+
{"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
|
374
|
+
// <!-- lsaquo is proposed but not yet ISO standardized -->
|
375
|
+
{"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
|
376
|
+
// <!-- rsaquo is proposed but not yet ISO standardized -->
|
377
|
+
{"euro", "8364"}, // -- euro sign, U+20AC NEW -->
|
378
|
+
};
|
379
|
+
|
380
|
+
/**
 * The set of entities supported by standard XML: the entries of
 * <code>BASIC_ARRAY</code> followed by those of <code>APOS_ARRAY</code>.
 */
public static final Entities XML;

/**
 * The set of entities supported by HTML 3.2: the entries of
 * <code>BASIC_ARRAY</code> followed by those of <code>ISO8859_1_ARRAY</code>.
 */
public static final Entities HTML32;

/**
 * The set of entities supported by HTML 4.0, as populated by
 * {@link #fillWithHtml40Entities(Entities)}.
 */
public static final Entities HTML40;

// Build the three predefined tables once, in declaration order, when the class is initialized.
static {
    Entities xml = new Entities();
    xml.addEntities(BASIC_ARRAY);
    xml.addEntities(APOS_ARRAY);
    XML = xml;

    Entities html32 = new Entities();
    html32.addEntities(BASIC_ARRAY);
    html32.addEntities(ISO8859_1_ARRAY);
    HTML32 = html32;

    Entities html40 = new Entities();
    fillWithHtml40Entities(html40);
    HTML40 = html40;
}
|
417
|
+
|
418
|
+
/**
|
419
|
+
* <p>
|
420
|
+
* Fills the specified entities instance with HTML 40 entities.
|
421
|
+
* </p>
|
422
|
+
*
|
423
|
+
* @param entities
|
424
|
+
* the instance to be filled.
|
425
|
+
*/
|
426
|
+
static void fillWithHtml40Entities(Entities entities) {
|
427
|
+
entities.addEntities(BASIC_ARRAY);
|
428
|
+
entities.addEntities(ISO8859_1_ARRAY);
|
429
|
+
entities.addEntities(HTML40_ARRAY);
|
430
|
+
}
|
431
|
+
|
432
|
+
/**
 * A two-way association between entity names (for example <code>"amp"</code>) and their
 * integer character values. The implementations in this file differ only in the data
 * structure used for the two directions of lookup.
 */
interface EntityMap {
    /**
     * Registers an entity under the given name and value.
     *
     * @param name
     *            the entity name
     * @param value
     *            the entity value
     */
    void add(String name, int value);

    /**
     * Looks up the entity name registered for a value.
     *
     * @param value
     *            the value to locate
     * @return the entity name associated with the value; the array-backed implementations
     *         in this file return <code>null</code> when there is none
     */
    String name(int value);

    /**
     * Looks up the value registered for an entity name.
     *
     * @param name
     *            the name to locate
     * @return the entity value associated with the name; the implementations in this file
     *         return <code>-1</code> when the name is unknown
     */
    int value(String name);
}
|
467
|
+
|
468
|
+
static class PrimitiveEntityMap implements EntityMap {
|
469
|
+
private Map mapNameToValue = new HashMap();
|
470
|
+
|
471
|
+
private IntHashMap mapValueToName = new IntHashMap();
|
472
|
+
|
473
|
+
/**
|
474
|
+
* {@inheritDoc}
|
475
|
+
*/
|
476
|
+
public void add(String name, int value) {
|
477
|
+
mapNameToValue.put(name, new Integer(value));
|
478
|
+
mapValueToName.put(value, name);
|
479
|
+
}
|
480
|
+
|
481
|
+
/**
|
482
|
+
* {@inheritDoc}
|
483
|
+
*/
|
484
|
+
public String name(int value) {
|
485
|
+
return (String) mapValueToName.get(value);
|
486
|
+
}
|
487
|
+
|
488
|
+
/**
|
489
|
+
* {@inheritDoc}
|
490
|
+
*/
|
491
|
+
public int value(String name) {
|
492
|
+
Object value = mapNameToValue.get(name);
|
493
|
+
if (value == null) {
|
494
|
+
return -1;
|
495
|
+
}
|
496
|
+
return ((Integer) value).intValue();
|
497
|
+
}
|
498
|
+
}
|
499
|
+
|
500
|
+
static abstract class MapIntMap implements Entities.EntityMap {
|
501
|
+
protected Map mapNameToValue;
|
502
|
+
|
503
|
+
protected Map mapValueToName;
|
504
|
+
|
505
|
+
/**
|
506
|
+
* {@inheritDoc}
|
507
|
+
*/
|
508
|
+
public void add(String name, int value) {
|
509
|
+
mapNameToValue.put(name, new Integer(value));
|
510
|
+
mapValueToName.put(new Integer(value), name);
|
511
|
+
}
|
512
|
+
|
513
|
+
/**
|
514
|
+
* {@inheritDoc}
|
515
|
+
*/
|
516
|
+
public String name(int value) {
|
517
|
+
return (String) mapValueToName.get(new Integer(value));
|
518
|
+
}
|
519
|
+
|
520
|
+
/**
|
521
|
+
* {@inheritDoc}
|
522
|
+
*/
|
523
|
+
public int value(String name) {
|
524
|
+
Object value = mapNameToValue.get(name);
|
525
|
+
if (value == null) {
|
526
|
+
return -1;
|
527
|
+
}
|
528
|
+
return ((Integer) value).intValue();
|
529
|
+
}
|
530
|
+
}
|
531
|
+
|
532
|
+
static class HashEntityMap extends MapIntMap {
|
533
|
+
/**
|
534
|
+
* Constructs a new instance of <code>HashEntityMap</code>.
|
535
|
+
*/
|
536
|
+
public HashEntityMap() {
|
537
|
+
mapNameToValue = new HashMap();
|
538
|
+
mapValueToName = new HashMap();
|
539
|
+
}
|
540
|
+
}
|
541
|
+
|
542
|
+
static class TreeEntityMap extends MapIntMap {
|
543
|
+
/**
|
544
|
+
* Constructs a new instance of <code>TreeEntityMap</code>.
|
545
|
+
*/
|
546
|
+
public TreeEntityMap() {
|
547
|
+
mapNameToValue = new TreeMap();
|
548
|
+
mapValueToName = new TreeMap();
|
549
|
+
}
|
550
|
+
}
|
551
|
+
|
552
|
+
static class LookupEntityMap extends PrimitiveEntityMap {
|
553
|
+
private String[] lookupTable;
|
554
|
+
|
555
|
+
private int LOOKUP_TABLE_SIZE = 256;
|
556
|
+
|
557
|
+
/**
|
558
|
+
* {@inheritDoc}
|
559
|
+
*/
|
560
|
+
public String name(int value) {
|
561
|
+
if (value < LOOKUP_TABLE_SIZE) {
|
562
|
+
return lookupTable()[value];
|
563
|
+
}
|
564
|
+
return super.name(value);
|
565
|
+
}
|
566
|
+
|
567
|
+
/**
|
568
|
+
* <p>
|
569
|
+
* Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
|
570
|
+
* </p>
|
571
|
+
*
|
572
|
+
* @return the lookup table
|
573
|
+
*/
|
574
|
+
private String[] lookupTable() {
|
575
|
+
if (lookupTable == null) {
|
576
|
+
createLookupTable();
|
577
|
+
}
|
578
|
+
return lookupTable;
|
579
|
+
}
|
580
|
+
|
581
|
+
/**
|
582
|
+
* <p>
|
583
|
+
* Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
|
584
|
+
* </p>
|
585
|
+
*/
|
586
|
+
private void createLookupTable() {
|
587
|
+
lookupTable = new String[LOOKUP_TABLE_SIZE];
|
588
|
+
for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
|
589
|
+
lookupTable[i] = super.name(i);
|
590
|
+
}
|
591
|
+
}
|
592
|
+
}
|
593
|
+
|
594
|
+
static class ArrayEntityMap implements EntityMap {
|
595
|
+
protected int growBy = 100;
|
596
|
+
|
597
|
+
protected int size = 0;
|
598
|
+
|
599
|
+
protected String[] names;
|
600
|
+
|
601
|
+
protected int[] values;
|
602
|
+
|
603
|
+
/**
|
604
|
+
* Constructs a new instance of <code>ArrayEntityMap</code>.
|
605
|
+
*/
|
606
|
+
public ArrayEntityMap() {
|
607
|
+
names = new String[growBy];
|
608
|
+
values = new int[growBy];
|
609
|
+
}
|
610
|
+
|
611
|
+
/**
|
612
|
+
* Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
|
613
|
+
* grow.
|
614
|
+
*
|
615
|
+
* @param growBy
|
616
|
+
* array will be initialized to and will grow by this amount
|
617
|
+
*/
|
618
|
+
public ArrayEntityMap(int growBy) {
|
619
|
+
this.growBy = growBy;
|
620
|
+
names = new String[growBy];
|
621
|
+
values = new int[growBy];
|
622
|
+
}
|
623
|
+
|
624
|
+
/**
|
625
|
+
* {@inheritDoc}
|
626
|
+
*/
|
627
|
+
public void add(String name, int value) {
|
628
|
+
ensureCapacity(size + 1);
|
629
|
+
names[size] = name;
|
630
|
+
values[size] = value;
|
631
|
+
size++;
|
632
|
+
}
|
633
|
+
|
634
|
+
/**
|
635
|
+
* Verifies the capacity of the entity array, adjusting the size if necessary.
|
636
|
+
*
|
637
|
+
* @param capacity
|
638
|
+
* size the array should be
|
639
|
+
*/
|
640
|
+
protected void ensureCapacity(int capacity) {
|
641
|
+
if (capacity > names.length) {
|
642
|
+
int newSize = Math.max(capacity, size + growBy);
|
643
|
+
String[] newNames = new String[newSize];
|
644
|
+
System.arraycopy(names, 0, newNames, 0, size);
|
645
|
+
names = newNames;
|
646
|
+
int[] newValues = new int[newSize];
|
647
|
+
System.arraycopy(values, 0, newValues, 0, size);
|
648
|
+
values = newValues;
|
649
|
+
}
|
650
|
+
}
|
651
|
+
|
652
|
+
/**
|
653
|
+
* {@inheritDoc}
|
654
|
+
*/
|
655
|
+
public String name(int value) {
|
656
|
+
for (int i = 0; i < size; ++i) {
|
657
|
+
if (values[i] == value) {
|
658
|
+
return names[i];
|
659
|
+
}
|
660
|
+
}
|
661
|
+
return null;
|
662
|
+
}
|
663
|
+
|
664
|
+
/**
|
665
|
+
* {@inheritDoc}
|
666
|
+
*/
|
667
|
+
public int value(String name) {
|
668
|
+
for (int i = 0; i < size; ++i) {
|
669
|
+
if (names[i].equals(name)) {
|
670
|
+
return values[i];
|
671
|
+
}
|
672
|
+
}
|
673
|
+
return -1;
|
674
|
+
}
|
675
|
+
}
|
676
|
+
|
677
|
+
static class BinaryEntityMap extends ArrayEntityMap {
|
678
|
+
|
679
|
+
/**
|
680
|
+
* Constructs a new instance of <code>BinaryEntityMap</code>.
|
681
|
+
*/
|
682
|
+
public BinaryEntityMap() {
|
683
|
+
super();
|
684
|
+
}
|
685
|
+
|
686
|
+
/**
|
687
|
+
* Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
|
688
|
+
* should grow.
|
689
|
+
*
|
690
|
+
* @param growBy
|
691
|
+
* array will be initialized to and will grow by this amount
|
692
|
+
*/
|
693
|
+
public BinaryEntityMap(int growBy) {
|
694
|
+
super(growBy);
|
695
|
+
}
|
696
|
+
|
697
|
+
/**
|
698
|
+
* Performs a binary search of the entity array for the specified key. This method is based on code in
|
699
|
+
* {@link java.util.Arrays}.
|
700
|
+
*
|
701
|
+
* @param key
|
702
|
+
* the key to be found
|
703
|
+
* @return the index of the entity array matching the specified key
|
704
|
+
*/
|
705
|
+
private int binarySearch(int key) {
|
706
|
+
int low = 0;
|
707
|
+
int high = size - 1;
|
708
|
+
|
709
|
+
while (low <= high) {
|
710
|
+
int mid = (low + high) >> 1;
|
711
|
+
int midVal = values[mid];
|
712
|
+
|
713
|
+
if (midVal < key) {
|
714
|
+
low = mid + 1;
|
715
|
+
} else if (midVal > key) {
|
716
|
+
high = mid - 1;
|
717
|
+
} else {
|
718
|
+
return mid; // key found
|
719
|
+
}
|
720
|
+
}
|
721
|
+
return -(low + 1); // key not found.
|
722
|
+
}
|
723
|
+
|
724
|
+
/**
|
725
|
+
* {@inheritDoc}
|
726
|
+
*/
|
727
|
+
public void add(String name, int value) {
|
728
|
+
ensureCapacity(size + 1);
|
729
|
+
int insertAt = binarySearch(value);
|
730
|
+
if (insertAt > 0) {
|
731
|
+
return; // note: this means you can't insert the same value twice
|
732
|
+
}
|
733
|
+
insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
|
734
|
+
System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
|
735
|
+
values[insertAt] = value;
|
736
|
+
System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
|
737
|
+
names[insertAt] = name;
|
738
|
+
size++;
|
739
|
+
}
|
740
|
+
|
741
|
+
/**
|
742
|
+
* {@inheritDoc}
|
743
|
+
*/
|
744
|
+
public String name(int value) {
|
745
|
+
int index = binarySearch(value);
|
746
|
+
if (index < 0) {
|
747
|
+
return null;
|
748
|
+
}
|
749
|
+
return names[index];
|
750
|
+
}
|
751
|
+
}
|
752
|
+
|
753
|
+
// package scoped for testing
|
754
|
+
EntityMap map = new Entities.LookupEntityMap();
|
755
|
+
|
756
|
+
/**
|
757
|
+
* <p>
|
758
|
+
* Adds entities to this entity.
|
759
|
+
* </p>
|
760
|
+
*
|
761
|
+
* @param entityArray
|
762
|
+
* array of entities to be added
|
763
|
+
*/
|
764
|
+
public void addEntities(String[][] entityArray) {
|
765
|
+
for (int i = 0; i < entityArray.length; ++i) {
|
766
|
+
addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
|
767
|
+
}
|
768
|
+
}
|
769
|
+
|
770
|
+
/**
|
771
|
+
* <p>
|
772
|
+
* Add an entity to this entity.
|
773
|
+
* </p>
|
774
|
+
*
|
775
|
+
* @param name
|
776
|
+
* name of the entity
|
777
|
+
* @param value
|
778
|
+
* vale of the entity
|
779
|
+
*/
|
780
|
+
public void addEntity(String name, int value) {
|
781
|
+
map.add(name, value);
|
782
|
+
}
|
783
|
+
|
784
|
+
/**
|
785
|
+
* <p>
|
786
|
+
* Returns the name of the entity identified by the specified value.
|
787
|
+
* </p>
|
788
|
+
*
|
789
|
+
* @param value
|
790
|
+
* the value to locate
|
791
|
+
* @return entity name associated with the specified value
|
792
|
+
*/
|
793
|
+
public String entityName(int value) {
|
794
|
+
return map.name(value);
|
795
|
+
}
|
796
|
+
|
797
|
+
/**
|
798
|
+
* <p>
|
799
|
+
* Returns the value of the entity identified by the specified name.
|
800
|
+
* </p>
|
801
|
+
*
|
802
|
+
* @param name
|
803
|
+
* the name to locate
|
804
|
+
* @return entity value associated with the specified name
|
805
|
+
*/
|
806
|
+
public int entityValue(String name) {
|
807
|
+
return map.value(name);
|
808
|
+
}
|
809
|
+
|
810
|
+
/**
|
811
|
+
* <p>
|
812
|
+
* Escapes the characters in a <code>String</code>.
|
813
|
+
* </p>
|
814
|
+
*
|
815
|
+
* <p>
|
816
|
+
* For example, if you have called addEntity("foo", 0xA1), escape("\u00A1") will return
|
817
|
+
* "&foo;"
|
818
|
+
* </p>
|
819
|
+
*
|
820
|
+
* @param str
|
821
|
+
* The <code>String</code> to escape.
|
822
|
+
* @return A new escaped <code>String</code>.
|
823
|
+
*/
|
824
|
+
public String escape(String str) {
|
825
|
+
StringWriter stringWriter = createStringWriter(str);
|
826
|
+
try {
|
827
|
+
this.escape(stringWriter, str);
|
828
|
+
} catch (IOException e) {
|
829
|
+
// This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
|
830
|
+
// throw IOExceptions.
|
831
|
+
throw new RuntimeException(e);
|
832
|
+
}
|
833
|
+
return stringWriter.toString();
|
834
|
+
}
|
835
|
+
|
836
|
+
/**
|
837
|
+
* <p>
|
838
|
+
* Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
|
839
|
+
* passed.
|
840
|
+
* </p>
|
841
|
+
*
|
842
|
+
* @param writer
|
843
|
+
* The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
|
844
|
+
* @param str
|
845
|
+
* The <code>String</code> to escape. Assumed to be a non-null value.
|
846
|
+
* @throws IOException
|
847
|
+
* when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
|
848
|
+
* methods.
|
849
|
+
*
|
850
|
+
* @see #escape(String)
|
851
|
+
* @see Writer
|
852
|
+
*/
|
853
|
+
public void escape(Writer writer, String str) throws IOException {
|
854
|
+
int len = str.length();
|
855
|
+
for (int i = 0; i < len; i++) {
|
856
|
+
char c = str.charAt(i);
|
857
|
+
String entityName = this.entityName(c);
|
858
|
+
if (entityName == null) {
|
859
|
+
if (c > 0x7F) {
|
860
|
+
writer.write("&#");
|
861
|
+
writer.write(Integer.toString(c, 10));
|
862
|
+
writer.write(';');
|
863
|
+
} else {
|
864
|
+
writer.write(c);
|
865
|
+
}
|
866
|
+
} else {
|
867
|
+
writer.write('&');
|
868
|
+
writer.write(entityName);
|
869
|
+
writer.write(';');
|
870
|
+
}
|
871
|
+
}
|
872
|
+
}
|
873
|
+
|
874
|
+
/**
|
875
|
+
* <p>
|
876
|
+
* Unescapes the entities in a <code>String</code>.
|
877
|
+
* </p>
|
878
|
+
*
|
879
|
+
* <p>
|
880
|
+
* For example, if you have called addEntity("foo", 0xA1), unescape("&foo;") will return
|
881
|
+
* "\u00A1"
|
882
|
+
* </p>
|
883
|
+
*
|
884
|
+
* @param str
|
885
|
+
* The <code>String</code> to escape.
|
886
|
+
* @return A new escaped <code>String</code>.
|
887
|
+
*/
|
888
|
+
public String unescape(String str) {
|
889
|
+
int firstAmp = str.indexOf('&');
|
890
|
+
if (firstAmp < 0) {
|
891
|
+
return str;
|
892
|
+
} else {
|
893
|
+
StringWriter stringWriter = createStringWriter(str);
|
894
|
+
try {
|
895
|
+
this.doUnescape(stringWriter, str, firstAmp);
|
896
|
+
} catch (IOException e) {
|
897
|
+
// This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
|
898
|
+
// do not throw IOExceptions.
|
899
|
+
throw new RuntimeException(e);
|
900
|
+
}
|
901
|
+
return stringWriter.toString();
|
902
|
+
}
|
903
|
+
}
|
904
|
+
|
905
|
+
/**
|
906
|
+
* Make the StringWriter 10% larger than the source String to avoid growing the writer
|
907
|
+
*
|
908
|
+
* @param str The source string
|
909
|
+
* @return A newly created StringWriter
|
910
|
+
*/
|
911
|
+
private StringWriter createStringWriter(String str) {
|
912
|
+
return new StringWriter((int) (str.length() + (str.length() * 0.1)));
|
913
|
+
}
|
914
|
+
|
915
|
+
/**
|
916
|
+
* <p>
|
917
|
+
* Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
|
918
|
+
* <code>Writer</code> passed.
|
919
|
+
* </p>
|
920
|
+
*
|
921
|
+
* @param writer
|
922
|
+
* The <code>Writer</code> to write the results to; assumed to be non-null.
|
923
|
+
* @param str
|
924
|
+
* The source <code>String</code> to unescape; assumed to be non-null.
|
925
|
+
* @throws IOException
|
926
|
+
* when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
|
927
|
+
* methods.
|
928
|
+
*
|
929
|
+
* @see #escape(String)
|
930
|
+
* @see Writer
|
931
|
+
*/
|
932
|
+
public void unescape(Writer writer, String str) throws IOException {
|
933
|
+
int firstAmp = str.indexOf('&');
|
934
|
+
if (firstAmp < 0) {
|
935
|
+
writer.write(str);
|
936
|
+
return;
|
937
|
+
} else {
|
938
|
+
doUnescape(writer, str, firstAmp);
|
939
|
+
}
|
940
|
+
}
|
941
|
+
|
942
|
+
/**
|
943
|
+
* Underlying unescape method that allows the optimisation of not starting from the 0 index again.
|
944
|
+
*
|
945
|
+
* @param writer
|
946
|
+
* The <code>Writer</code> to write the results to; assumed to be non-null.
|
947
|
+
* @param str
|
948
|
+
* The source <code>String</code> to unescape; assumed to be non-null.
|
949
|
+
* @param firstAmp
|
950
|
+
* The <code>int</code> index of the first ampersand in the source String.
|
951
|
+
* @throws IOException
|
952
|
+
* when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
|
953
|
+
* methods.
|
954
|
+
*/
|
955
|
+
private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
|
956
|
+
writer.write(str, 0, firstAmp);
|
957
|
+
int len = str.length();
|
958
|
+
for (int i = firstAmp; i < len; i++) {
|
959
|
+
char c = str.charAt(i);
|
960
|
+
if (c == '&') {
|
961
|
+
int nextIdx = i + 1;
|
962
|
+
int semiColonIdx = str.indexOf(';', nextIdx);
|
963
|
+
if (semiColonIdx == -1) {
|
964
|
+
writer.write(c);
|
965
|
+
continue;
|
966
|
+
}
|
967
|
+
int amphersandIdx = str.indexOf('&', i + 1);
|
968
|
+
if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
|
969
|
+
// Then the text looks like &...&...;
|
970
|
+
writer.write(c);
|
971
|
+
continue;
|
972
|
+
}
|
973
|
+
String entityContent = str.substring(nextIdx, semiColonIdx);
|
974
|
+
int entityValue = -1;
|
975
|
+
int entityContentLen = entityContent.length();
|
976
|
+
if (entityContentLen > 0) {
|
977
|
+
if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
|
978
|
+
// hexidecimal)
|
979
|
+
if (entityContentLen > 1) {
|
980
|
+
char isHexChar = entityContent.charAt(1);
|
981
|
+
try {
|
982
|
+
switch (isHexChar) {
|
983
|
+
case 'X' :
|
984
|
+
case 'x' : {
|
985
|
+
entityValue = Integer.parseInt(entityContent.substring(2), 16);
|
986
|
+
break;
|
987
|
+
}
|
988
|
+
default : {
|
989
|
+
entityValue = Integer.parseInt(entityContent.substring(1), 10);
|
990
|
+
}
|
991
|
+
}
|
992
|
+
if (entityValue > 0xFFFF) {
|
993
|
+
entityValue = -1;
|
994
|
+
}
|
995
|
+
} catch (NumberFormatException e) {
|
996
|
+
entityValue = -1;
|
997
|
+
}
|
998
|
+
}
|
999
|
+
} else { // escaped value content is an entity name
|
1000
|
+
entityValue = this.entityValue(entityContent);
|
1001
|
+
}
|
1002
|
+
}
|
1003
|
+
|
1004
|
+
if (entityValue == -1) {
|
1005
|
+
writer.write('&');
|
1006
|
+
writer.write(entityContent);
|
1007
|
+
writer.write(';');
|
1008
|
+
} else {
|
1009
|
+
writer.write(entityValue);
|
1010
|
+
}
|
1011
|
+
i = semiColonIdx; // move index up to the semi-colon
|
1012
|
+
} else {
|
1013
|
+
writer.write(c);
|
1014
|
+
}
|
1015
|
+
}
|
1016
|
+
}
|
1017
|
+
|
1018
|
+
}
|