hpricot 0.6-mswin32 → 0.6.164-mswin32

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -6,19 +6,32 @@ require 'rake/testtask'
6
6
  require 'fileutils'
7
7
  include FileUtils
8
8
 
9
+ RbConfig = Config unless defined?(RbConfig)
10
+
9
11
  NAME = "hpricot"
10
- REV = `svn info`[/Revision: (\d+)/, 1] rescue nil
12
+ REV = (`#{ENV['GIT'] || "git"} rev-list HEAD`.split.length + 1).to_s
11
13
  VERS = ENV['VERSION'] || "0.6" + (REV ? ".#{REV}" : "")
12
14
  PKG = "#{NAME}-#{VERS}"
13
- BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
14
- ARCHLIB = "lib/#{::Config::CONFIG['arch']}"
15
- CLEAN.include ["ext/hpricot_scan/#{BIN}", "lib/**/#{BIN}", 'ext/hpricot_scan/Makefile',
16
- '**/.*.sw?', '*.gem', '.config']
15
+ BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp,class}"
16
+ CLEAN.include ["ext/hpricot_scan/#{BIN}", "ext/fast_xs/#{BIN}", "lib/**/#{BIN}",
17
+ 'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
18
+ '**/.*.sw?', '*.gem', '.config', 'pkg']
17
19
  RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README', '--inline-source']
18
20
  PKG_FILES = %w(CHANGELOG COPYING README Rakefile) +
19
21
  Dir.glob("{bin,doc,test,lib,extras}/**/*") +
20
22
  Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
21
- %w[ext/hpricot_scan/hpricot_scan.c] # needed because it's generated later
23
+ %w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/HpricotScanService.java] # needed because they are generated later
24
+ RAGEL_C_CODE_GENERATION_STYLES = {
25
+ "table_driven" => 'T0',
26
+ "faster_table_driven" => 'T1',
27
+ "flat_table_driven" => 'F0',
28
+ "faster_flat_table_driven" => 'F1',
29
+ "goto_driven" => 'G0',
30
+ "faster_goto_driven" => 'G1',
31
+ "really_fast goto_driven" => 'G2'
32
+ # "n_way_split_really_fast_goto_driven" => 'P<N>'
33
+ }
34
+ DEFAULT_RAGEL_C_CODE_GENERATION = "really_fast goto_driven"
22
35
  SPEC =
23
36
  Gem::Specification.new do |s|
24
37
  s.name = NAME
@@ -32,12 +45,20 @@ SPEC =
32
45
  s.author = "why the lucky stiff"
33
46
  s.email = 'why@ruby-lang.org'
34
47
  s.homepage = 'http://code.whytheluckystiff.net/hpricot/'
48
+ s.rubyforge_project = 'hobix'
35
49
  s.files = PKG_FILES
36
- s.require_paths = [ARCHLIB, "lib"]
50
+ s.require_paths = ["lib"]
37
51
  s.extensions = FileList["ext/**/extconf.rb"].to_a
38
52
  s.bindir = "bin"
39
53
  end
40
54
 
55
+ Win32Spec = SPEC.dup
56
+ Win32Spec.platform = 'mswin32'
57
+ Win32Spec.files = PKG_FILES + ["lib/hpricot_scan.so", "lib/fast_xs.so"]
58
+ Win32Spec.extensions = []
59
+
60
+ WIN32_PKG_DIR = "#{PKG}-mswin32"
61
+
41
62
  desc "Does a full compile, test run"
42
63
  task :default => [:compile, :test]
43
64
 
@@ -49,7 +70,7 @@ task :release => [:package, :package_win32, :package_jruby]
49
70
 
50
71
  desc "Run all the tests"
51
72
  Rake::TestTask.new do |t|
52
- t.libs << "test" << ARCHLIB
73
+ t.libs << "test"
53
74
  t.test_files = FileList['test/test_*.rb']
54
75
  t.verbose = true
55
76
  end
@@ -66,25 +87,47 @@ Rake::GemPackageTask.new(SPEC) do |p|
66
87
  p.gem_spec = SPEC
67
88
  end
68
89
 
69
- extension = "hpricot_scan"
70
- ext = "ext/hpricot_scan"
71
- ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
72
- ext_files = FileList[
73
- "#{ext}/*.c",
74
- "#{ext}/*.h",
75
- "#{ext}/*.rl",
76
- "#{ext}/extconf.rb",
77
- "#{ext}/Makefile",
78
- "lib"
79
- ]
90
+ ['hpricot_scan', 'fast_xs'].each do |extension|
91
+ ext = "ext/#{extension}"
92
+ ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
93
+ ext_files = FileList[
94
+ "#{ext}/*.c",
95
+ "#{ext}/*.h",
96
+ "#{ext}/*.rl",
97
+ "#{ext}/extconf.rb",
98
+ "#{ext}/Makefile",
99
+ "lib"
100
+ ]
101
+
102
+ desc "Builds just the #{extension} extension"
103
+ task extension.to_sym => ["#{ext}/Makefile", ext_so ]
104
+
105
+ file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
106
+ Dir.chdir(ext) do ruby "extconf.rb" end
107
+ end
108
+
109
+ file ext_so => ext_files do
110
+ Dir.chdir(ext) do
111
+ sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make')
112
+ end
113
+ cp ext_so, "lib"
114
+ end
115
+
116
+ desc "Cross-compile the #{extension} extension for win32"
117
+ file "#{extension}_win32" => [WIN32_PKG_DIR] do
118
+ cp "extras/mingw-rbconfig.rb", "#{WIN32_PKG_DIR}/ext/#{extension}/rbconfig.rb"
119
+ sh "cd #{WIN32_PKG_DIR}/ext/#{extension}/ && ruby -I. extconf.rb && make"
120
+ mv "#{WIN32_PKG_DIR}/ext/#{extension}/#{extension}.so", "#{WIN32_PKG_DIR}/lib"
121
+ end
122
+ end
80
123
 
81
124
  task "lib" do
82
125
  directory "lib"
83
126
  end
84
127
 
85
128
  desc "Compiles the Ruby extension"
86
- task :compile => [:hpricot_scan] do
87
- if Dir.glob(File.join(ARCHLIB,"hpricot_scan.*")).length == 0
129
+ task :compile => [:hpricot_scan, :fast_xs] do
130
+ if Dir.glob(File.join("lib","hpricot_scan.*")).length == 0
88
131
  STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
89
132
  STDERR.puts "Gem actually failed to build. Your system is"
90
133
  STDERR.puts "NOT configured properly to build hpricot."
@@ -94,60 +137,47 @@ task :compile => [:hpricot_scan] do
94
137
  end
95
138
  task :hpricot_scan => [:ragel]
96
139
 
97
- desc "Builds just the #{extension} extension"
98
- task extension.to_sym => ["#{ext}/Makefile", ext_so ]
99
-
100
- file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
101
- Dir.chdir(ext) do ruby "extconf.rb" end
102
- end
103
-
104
- file ext_so => ext_files do
105
- Dir.chdir(ext) do
106
- sh(PLATFORM =~ /win32/ ? 'nmake' : 'make')
107
- end
108
- mkdir_p ARCHLIB
109
- cp ext_so, ARCHLIB
110
- end
111
-
112
- desc "returns the ragel version"
140
+ desc "Determines the Ragel version and displays it on the console along with the location of the Ragel binary."
113
141
  task :ragel_version do
114
142
  @ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
143
+ puts "Using ragel version: #{@ragel_v}, location: #{`which ragel`}"
144
+ @ragel_v
115
145
  end
116
146
 
117
147
  desc "Generates the C scanner code with Ragel."
118
148
  task :ragel => [:ragel_version] do
119
- sh %{ragel ext/hpricot_scan/hpricot_scan.rl | #{@ragel_v >= 5.18 ? 'rlgen-cd' : 'rlcodegen'} -G2 -o ext/hpricot_scan/hpricot_scan.c}
149
+ if @ragel_v >= 6.1
150
+ @ragel_c_code_generation_style = RAGEL_C_CODE_GENERATION_STYLES[DEFAULT_RAGEL_C_CODE_GENERATION]
151
+ sh %{cd ext/hpricot_scan; ragel hpricot_scan.rl -#{@ragel_c_code_generation_style} -o hpricot_scan.c}
152
+ else
153
+ STDERR.puts "Ragel 6.1 or greater is required."
154
+ exit(1)
155
+ end
120
156
  end
121
157
 
122
- desc "Generates the Java scanner code with Ragel."
158
+ # Java only supports the table-driven code
159
+ # generation style at this point.
160
+ desc "Generates the Java scanner code using the Ragel table-driven code generation style."
123
161
  task :ragel_java => [:ragel_version] do
124
- sh %{ragel -J ext/hpricot_scan/hpricot_scan.java.rl | #{@ragel_v >= 5.18 ? 'rlgen-java' : 'rlcodegen'} -o ext/hpricot_scan/HpricotScanService.java}
162
+ if @ragel_v >= 6.1
163
+ puts "compiling with ragel version #{@ragel_v}"
164
+ sh %{ragel -J -o ext/hpricot_scan/HpricotScanService.java ext/hpricot_scan/hpricot_scan.java.rl}
165
+ else
166
+ STDERR.puts "Ragel 6.1 or greater is required."
167
+ exit(1)
168
+ end
125
169
  end
126
170
 
127
171
  ### Win32 Packages ###
128
172
 
129
- Win32Spec = SPEC.dup
130
- Win32Spec.platform = Gem::Platform::WIN32
131
- Win32Spec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.so"]
132
- Win32Spec.extensions = []
133
-
134
- WIN32_PKG_DIR = "#{PKG}-mswin32"
135
-
136
173
  desc "Package up the Win32 distribution."
137
174
  file WIN32_PKG_DIR => [:package] do
138
175
  sh "tar zxf pkg/#{PKG}.tgz"
139
176
  mv PKG, WIN32_PKG_DIR
140
177
  end
141
178
 
142
- desc "Cross-compile the hpricot_scan extension for win32"
143
- file "hpricot_scan_win32" => [WIN32_PKG_DIR] do
144
- cp "extras/mingw-rbconfig.rb", "#{WIN32_PKG_DIR}/ext/hpricot_scan/rbconfig.rb"
145
- sh "cd #{WIN32_PKG_DIR}/ext/hpricot_scan/ && ruby -I. extconf.rb && make"
146
- mv "#{WIN32_PKG_DIR}/ext/hpricot_scan/hpricot_scan.so", "#{WIN32_PKG_DIR}/#{ARCHLIB}"
147
- end
148
-
149
179
  desc "Build the binary RubyGems package for win32"
150
- task :package_win32 => ["hpricot_scan_win32"] do
180
+ task :package_win32 => ["fast_xs_win32", "hpricot_scan_win32"] do
151
181
  Dir.chdir("#{WIN32_PKG_DIR}") do
152
182
  Gem::Builder.new(Win32Spec).build
153
183
  verbose(true) {
@@ -160,19 +190,43 @@ CLEAN.include WIN32_PKG_DIR
160
190
 
161
191
  ### JRuby Packages ###
162
192
 
163
- compile_java = proc do
164
- sh %{javac -source 1.4 -target 1.4 -classpath $JRUBY_HOME/lib/jruby.jar HpricotScanService.java}
165
- sh %{jar cf hpricot_scan.jar HpricotScanService.class}
193
+ def java_classpath_arg
194
+ # A myriad of ways to discover the JRuby classpath
195
+ classpath = begin
196
+ require 'java'
197
+ # Already running in a JRuby JVM
198
+ Java::java.lang.System.getProperty('java.class.path')
199
+ rescue LoadError
200
+ ENV['JRUBY_PARENT_CLASSPATH'] || ENV['JRUBY_HOME'] && FileList["#{ENV['JRUBY_HOME']}/lib/*.jar"].join(File::PATH_SEPARATOR)
201
+ end
202
+ classpath ? "-cp #{classpath}" : ""
203
+ end
204
+
205
+ def compile_java(filename, jarname)
206
+ sh %{javac -source 1.4 -target 1.4 #{java_classpath_arg} #{filename}}
207
+ sh %{jar cf #{jarname} *.class}
166
208
  end
167
209
 
168
- desc "Compiles the JRuby extension"
169
210
  task :hpricot_scan_java => [:ragel_java] do
170
- Dir.chdir("ext/hpricot_scan", &compile_java)
211
+ Dir.chdir "ext/hpricot_scan" do
212
+ compile_java("HpricotScanService.java", "hpricot_scan.jar")
213
+ end
214
+ end
215
+
216
+ task :fast_xs_java do
217
+ Dir.chdir "ext/fast_xs" do
218
+ compile_java("FastXsService.java", "fast_xs.jar")
219
+ end
220
+ end
221
+
222
+ desc "Compiles the JRuby extensions"
223
+ task :hpricot_java => [:hpricot_scan_java, :fast_xs_java] do
224
+ %w(hpricot_scan fast_xs).each {|ext| mv "ext/#{ext}/#{ext}.jar", "lib"}
171
225
  end
172
226
 
173
227
  JRubySpec = SPEC.dup
174
228
  JRubySpec.platform = 'jruby'
175
- JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar"]
229
+ JRubySpec.files = PKG_FILES + ["lib/hpricot_scan.jar", "lib/fast_xs.jar"]
176
230
  JRubySpec.extensions = []
177
231
 
178
232
  JRUBY_PKG_DIR = "#{PKG}-jruby"
@@ -183,15 +237,10 @@ file JRUBY_PKG_DIR => [:ragel_java, :package] do
183
237
  mv PKG, JRUBY_PKG_DIR
184
238
  end
185
239
 
186
- desc "Cross-compile the hpricot_scan extension for JRuby"
187
- file "hpricot_scan_jruby" => [JRUBY_PKG_DIR] do
188
- Dir.chdir("#{JRUBY_PKG_DIR}/ext/hpricot_scan", &compile_java)
189
- mv "#{JRUBY_PKG_DIR}/ext/hpricot_scan/hpricot_scan.jar", "#{JRUBY_PKG_DIR}/#{ARCHLIB}"
190
- end
191
-
192
240
  desc "Build the RubyGems package for JRuby"
193
- task :package_jruby => ["hpricot_scan_jruby"] do
241
+ task :package_jruby => JRUBY_PKG_DIR do
194
242
  Dir.chdir("#{JRUBY_PKG_DIR}") do
243
+ Rake::Task[:hpricot_java].invoke
195
244
  Gem::Builder.new(JRubySpec).build
196
245
  verbose(true) {
197
246
  mv Dir["*.gem"].first, "../pkg/#{JRUBY_PKG_DIR}.gem"
@@ -0,0 +1,1018 @@
1
+
2
+ import java.io.IOException;
3
+ import java.io.StringWriter;
4
+ import java.io.Writer;
5
+ import java.util.HashMap;
6
+ import java.util.Map;
7
+ import java.util.TreeMap;
8
+ import org.jruby.Ruby;
9
+ import org.jruby.RubyModule;
10
+ import org.jruby.runtime.CallbackFactory;
11
+ import org.jruby.runtime.builtin.IRubyObject;
12
+ import org.jruby.runtime.load.BasicLibraryService;
13
+ import org.jruby.util.collections.IntHashMap;
14
+
15
+ public class FastXsService implements BasicLibraryService {
16
+
17
+ public boolean basicLoad(final Ruby runtime) throws IOException {
18
+ RubyModule string = runtime.getModule("String");
19
+ CallbackFactory fact = runtime.callbackFactory(FastXsService.class);
20
+ string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs"));
21
+ return true;
22
+ }
23
+
24
+ public static IRubyObject fast_xs(IRubyObject recv) {
25
+ String string = recv.convertToString().getUnicodeValue();
26
+ StringWriter writer = new StringWriter ((int)(string.length() * 1.5));
27
+ try {
28
+ Entities.XML.escape(writer, string);
29
+ return recv.getRuntime().newString(writer.toString());
30
+ } catch (IOException e) {
31
+ throw recv.getRuntime().newIOErrorFromException(e);
32
+ }
33
+ }
34
+ }
35
+
36
+ // From Apache commons-lang,
37
+ // http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup
38
+ /*
39
+ * Licensed to the Apache Software Foundation (ASF) under one or more
40
+ * contributor license agreements. See the NOTICE file distributed with
41
+ * this work for additional information regarding copyright ownership.
42
+ * The ASF licenses this file to You under the Apache License, Version 2.0
43
+ * (the "License"); you may not use this file except in compliance with
44
+ * the License. You may obtain a copy of the License at
45
+ *
46
+ * http://www.apache.org/licenses/LICENSE-2.0
47
+ *
48
+ * Unless required by applicable law or agreed to in writing, software
49
+ * distributed under the License is distributed on an "AS IS" BASIS,
50
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
51
+ * See the License for the specific language governing permissions and
52
+ * limitations under the License.
53
+ */
54
+
55
+ /**
56
+ * <p>
57
+ * Provides HTML and XML entity utilities.
58
+ * </p>
59
+ *
60
+ * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
61
+ * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
62
+ * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
63
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
64
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
65
+ *
66
+ * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
67
+ * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
68
+ * @since 2.0
69
+ * @version $Id$
70
+ */
71
+ class Entities {
72
+
73
+ private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
74
+ {"amp", "38"}, // & - ampersand
75
+ {"lt", "60"}, // < - less-than
76
+ {"gt", "62"}, // > - greater-than
77
+ };
78
+
79
+ private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
80
+ };
81
+
82
+ // package scoped for testing
83
+ static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
84
+ {"iexcl", "161"}, // inverted exclamation mark
85
+ {"cent", "162"}, // cent sign
86
+ {"pound", "163"}, // pound sign
87
+ {"curren", "164"}, // currency sign
88
+ {"yen", "165"}, // yen sign = yuan sign
89
+ {"brvbar", "166"}, // broken bar = broken vertical bar
90
+ {"sect", "167"}, // section sign
91
+ {"uml", "168"}, // diaeresis = spacing diaeresis
92
+ {"copy", "169"}, // © - copyright sign
93
+ {"ordf", "170"}, // feminine ordinal indicator
94
+ {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
95
+ {"not", "172"}, // not sign
96
+ {"shy", "173"}, // soft hyphen = discretionary hyphen
97
+ {"reg", "174"}, // ® - registered trademark sign
98
+ {"macr", "175"}, // macron = spacing macron = overline = APL overbar
99
+ {"deg", "176"}, // degree sign
100
+ {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
101
+ {"sup2", "178"}, // superscript two = superscript digit two = squared
102
+ {"sup3", "179"}, // superscript three = superscript digit three = cubed
103
+ {"acute", "180"}, // acute accent = spacing acute
104
+ {"micro", "181"}, // micro sign
105
+ {"para", "182"}, // pilcrow sign = paragraph sign
106
+ {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
107
+ {"cedil", "184"}, // cedilla = spacing cedilla
108
+ {"sup1", "185"}, // superscript one = superscript digit one
109
+ {"ordm", "186"}, // masculine ordinal indicator
110
+ {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
111
+ {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
112
+ {"frac12", "189"}, // vulgar fraction one half = fraction one half
113
+ {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
114
+ {"iquest", "191"}, // inverted question mark = turned question mark
115
+ {"Agrave", "192"}, // À - uppercase A, grave accent
116
+ {"Aacute", "193"}, // Á - uppercase A, acute accent
117
+ {"Acirc", "194"}, // Â - uppercase A, circumflex accent
118
+ {"Atilde", "195"}, // Ã - uppercase A, tilde
119
+ {"Auml", "196"}, // Ä - uppercase A, umlaut
120
+ {"Aring", "197"}, // Å - uppercase A, ring
121
+ {"AElig", "198"}, // Æ - uppercase AE
122
+ {"Ccedil", "199"}, // Ç - uppercase C, cedilla
123
+ {"Egrave", "200"}, // È - uppercase E, grave accent
124
+ {"Eacute", "201"}, // É - uppercase E, acute accent
125
+ {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
126
+ {"Euml", "203"}, // Ë - uppercase E, umlaut
127
+ {"Igrave", "204"}, // Ì - uppercase I, grave accent
128
+ {"Iacute", "205"}, // Í - uppercase I, acute accent
129
+ {"Icirc", "206"}, // Î - uppercase I, circumflex accent
130
+ {"Iuml", "207"}, // Ï - uppercase I, umlaut
131
+ {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
132
+ {"Ntilde", "209"}, // Ñ - uppercase N, tilde
133
+ {"Ograve", "210"}, // Ò - uppercase O, grave accent
134
+ {"Oacute", "211"}, // Ó - uppercase O, acute accent
135
+ {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
136
+ {"Otilde", "213"}, // Õ - uppercase O, tilde
137
+ {"Ouml", "214"}, // Ö - uppercase O, umlaut
138
+ {"times", "215"}, // multiplication sign
139
+ {"Oslash", "216"}, // Ø - uppercase O, slash
140
+ {"Ugrave", "217"}, // Ù - uppercase U, grave accent
141
+ {"Uacute", "218"}, // Ú - uppercase U, acute accent
142
+ {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
143
+ {"Uuml", "220"}, // Ü - uppercase U, umlaut
144
+ {"Yacute", "221"}, // Ý - uppercase Y, acute accent
145
+ {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
146
+ {"szlig", "223"}, // ß - lowercase sharp s, German
147
+ {"agrave", "224"}, // à - lowercase a, grave accent
148
+ {"aacute", "225"}, // á - lowercase a, acute accent
149
+ {"acirc", "226"}, // â - lowercase a, circumflex accent
150
+ {"atilde", "227"}, // ã - lowercase a, tilde
151
+ {"auml", "228"}, // ä - lowercase a, umlaut
152
+ {"aring", "229"}, // å - lowercase a, ring
153
+ {"aelig", "230"}, // æ - lowercase ae
154
+ {"ccedil", "231"}, // ç - lowercase c, cedilla
155
+ {"egrave", "232"}, // è - lowercase e, grave accent
156
+ {"eacute", "233"}, // é - lowercase e, acute accent
157
+ {"ecirc", "234"}, // ê - lowercase e, circumflex accent
158
+ {"euml", "235"}, // ë - lowercase e, umlaut
159
+ {"igrave", "236"}, // ì - lowercase i, grave accent
160
+ {"iacute", "237"}, // í - lowercase i, acute accent
161
+ {"icirc", "238"}, // î - lowercase i, circumflex accent
162
+ {"iuml", "239"}, // ï - lowercase i, umlaut
163
+ {"eth", "240"}, // ð - lowercase eth, Icelandic
164
+ {"ntilde", "241"}, // ñ - lowercase n, tilde
165
+ {"ograve", "242"}, // ò - lowercase o, grave accent
166
+ {"oacute", "243"}, // ó - lowercase o, acute accent
167
+ {"ocirc", "244"}, // ô - lowercase o, circumflex accent
168
+ {"otilde", "245"}, // õ - lowercase o, tilde
169
+ {"ouml", "246"}, // ö - lowercase o, umlaut
170
+ {"divide", "247"}, // division sign
171
+ {"oslash", "248"}, // ø - lowercase o, slash
172
+ {"ugrave", "249"}, // ù - lowercase u, grave accent
173
+ {"uacute", "250"}, // ú - lowercase u, acute accent
174
+ {"ucirc", "251"}, // û - lowercase u, circumflex accent
175
+ {"uuml", "252"}, // ü - lowercase u, umlaut
176
+ {"yacute", "253"}, // ý - lowercase y, acute accent
177
+ {"thorn", "254"}, // þ - lowercase thorn, Icelandic
178
+ {"yuml", "255"}, // ÿ - lowercase y, umlaut
179
+ };
180
+
181
+ // http://www.w3.org/TR/REC-html40/sgml/entities.html
182
+ // package scoped for testing
183
+ static final String[][] HTML40_ARRAY = {
184
+ // <!-- Latin Extended-B -->
185
+ {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
186
+ // <!-- Greek -->
187
+ {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
188
+ {"Beta", "914"}, // greek capital letter beta, U+0392 -->
189
+ {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
190
+ {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
191
+ {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
192
+ {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
193
+ {"Eta", "919"}, // greek capital letter eta, U+0397 -->
194
+ {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
195
+ {"Iota", "921"}, // greek capital letter iota, U+0399 -->
196
+ {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
197
+ {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
198
+ {"Mu", "924"}, // greek capital letter mu, U+039C -->
199
+ {"Nu", "925"}, // greek capital letter nu, U+039D -->
200
+ {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
201
+ {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
202
+ {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
203
+ {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
204
+ // <!-- there is no Sigmaf, and no U+03A2 character either -->
205
+ {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
206
+ {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
207
+ {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
208
+ {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
209
+ {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
210
+ {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
211
+ {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
212
+ {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
213
+ {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
214
+ {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
215
+ {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
216
+ {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
217
+ {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
218
+ {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
219
+ {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
220
+ {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
221
+ {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
222
+ {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
223
+ {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
224
+ {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
225
+ {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
226
+ {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
227
+ {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
228
+ {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
229
+ {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
230
+ {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
231
+ {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
232
+ {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
233
+ {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
234
+ {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
235
+ {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
236
+ {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
237
+ {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
238
+ {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
239
+ {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
240
+ // <!-- General Punctuation -->
241
+ {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
242
+ // <!-- bullet is NOT the same as bullet operator, U+2219 -->
243
+ {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
244
+ {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
245
+ {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
246
+ {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
247
+ {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
248
+ // <!-- Letterlike Symbols -->
249
+ {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
250
+ {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
251
+ {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
252
+ {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
253
+ {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
254
+ // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
255
+ // same glyph could be used to depict both characters -->
256
+ // <!-- Arrows -->
257
+ {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
258
+ {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
259
+ {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
260
+ {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
261
+ {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
262
+ {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
263
+ {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
264
+ // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
265
+ // arrow but also does not have any other character for that function.
266
+ // So ? lArr can be used for 'is implied by' as ISOtech suggests -->
267
+ {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
268
+ {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
269
+ // <!-- ISO 10646 does not say this is the 'implies' character but does not
270
+ // have another character with this function so ?rArr can be used for
271
+ // 'implies' as ISOtech suggests -->
272
+ {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
273
+ {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
274
+ // <!-- Mathematical Operators -->
275
+ {"forall", "8704"}, // for all, U+2200 ISOtech -->
276
+ {"part", "8706"}, // partial differential, U+2202 ISOtech -->
277
+ {"exist", "8707"}, // there exists, U+2203 ISOtech -->
278
+ {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
279
+ {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
280
+ {"isin", "8712"}, // element of, U+2208 ISOtech -->
281
+ {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
282
+ {"ni", "8715"}, // contains as member, U+220B ISOtech -->
283
+ // <!-- should there be a more memorable name than 'ni'? -->
284
+ {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
285
+ // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
286
+ // though the same glyph might be used for both -->
287
+ {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
288
+ // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
289
+ // though the same glyph might be used for both -->
290
+ {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
291
+ {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
292
+ {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
293
+ {"prop", "8733"}, // proportional to, U+221D ISOtech -->
294
+ {"infin", "8734"}, // infinity, U+221E ISOtech -->
295
+ {"ang", "8736"}, // angle, U+2220 ISOamso -->
296
+ {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
297
+ {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
298
+ {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
299
+ {"cup", "8746"}, // union = cup, U+222A ISOtech -->
300
+ {"int", "8747"}, // integral, U+222B ISOtech -->
301
+ {"there4", "8756"}, // therefore, U+2234 ISOtech -->
302
+ {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
303
+ // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
304
+ // the same glyph might be used to represent both -->
305
+ {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
306
+ {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
307
+ {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
308
+ {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
309
+ {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
310
+ {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
311
+ {"sub", "8834"}, // subset of, U+2282 ISOtech -->
312
+ {"sup", "8835"}, // superset of, U+2283 ISOtech -->
313
+ // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
314
+ // Symbol font encoding and is not included. Should it be, for symmetry?
315
+ // It is in ISOamsn --> <!ENTITY nsub", "8836"},
316
+ // not a subset of, U+2284 ISOamsn -->
317
+ {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
318
+ {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
319
+ {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
320
+ {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
321
+ {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
322
+ {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
323
+ // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
324
+ // <!-- Miscellaneous Technical -->
325
+ {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
326
+ {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
327
+ {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
328
+ {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
329
+ {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
330
+ // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
331
+ // mark' -->
332
+ {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
333
+ // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
334
+ // 'single right-pointing angle quotation mark' -->
335
+ // <!-- Geometric Shapes -->
336
+ {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
337
+ // <!-- Miscellaneous Symbols -->
338
+ {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
339
+ // <!-- black here seems to mean filled as opposed to hollow -->
340
+ {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
341
+ {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
342
+ {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
343
+
344
+ // <!-- Latin Extended-A -->
345
+ {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
346
+ {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
347
+ // <!-- ligature is a misnomer, this is a separate character in some languages -->
348
+ {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
349
+ {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
350
+ {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
351
+ // <!-- Spacing Modifier Letters -->
352
+ {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
353
+ {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
354
+ // <!-- General Punctuation -->
355
+ {"ensp", "8194"}, // en space, U+2002 ISOpub -->
356
+ {"emsp", "8195"}, // em space, U+2003 ISOpub -->
357
+ {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
358
+ {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
359
+ {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
360
+ {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
361
+ {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
362
+ {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
363
+ {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
364
+ {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
365
+ {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
366
+ {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
367
+ {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
368
+ {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
369
+ {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
370
+ {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
371
+ {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
372
+ {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
373
+ {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
374
+ // <!-- lsaquo is proposed but not yet ISO standardized -->
375
+ {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
376
+ // <!-- rsaquo is proposed but not yet ISO standardized -->
377
+ {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
378
+ };
379
+
380
/**
 * <p>
 * Shared instance holding the entities of standard XML (the basic table plus the apos table).
 * </p>
 */
public static final Entities XML;

/**
 * <p>
 * Shared instance holding the entities of HTML 3.2 (the basic table plus the ISO 8859-1 table).
 * </p>
 */
public static final Entities HTML32;

/**
 * <p>
 * Shared instance holding the entities of HTML 4.0.
 * </p>
 */
public static final Entities HTML40;

// Build each shared instance completely, then publish it through its constant.
static {
    Entities xml = new Entities();
    xml.addEntities(BASIC_ARRAY);
    xml.addEntities(APOS_ARRAY);
    XML = xml;

    Entities html32 = new Entities();
    html32.addEntities(BASIC_ARRAY);
    html32.addEntities(ISO8859_1_ARRAY);
    HTML32 = html32;

    Entities html40 = new Entities();
    fillWithHtml40Entities(html40);
    HTML40 = html40;
}
417
+
418
+ /**
419
+ * <p>
420
+ * Fills the specified entities instance with HTML 40 entities.
421
+ * </p>
422
+ *
423
+ * @param entities
424
+ * the instance to be filled.
425
+ */
426
+ static void fillWithHtml40Entities(Entities entities) {
427
+ entities.addEntities(BASIC_ARRAY);
428
+ entities.addEntities(ISO8859_1_ARRAY);
429
+ entities.addEntities(HTML40_ARRAY);
430
+ }
431
+
432
/**
 * Two-way association between entity names and their numeric values.
 */
interface EntityMap {
    /**
     * <p>
     * Registers a name/value pair with this map.
     * </p>
     *
     * @param name
     *            the entity name
     * @param value
     *            the numeric entity value
     */
    void add(String name, int value);

    /**
     * <p>
     * Looks up the entity name registered for a value.
     * </p>
     *
     * @param value
     *            the value to look up
     * @return the entity name registered for <code>value</code>
     */
    String name(int value);

    /**
     * <p>
     * Looks up the value registered for an entity name.
     * </p>
     *
     * @param name
     *            the entity name to look up
     * @return the value registered for <code>name</code>; the implementations in this class return -1 when the
     *         name is unknown
     */
    int value(String name);
}
467
+
468
+ static class PrimitiveEntityMap implements EntityMap {
469
+ private Map mapNameToValue = new HashMap();
470
+
471
+ private IntHashMap mapValueToName = new IntHashMap();
472
+
473
+ /**
474
+ * {@inheritDoc}
475
+ */
476
+ public void add(String name, int value) {
477
+ mapNameToValue.put(name, new Integer(value));
478
+ mapValueToName.put(value, name);
479
+ }
480
+
481
+ /**
482
+ * {@inheritDoc}
483
+ */
484
+ public String name(int value) {
485
+ return (String) mapValueToName.get(value);
486
+ }
487
+
488
+ /**
489
+ * {@inheritDoc}
490
+ */
491
+ public int value(String name) {
492
+ Object value = mapNameToValue.get(name);
493
+ if (value == null) {
494
+ return -1;
495
+ }
496
+ return ((Integer) value).intValue();
497
+ }
498
+ }
499
+
500
+ static abstract class MapIntMap implements Entities.EntityMap {
501
+ protected Map mapNameToValue;
502
+
503
+ protected Map mapValueToName;
504
+
505
+ /**
506
+ * {@inheritDoc}
507
+ */
508
+ public void add(String name, int value) {
509
+ mapNameToValue.put(name, new Integer(value));
510
+ mapValueToName.put(new Integer(value), name);
511
+ }
512
+
513
+ /**
514
+ * {@inheritDoc}
515
+ */
516
+ public String name(int value) {
517
+ return (String) mapValueToName.get(new Integer(value));
518
+ }
519
+
520
+ /**
521
+ * {@inheritDoc}
522
+ */
523
+ public int value(String name) {
524
+ Object value = mapNameToValue.get(name);
525
+ if (value == null) {
526
+ return -1;
527
+ }
528
+ return ((Integer) value).intValue();
529
+ }
530
+ }
531
+
532
+ static class HashEntityMap extends MapIntMap {
533
+ /**
534
+ * Constructs a new instance of <code>HashEntityMap</code>.
535
+ */
536
+ public HashEntityMap() {
537
+ mapNameToValue = new HashMap();
538
+ mapValueToName = new HashMap();
539
+ }
540
+ }
541
+
542
+ static class TreeEntityMap extends MapIntMap {
543
+ /**
544
+ * Constructs a new instance of <code>TreeEntityMap</code>.
545
+ */
546
+ public TreeEntityMap() {
547
+ mapNameToValue = new TreeMap();
548
+ mapValueToName = new TreeMap();
549
+ }
550
+ }
551
+
552
+ static class LookupEntityMap extends PrimitiveEntityMap {
553
+ private String[] lookupTable;
554
+
555
+ private int LOOKUP_TABLE_SIZE = 256;
556
+
557
+ /**
558
+ * {@inheritDoc}
559
+ */
560
+ public String name(int value) {
561
+ if (value < LOOKUP_TABLE_SIZE) {
562
+ return lookupTable()[value];
563
+ }
564
+ return super.name(value);
565
+ }
566
+
567
+ /**
568
+ * <p>
569
+ * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
570
+ * </p>
571
+ *
572
+ * @return the lookup table
573
+ */
574
+ private String[] lookupTable() {
575
+ if (lookupTable == null) {
576
+ createLookupTable();
577
+ }
578
+ return lookupTable;
579
+ }
580
+
581
+ /**
582
+ * <p>
583
+ * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
584
+ * </p>
585
+ */
586
+ private void createLookupTable() {
587
+ lookupTable = new String[LOOKUP_TABLE_SIZE];
588
+ for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
589
+ lookupTable[i] = super.name(i);
590
+ }
591
+ }
592
+ }
593
+
594
+ static class ArrayEntityMap implements EntityMap {
595
+ protected int growBy = 100;
596
+
597
+ protected int size = 0;
598
+
599
+ protected String[] names;
600
+
601
+ protected int[] values;
602
+
603
+ /**
604
+ * Constructs a new instance of <code>ArrayEntityMap</code>.
605
+ */
606
+ public ArrayEntityMap() {
607
+ names = new String[growBy];
608
+ values = new int[growBy];
609
+ }
610
+
611
+ /**
612
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
613
+ * grow.
614
+ *
615
+ * @param growBy
616
+ * array will be initialized to and will grow by this amount
617
+ */
618
+ public ArrayEntityMap(int growBy) {
619
+ this.growBy = growBy;
620
+ names = new String[growBy];
621
+ values = new int[growBy];
622
+ }
623
+
624
+ /**
625
+ * {@inheritDoc}
626
+ */
627
+ public void add(String name, int value) {
628
+ ensureCapacity(size + 1);
629
+ names[size] = name;
630
+ values[size] = value;
631
+ size++;
632
+ }
633
+
634
+ /**
635
+ * Verifies the capacity of the entity array, adjusting the size if necessary.
636
+ *
637
+ * @param capacity
638
+ * size the array should be
639
+ */
640
+ protected void ensureCapacity(int capacity) {
641
+ if (capacity > names.length) {
642
+ int newSize = Math.max(capacity, size + growBy);
643
+ String[] newNames = new String[newSize];
644
+ System.arraycopy(names, 0, newNames, 0, size);
645
+ names = newNames;
646
+ int[] newValues = new int[newSize];
647
+ System.arraycopy(values, 0, newValues, 0, size);
648
+ values = newValues;
649
+ }
650
+ }
651
+
652
+ /**
653
+ * {@inheritDoc}
654
+ */
655
+ public String name(int value) {
656
+ for (int i = 0; i < size; ++i) {
657
+ if (values[i] == value) {
658
+ return names[i];
659
+ }
660
+ }
661
+ return null;
662
+ }
663
+
664
+ /**
665
+ * {@inheritDoc}
666
+ */
667
+ public int value(String name) {
668
+ for (int i = 0; i < size; ++i) {
669
+ if (names[i].equals(name)) {
670
+ return values[i];
671
+ }
672
+ }
673
+ return -1;
674
+ }
675
+ }
676
+
677
+ static class BinaryEntityMap extends ArrayEntityMap {
678
+
679
+ /**
680
+ * Constructs a new instance of <code>BinaryEntityMap</code>.
681
+ */
682
+ public BinaryEntityMap() {
683
+ super();
684
+ }
685
+
686
+ /**
687
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
688
+ * should grow.
689
+ *
690
+ * @param growBy
691
+ * array will be initialized to and will grow by this amount
692
+ */
693
+ public BinaryEntityMap(int growBy) {
694
+ super(growBy);
695
+ }
696
+
697
+ /**
698
+ * Performs a binary search of the entity array for the specified key. This method is based on code in
699
+ * {@link java.util.Arrays}.
700
+ *
701
+ * @param key
702
+ * the key to be found
703
+ * @return the index of the entity array matching the specified key
704
+ */
705
+ private int binarySearch(int key) {
706
+ int low = 0;
707
+ int high = size - 1;
708
+
709
+ while (low <= high) {
710
+ int mid = (low + high) >> 1;
711
+ int midVal = values[mid];
712
+
713
+ if (midVal < key) {
714
+ low = mid + 1;
715
+ } else if (midVal > key) {
716
+ high = mid - 1;
717
+ } else {
718
+ return mid; // key found
719
+ }
720
+ }
721
+ return -(low + 1); // key not found.
722
+ }
723
+
724
+ /**
725
+ * {@inheritDoc}
726
+ */
727
+ public void add(String name, int value) {
728
+ ensureCapacity(size + 1);
729
+ int insertAt = binarySearch(value);
730
+ if (insertAt > 0) {
731
+ return; // note: this means you can't insert the same value twice
732
+ }
733
+ insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
734
+ System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
735
+ values[insertAt] = value;
736
+ System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
737
+ names[insertAt] = name;
738
+ size++;
739
+ }
740
+
741
+ /**
742
+ * {@inheritDoc}
743
+ */
744
+ public String name(int value) {
745
+ int index = binarySearch(value);
746
+ if (index < 0) {
747
+ return null;
748
+ }
749
+ return names[index];
750
+ }
751
+ }
752
+
753
+ // package scoped for testing
754
+ EntityMap map = new Entities.LookupEntityMap();
755
+
756
+ /**
757
+ * <p>
758
+ * Adds entities to this entity.
759
+ * </p>
760
+ *
761
+ * @param entityArray
762
+ * array of entities to be added
763
+ */
764
+ public void addEntities(String[][] entityArray) {
765
+ for (int i = 0; i < entityArray.length; ++i) {
766
+ addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
767
+ }
768
+ }
769
+
770
+ /**
771
+ * <p>
772
+ * Add an entity to this entity.
773
+ * </p>
774
+ *
775
+ * @param name
776
+ * name of the entity
777
+ * @param value
778
+ * vale of the entity
779
+ */
780
+ public void addEntity(String name, int value) {
781
+ map.add(name, value);
782
+ }
783
+
784
+ /**
785
+ * <p>
786
+ * Returns the name of the entity identified by the specified value.
787
+ * </p>
788
+ *
789
+ * @param value
790
+ * the value to locate
791
+ * @return entity name associated with the specified value
792
+ */
793
+ public String entityName(int value) {
794
+ return map.name(value);
795
+ }
796
+
797
+ /**
798
+ * <p>
799
+ * Returns the value of the entity identified by the specified name.
800
+ * </p>
801
+ *
802
+ * @param name
803
+ * the name to locate
804
+ * @return entity value associated with the specified name
805
+ */
806
+ public int entityValue(String name) {
807
+ return map.value(name);
808
+ }
809
+
810
+ /**
811
+ * <p>
812
+ * Escapes the characters in a <code>String</code>.
813
+ * </p>
814
+ *
815
+ * <p>
816
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), escape(&quot;\u00A1&quot;) will return
817
+ * &quot;&amp;foo;&quot;
818
+ * </p>
819
+ *
820
+ * @param str
821
+ * The <code>String</code> to escape.
822
+ * @return A new escaped <code>String</code>.
823
+ */
824
+ public String escape(String str) {
825
+ StringWriter stringWriter = createStringWriter(str);
826
+ try {
827
+ this.escape(stringWriter, str);
828
+ } catch (IOException e) {
829
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
830
+ // throw IOExceptions.
831
+ throw new RuntimeException(e);
832
+ }
833
+ return stringWriter.toString();
834
+ }
835
+
836
+ /**
837
+ * <p>
838
+ * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
839
+ * passed.
840
+ * </p>
841
+ *
842
+ * @param writer
843
+ * The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
844
+ * @param str
845
+ * The <code>String</code> to escape. Assumed to be a non-null value.
846
+ * @throws IOException
847
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
848
+ * methods.
849
+ *
850
+ * @see #escape(String)
851
+ * @see Writer
852
+ */
853
+ public void escape(Writer writer, String str) throws IOException {
854
+ int len = str.length();
855
+ for (int i = 0; i < len; i++) {
856
+ char c = str.charAt(i);
857
+ String entityName = this.entityName(c);
858
+ if (entityName == null) {
859
+ if (c > 0x7F) {
860
+ writer.write("&#");
861
+ writer.write(Integer.toString(c, 10));
862
+ writer.write(';');
863
+ } else {
864
+ writer.write(c);
865
+ }
866
+ } else {
867
+ writer.write('&');
868
+ writer.write(entityName);
869
+ writer.write(';');
870
+ }
871
+ }
872
+ }
873
+
874
+ /**
875
+ * <p>
876
+ * Unescapes the entities in a <code>String</code>.
877
+ * </p>
878
+ *
879
+ * <p>
880
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), unescape(&quot;&amp;foo;&quot;) will return
881
+ * &quot;\u00A1&quot;
882
+ * </p>
883
+ *
884
+ * @param str
885
+ * The <code>String</code> to escape.
886
+ * @return A new escaped <code>String</code>.
887
+ */
888
+ public String unescape(String str) {
889
+ int firstAmp = str.indexOf('&');
890
+ if (firstAmp < 0) {
891
+ return str;
892
+ } else {
893
+ StringWriter stringWriter = createStringWriter(str);
894
+ try {
895
+ this.doUnescape(stringWriter, str, firstAmp);
896
+ } catch (IOException e) {
897
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
898
+ // do not throw IOExceptions.
899
+ throw new RuntimeException(e);
900
+ }
901
+ return stringWriter.toString();
902
+ }
903
+ }
904
+
905
+ /**
906
+ * Make the StringWriter 10% larger than the source String to avoid growing the writer
907
+ *
908
+ * @param str The source string
909
+ * @return A newly created StringWriter
910
+ */
911
+ private StringWriter createStringWriter(String str) {
912
+ return new StringWriter((int) (str.length() + (str.length() * 0.1)));
913
+ }
914
+
915
+ /**
916
+ * <p>
917
+ * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
918
+ * <code>Writer</code> passed.
919
+ * </p>
920
+ *
921
+ * @param writer
922
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
923
+ * @param str
924
+ * The source <code>String</code> to unescape; assumed to be non-null.
925
+ * @throws IOException
926
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
927
+ * methods.
928
+ *
929
+ * @see #escape(String)
930
+ * @see Writer
931
+ */
932
+ public void unescape(Writer writer, String str) throws IOException {
933
+ int firstAmp = str.indexOf('&');
934
+ if (firstAmp < 0) {
935
+ writer.write(str);
936
+ return;
937
+ } else {
938
+ doUnescape(writer, str, firstAmp);
939
+ }
940
+ }
941
+
942
+ /**
943
+ * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
944
+ *
945
+ * @param writer
946
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
947
+ * @param str
948
+ * The source <code>String</code> to unescape; assumed to be non-null.
949
+ * @param firstAmp
950
+ * The <code>int</code> index of the first ampersand in the source String.
951
+ * @throws IOException
952
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
953
+ * methods.
954
+ */
955
+ private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
956
+ writer.write(str, 0, firstAmp);
957
+ int len = str.length();
958
+ for (int i = firstAmp; i < len; i++) {
959
+ char c = str.charAt(i);
960
+ if (c == '&') {
961
+ int nextIdx = i + 1;
962
+ int semiColonIdx = str.indexOf(';', nextIdx);
963
+ if (semiColonIdx == -1) {
964
+ writer.write(c);
965
+ continue;
966
+ }
967
+ int amphersandIdx = str.indexOf('&', i + 1);
968
+ if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
969
+ // Then the text looks like &...&...;
970
+ writer.write(c);
971
+ continue;
972
+ }
973
+ String entityContent = str.substring(nextIdx, semiColonIdx);
974
+ int entityValue = -1;
975
+ int entityContentLen = entityContent.length();
976
+ if (entityContentLen > 0) {
977
+ if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
978
+ // hexidecimal)
979
+ if (entityContentLen > 1) {
980
+ char isHexChar = entityContent.charAt(1);
981
+ try {
982
+ switch (isHexChar) {
983
+ case 'X' :
984
+ case 'x' : {
985
+ entityValue = Integer.parseInt(entityContent.substring(2), 16);
986
+ break;
987
+ }
988
+ default : {
989
+ entityValue = Integer.parseInt(entityContent.substring(1), 10);
990
+ }
991
+ }
992
+ if (entityValue > 0xFFFF) {
993
+ entityValue = -1;
994
+ }
995
+ } catch (NumberFormatException e) {
996
+ entityValue = -1;
997
+ }
998
+ }
999
+ } else { // escaped value content is an entity name
1000
+ entityValue = this.entityValue(entityContent);
1001
+ }
1002
+ }
1003
+
1004
+ if (entityValue == -1) {
1005
+ writer.write('&');
1006
+ writer.write(entityContent);
1007
+ writer.write(';');
1008
+ } else {
1009
+ writer.write(entityValue);
1010
+ }
1011
+ i = semiColonIdx; // move index up to the semi-colon
1012
+ } else {
1013
+ writer.write(c);
1014
+ }
1015
+ }
1016
+ }
1017
+
1018
+ }