hpricot 0.6-mswin32 → 0.6.164-mswin32

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -6,19 +6,32 @@ require 'rake/testtask'
6
6
  require 'fileutils'
7
7
  include FileUtils
8
8
 
9
+ RbConfig = Config unless defined?(RbConfig)
10
+
9
11
  NAME = "hpricot"
10
- REV = `svn info`[/Revision: (\d+)/, 1] rescue nil
12
+ REV = (`#{ENV['GIT'] || "git"} rev-list HEAD`.split.length + 1).to_s
11
13
  VERS = ENV['VERSION'] || "0.6" + (REV ? ".#{REV}" : "")
12
14
  PKG = "#{NAME}-#{VERS}"
13
- BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
14
- ARCHLIB = "lib/#{::Config::CONFIG['arch']}"
15
- CLEAN.include ["ext/hpricot_scan/#{BIN}", "lib/**/#{BIN}", 'ext/hpricot_scan/Makefile',
16
- '**/.*.sw?', '*.gem', '.config']
15
+ BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp,class}"
16
+ CLEAN.include ["ext/hpricot_scan/#{BIN}", "ext/fast_xs/#{BIN}", "lib/**/#{BIN}",
17
+ 'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
18
+ '**/.*.sw?', '*.gem', '.config', 'pkg']
17
19
  RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README', '--inline-source']
18
20
  PKG_FILES = %w(CHANGELOG COPYING README Rakefile) +
19
21
  Dir.glob("{bin,doc,test,lib,extras}/**/*") +
20
22
  Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
21
- %w[ext/hpricot_scan/hpricot_scan.c] # needed because it's generated later
23
+ %w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/HpricotScanService.java] # needed because they are generated later
24
+ RAGEL_C_CODE_GENERATION_STYLES = {
25
+ "table_driven" => 'T0',
26
+ "faster_table_driven" => 'T1',
27
+ "flat_table_driven" => 'F0',
28
+ "faster_flat_table_driven" => 'F1',
29
+ "goto_driven" => 'G0',
30
+ "faster_goto_driven" => 'G1',
31
+ "really_fast goto_driven" => 'G2'
32
+ # "n_way_split_really_fast_goto_driven" => 'P<N>'
33
+ }
34
+ DEFAULT_RAGEL_C_CODE_GENERATION = "really_fast goto_driven"
22
35
  SPEC =
23
36
  Gem::Specification.new do |s|
24
37
  s.name = NAME
@@ -32,12 +45,20 @@ SPEC =
32
45
  s.author = "why the lucky stiff"
33
46
  s.email = 'why@ruby-lang.org'
34
47
  s.homepage = 'http://code.whytheluckystiff.net/hpricot/'
48
+ s.rubyforge_project = 'hobix'
35
49
  s.files = PKG_FILES
36
- s.require_paths = [ARCHLIB, "lib"]
50
+ s.require_paths = ["lib"]
37
51
  s.extensions = FileList["ext/**/extconf.rb"].to_a
38
52
  s.bindir = "bin"
39
53
  end
40
54
 
55
+ Win32Spec = SPEC.dup
56
+ Win32Spec.platform = 'mswin32'
57
+ Win32Spec.files = PKG_FILES + ["lib/hpricot_scan.so", "lib/fast_xs.so"]
58
+ Win32Spec.extensions = []
59
+
60
+ WIN32_PKG_DIR = "#{PKG}-mswin32"
61
+
41
62
  desc "Does a full compile, test run"
42
63
  task :default => [:compile, :test]
43
64
 
@@ -49,7 +70,7 @@ task :release => [:package, :package_win32, :package_jruby]
49
70
 
50
71
  desc "Run all the tests"
51
72
  Rake::TestTask.new do |t|
52
- t.libs << "test" << ARCHLIB
73
+ t.libs << "test"
53
74
  t.test_files = FileList['test/test_*.rb']
54
75
  t.verbose = true
55
76
  end
@@ -66,25 +87,47 @@ Rake::GemPackageTask.new(SPEC) do |p|
66
87
  p.gem_spec = SPEC
67
88
  end
68
89
 
69
- extension = "hpricot_scan"
70
- ext = "ext/hpricot_scan"
71
- ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
72
- ext_files = FileList[
73
- "#{ext}/*.c",
74
- "#{ext}/*.h",
75
- "#{ext}/*.rl",
76
- "#{ext}/extconf.rb",
77
- "#{ext}/Makefile",
78
- "lib"
79
- ]
90
+ ['hpricot_scan', 'fast_xs'].each do |extension|
91
+ ext = "ext/#{extension}"
92
+ ext_so = "#{ext}/#{extension}.#{Config::CONFIG['DLEXT']}"
93
+ ext_files = FileList[
94
+ "#{ext}/*.c",
95
+ "#{ext}/*.h",
96
+ "#{ext}/*.rl",
97
+ "#{ext}/extconf.rb",
98
+ "#{ext}/Makefile",
99
+ "lib"
100
+ ]
101
+
102
+ desc "Builds just the #{extension} extension"
103
+ task extension.to_sym => ["#{ext}/Makefile", ext_so ]
104
+
105
+ file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
106
+ Dir.chdir(ext) do ruby "extconf.rb" end
107
+ end
108
+
109
+ file ext_so => ext_files do
110
+ Dir.chdir(ext) do
111
+ sh(RUBY_PLATFORM =~ /win32/ ? 'nmake' : 'make')
112
+ end
113
+ cp ext_so, "lib"
114
+ end
115
+
116
+ desc "Cross-compile the #{extension} extension for win32"
117
+ file "#{extension}_win32" => [WIN32_PKG_DIR] do
118
+ cp "extras/mingw-rbconfig.rb", "#{WIN32_PKG_DIR}/ext/#{extension}/rbconfig.rb"
119
+ sh "cd #{WIN32_PKG_DIR}/ext/#{extension}/ && ruby -I. extconf.rb && make"
120
+ mv "#{WIN32_PKG_DIR}/ext/#{extension}/#{extension}.so", "#{WIN32_PKG_DIR}/lib"
121
+ end
122
+ end
80
123
 
81
124
  task "lib" do
82
125
  directory "lib"
83
126
  end
84
127
 
85
128
  desc "Compiles the Ruby extension"
86
- task :compile => [:hpricot_scan] do
87
- if Dir.glob(File.join(ARCHLIB,"hpricot_scan.*")).length == 0
129
+ task :compile => [:hpricot_scan, :fast_xs] do
130
+ if Dir.glob(File.join("lib","hpricot_scan.*")).length == 0
88
131
  STDERR.puts "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
89
132
  STDERR.puts "Gem actually failed to build. Your system is"
90
133
  STDERR.puts "NOT configured properly to build hpricot."
@@ -94,60 +137,47 @@ task :compile => [:hpricot_scan] do
94
137
  end
95
138
  task :hpricot_scan => [:ragel]
96
139
 
97
- desc "Builds just the #{extension} extension"
98
- task extension.to_sym => ["#{ext}/Makefile", ext_so ]
99
-
100
- file "#{ext}/Makefile" => ["#{ext}/extconf.rb"] do
101
- Dir.chdir(ext) do ruby "extconf.rb" end
102
- end
103
-
104
- file ext_so => ext_files do
105
- Dir.chdir(ext) do
106
- sh(PLATFORM =~ /win32/ ? 'nmake' : 'make')
107
- end
108
- mkdir_p ARCHLIB
109
- cp ext_so, ARCHLIB
110
- end
111
-
112
- desc "returns the ragel version"
140
+ desc "Determines the Ragel version and displays it on the console along with the location of the Ragel binary."
113
141
  task :ragel_version do
114
142
  @ragel_v = `ragel -v`[/(version )(\S*)/,2].to_f
143
+ puts "Using ragel version: #{@ragel_v}, location: #{`which ragel`}"
144
+ @ragel_v
115
145
  end
116
146
 
117
147
  desc "Generates the C scanner code with Ragel."
118
148
  task :ragel => [:ragel_version] do
119
- sh %{ragel ext/hpricot_scan/hpricot_scan.rl | #{@ragel_v >= 5.18 ? 'rlgen-cd' : 'rlcodegen'} -G2 -o ext/hpricot_scan/hpricot_scan.c}
149
+ if @ragel_v >= 6.1
150
+ @ragel_c_code_generation_style = RAGEL_C_CODE_GENERATION_STYLES[DEFAULT_RAGEL_C_CODE_GENERATION]
151
+ sh %{cd ext/hpricot_scan; ragel hpricot_scan.rl -#{@ragel_c_code_generation_style} -o hpricot_scan.c}
152
+ else
153
+ STDERR.puts "Ragel 6.1 or greater is required."
154
+ exit(1)
155
+ end
120
156
  end
121
157
 
122
- desc "Generates the Java scanner code with Ragel."
158
+ # Java only supports the table-driven code
159
+ # generation style at this point.
160
+ desc "Generates the Java scanner code using the Ragel table-driven code generation style."
123
161
  task :ragel_java => [:ragel_version] do
124
- sh %{ragel -J ext/hpricot_scan/hpricot_scan.java.rl | #{@ragel_v >= 5.18 ? 'rlgen-java' : 'rlcodegen'} -o ext/hpricot_scan/HpricotScanService.java}
162
+ if @ragel_v >= 6.1
163
+ puts "compiling with ragel version #{@ragel_v}"
164
+ sh %{ragel -J -o ext/hpricot_scan/HpricotScanService.java ext/hpricot_scan/hpricot_scan.java.rl}
165
+ else
166
+ STDERR.puts "Ragel 6.1 or greater is required."
167
+ exit(1)
168
+ end
125
169
  end
126
170
 
127
171
  ### Win32 Packages ###
128
172
 
129
- Win32Spec = SPEC.dup
130
- Win32Spec.platform = Gem::Platform::WIN32
131
- Win32Spec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.so"]
132
- Win32Spec.extensions = []
133
-
134
- WIN32_PKG_DIR = "#{PKG}-mswin32"
135
-
136
173
  desc "Package up the Win32 distribution."
137
174
  file WIN32_PKG_DIR => [:package] do
138
175
  sh "tar zxf pkg/#{PKG}.tgz"
139
176
  mv PKG, WIN32_PKG_DIR
140
177
  end
141
178
 
142
- desc "Cross-compile the hpricot_scan extension for win32"
143
- file "hpricot_scan_win32" => [WIN32_PKG_DIR] do
144
- cp "extras/mingw-rbconfig.rb", "#{WIN32_PKG_DIR}/ext/hpricot_scan/rbconfig.rb"
145
- sh "cd #{WIN32_PKG_DIR}/ext/hpricot_scan/ && ruby -I. extconf.rb && make"
146
- mv "#{WIN32_PKG_DIR}/ext/hpricot_scan/hpricot_scan.so", "#{WIN32_PKG_DIR}/#{ARCHLIB}"
147
- end
148
-
149
179
  desc "Build the binary RubyGems package for win32"
150
- task :package_win32 => ["hpricot_scan_win32"] do
180
+ task :package_win32 => ["fast_xs_win32", "hpricot_scan_win32"] do
151
181
  Dir.chdir("#{WIN32_PKG_DIR}") do
152
182
  Gem::Builder.new(Win32Spec).build
153
183
  verbose(true) {
@@ -160,19 +190,43 @@ CLEAN.include WIN32_PKG_DIR
160
190
 
161
191
  ### JRuby Packages ###
162
192
 
163
- compile_java = proc do
164
- sh %{javac -source 1.4 -target 1.4 -classpath $JRUBY_HOME/lib/jruby.jar HpricotScanService.java}
165
- sh %{jar cf hpricot_scan.jar HpricotScanService.class}
193
+ def java_classpath_arg
194
+ # A myriad of ways to discover the JRuby classpath
195
+ classpath = begin
196
+ require 'java'
197
+ # Already running in a JRuby JVM
198
+ Java::java.lang.System.getProperty('java.class.path')
199
+ rescue LoadError
200
+ ENV['JRUBY_PARENT_CLASSPATH'] || ENV['JRUBY_HOME'] && FileList["#{ENV['JRUBY_HOME']}/lib/*.jar"].join(File::PATH_SEPARATOR)
201
+ end
202
+ classpath ? "-cp #{classpath}" : ""
203
+ end
204
+
205
+ def compile_java(filename, jarname)
206
+ sh %{javac -source 1.4 -target 1.4 #{java_classpath_arg} #{filename}}
207
+ sh %{jar cf #{jarname} *.class}
166
208
  end
167
209
 
168
- desc "Compiles the JRuby extension"
169
210
  task :hpricot_scan_java => [:ragel_java] do
170
- Dir.chdir("ext/hpricot_scan", &compile_java)
211
+ Dir.chdir "ext/hpricot_scan" do
212
+ compile_java("HpricotScanService.java", "hpricot_scan.jar")
213
+ end
214
+ end
215
+
216
+ task :fast_xs_java do
217
+ Dir.chdir "ext/fast_xs" do
218
+ compile_java("FastXsService.java", "fast_xs.jar")
219
+ end
220
+ end
221
+
222
+ desc "Compiles the JRuby extensions"
223
+ task :hpricot_java => [:hpricot_scan_java, :fast_xs_java] do
224
+ %w(hpricot_scan fast_xs).each {|ext| mv "ext/#{ext}/#{ext}.jar", "lib"}
171
225
  end
172
226
 
173
227
  JRubySpec = SPEC.dup
174
228
  JRubySpec.platform = 'jruby'
175
- JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar"]
229
+ JRubySpec.files = PKG_FILES + ["lib/hpricot_scan.jar", "lib/fast_xs.jar"]
176
230
  JRubySpec.extensions = []
177
231
 
178
232
  JRUBY_PKG_DIR = "#{PKG}-jruby"
@@ -183,15 +237,10 @@ file JRUBY_PKG_DIR => [:ragel_java, :package] do
183
237
  mv PKG, JRUBY_PKG_DIR
184
238
  end
185
239
 
186
- desc "Cross-compile the hpricot_scan extension for JRuby"
187
- file "hpricot_scan_jruby" => [JRUBY_PKG_DIR] do
188
- Dir.chdir("#{JRUBY_PKG_DIR}/ext/hpricot_scan", &compile_java)
189
- mv "#{JRUBY_PKG_DIR}/ext/hpricot_scan/hpricot_scan.jar", "#{JRUBY_PKG_DIR}/#{ARCHLIB}"
190
- end
191
-
192
240
  desc "Build the RubyGems package for JRuby"
193
- task :package_jruby => ["hpricot_scan_jruby"] do
241
+ task :package_jruby => JRUBY_PKG_DIR do
194
242
  Dir.chdir("#{JRUBY_PKG_DIR}") do
243
+ Rake::Task[:hpricot_java].invoke
195
244
  Gem::Builder.new(JRubySpec).build
196
245
  verbose(true) {
197
246
  mv Dir["*.gem"].first, "../pkg/#{JRUBY_PKG_DIR}.gem"
@@ -0,0 +1,1018 @@
1
+
2
+ import java.io.IOException;
3
+ import java.io.StringWriter;
4
+ import java.io.Writer;
5
+ import java.util.HashMap;
6
+ import java.util.Map;
7
+ import java.util.TreeMap;
8
+ import org.jruby.Ruby;
9
+ import org.jruby.RubyModule;
10
+ import org.jruby.runtime.CallbackFactory;
11
+ import org.jruby.runtime.builtin.IRubyObject;
12
+ import org.jruby.runtime.load.BasicLibraryService;
13
+ import org.jruby.util.collections.IntHashMap;
14
+
15
+ public class FastXsService implements BasicLibraryService {
16
+
17
+ public boolean basicLoad(final Ruby runtime) throws IOException {
18
+ RubyModule string = runtime.getModule("String");
19
+ CallbackFactory fact = runtime.callbackFactory(FastXsService.class);
20
+ string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs"));
21
+ return true;
22
+ }
23
+
24
+ public static IRubyObject fast_xs(IRubyObject recv) {
25
+ String string = recv.convertToString().getUnicodeValue();
26
+ StringWriter writer = new StringWriter ((int)(string.length() * 1.5));
27
+ try {
28
+ Entities.XML.escape(writer, string);
29
+ return recv.getRuntime().newString(writer.toString());
30
+ } catch (IOException e) {
31
+ throw recv.getRuntime().newIOErrorFromException(e);
32
+ }
33
+ }
34
+ }
35
+
36
+ // From Apache commons-lang,
37
+ // http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup
38
+ /*
39
+ * Licensed to the Apache Software Foundation (ASF) under one or more
40
+ * contributor license agreements. See the NOTICE file distributed with
41
+ * this work for additional information regarding copyright ownership.
42
+ * The ASF licenses this file to You under the Apache License, Version 2.0
43
+ * (the "License"); you may not use this file except in compliance with
44
+ * the License. You may obtain a copy of the License at
45
+ *
46
+ * http://www.apache.org/licenses/LICENSE-2.0
47
+ *
48
+ * Unless required by applicable law or agreed to in writing, software
49
+ * distributed under the License is distributed on an "AS IS" BASIS,
50
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
51
+ * See the License for the specific language governing permissions and
52
+ * limitations under the License.
53
+ */
54
+
55
+ /**
56
+ * <p>
57
+ * Provides HTML and XML entity utilities.
58
+ * </p>
59
+ *
60
+ * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
61
+ * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
62
+ * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
63
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
64
+ * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
65
+ *
66
+ * @author <a href="mailto:alex@purpletech.com">Alexander Day Chaffee</a>
67
+ * @author <a href="mailto:ggregory@seagullsw.com">Gary Gregory</a>
68
+ * @since 2.0
69
+ * @version $Id$
70
+ */
71
+ class Entities {
72
+
73
+ private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote
74
+ {"amp", "38"}, // & - ampersand
75
+ {"lt", "60"}, // < - less-than
76
+ {"gt", "62"}, // > - greater-than
77
+ };
78
+
79
+ private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe
80
+ };
81
+
82
+ // package scoped for testing
83
+ static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space
84
+ {"iexcl", "161"}, // inverted exclamation mark
85
+ {"cent", "162"}, // cent sign
86
+ {"pound", "163"}, // pound sign
87
+ {"curren", "164"}, // currency sign
88
+ {"yen", "165"}, // yen sign = yuan sign
89
+ {"brvbar", "166"}, // broken bar = broken vertical bar
90
+ {"sect", "167"}, // section sign
91
+ {"uml", "168"}, // diaeresis = spacing diaeresis
92
+ {"copy", "169"}, // © - copyright sign
93
+ {"ordf", "170"}, // feminine ordinal indicator
94
+ {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet
95
+ {"not", "172"}, // not sign
96
+ {"shy", "173"}, // soft hyphen = discretionary hyphen
97
+ {"reg", "174"}, // ® - registered trademark sign
98
+ {"macr", "175"}, // macron = spacing macron = overline = APL overbar
99
+ {"deg", "176"}, // degree sign
100
+ {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign
101
+ {"sup2", "178"}, // superscript two = superscript digit two = squared
102
+ {"sup3", "179"}, // superscript three = superscript digit three = cubed
103
+ {"acute", "180"}, // acute accent = spacing acute
104
+ {"micro", "181"}, // micro sign
105
+ {"para", "182"}, // pilcrow sign = paragraph sign
106
+ {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot
107
+ {"cedil", "184"}, // cedilla = spacing cedilla
108
+ {"sup1", "185"}, // superscript one = superscript digit one
109
+ {"ordm", "186"}, // masculine ordinal indicator
110
+ {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet
111
+ {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter
112
+ {"frac12", "189"}, // vulgar fraction one half = fraction one half
113
+ {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters
114
+ {"iquest", "191"}, // inverted question mark = turned question mark
115
+ {"Agrave", "192"}, // À - uppercase A, grave accent
116
+ {"Aacute", "193"}, // Á - uppercase A, acute accent
117
+ {"Acirc", "194"}, // Â - uppercase A, circumflex accent
118
+ {"Atilde", "195"}, // Ã - uppercase A, tilde
119
+ {"Auml", "196"}, // Ä - uppercase A, umlaut
120
+ {"Aring", "197"}, // Å - uppercase A, ring
121
+ {"AElig", "198"}, // Æ - uppercase AE
122
+ {"Ccedil", "199"}, // Ç - uppercase C, cedilla
123
+ {"Egrave", "200"}, // È - uppercase E, grave accent
124
+ {"Eacute", "201"}, // É - uppercase E, acute accent
125
+ {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent
126
+ {"Euml", "203"}, // Ë - uppercase E, umlaut
127
+ {"Igrave", "204"}, // Ì - uppercase I, grave accent
128
+ {"Iacute", "205"}, // Í - uppercase I, acute accent
129
+ {"Icirc", "206"}, // Î - uppercase I, circumflex accent
130
+ {"Iuml", "207"}, // Ï - uppercase I, umlaut
131
+ {"ETH", "208"}, // Ð - uppercase Eth, Icelandic
132
+ {"Ntilde", "209"}, // Ñ - uppercase N, tilde
133
+ {"Ograve", "210"}, // Ò - uppercase O, grave accent
134
+ {"Oacute", "211"}, // Ó - uppercase O, acute accent
135
+ {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent
136
+ {"Otilde", "213"}, // Õ - uppercase O, tilde
137
+ {"Ouml", "214"}, // Ö - uppercase O, umlaut
138
+ {"times", "215"}, // multiplication sign
139
+ {"Oslash", "216"}, // Ø - uppercase O, slash
140
+ {"Ugrave", "217"}, // Ù - uppercase U, grave accent
141
+ {"Uacute", "218"}, // Ú - uppercase U, acute accent
142
+ {"Ucirc", "219"}, // Û - uppercase U, circumflex accent
143
+ {"Uuml", "220"}, // Ü - uppercase U, umlaut
144
+ {"Yacute", "221"}, // Ý - uppercase Y, acute accent
145
+ {"THORN", "222"}, // Þ - uppercase THORN, Icelandic
146
+ {"szlig", "223"}, // ß - lowercase sharps, German
147
+ {"agrave", "224"}, // à - lowercase a, grave accent
148
+ {"aacute", "225"}, // á - lowercase a, acute accent
149
+ {"acirc", "226"}, // â - lowercase a, circumflex accent
150
+ {"atilde", "227"}, // ã - lowercase a, tilde
151
+ {"auml", "228"}, // ä - lowercase a, umlaut
152
+ {"aring", "229"}, // å - lowercase a, ring
153
+ {"aelig", "230"}, // æ - lowercase ae
154
+ {"ccedil", "231"}, // ç - lowercase c, cedilla
155
+ {"egrave", "232"}, // è - lowercase e, grave accent
156
+ {"eacute", "233"}, // é - lowercase e, acute accent
157
+ {"ecirc", "234"}, // ê - lowercase e, circumflex accent
158
+ {"euml", "235"}, // ë - lowercase e, umlaut
159
+ {"igrave", "236"}, // ì - lowercase i, grave accent
160
+ {"iacute", "237"}, // í - lowercase i, acute accent
161
+ {"icirc", "238"}, // î - lowercase i, circumflex accent
162
+ {"iuml", "239"}, // ï - lowercase i, umlaut
163
+ {"eth", "240"}, // ð - lowercase eth, Icelandic
164
+ {"ntilde", "241"}, // ñ - lowercase n, tilde
165
+ {"ograve", "242"}, // ò - lowercase o, grave accent
166
+ {"oacute", "243"}, // ó - lowercase o, acute accent
167
+ {"ocirc", "244"}, // ô - lowercase o, circumflex accent
168
+ {"otilde", "245"}, // õ - lowercase o, tilde
169
+ {"ouml", "246"}, // ö - lowercase o, umlaut
170
+ {"divide", "247"}, // division sign
171
+ {"oslash", "248"}, // ø - lowercase o, slash
172
+ {"ugrave", "249"}, // ù - lowercase u, grave accent
173
+ {"uacute", "250"}, // ú - lowercase u, acute accent
174
+ {"ucirc", "251"}, // û - lowercase u, circumflex accent
175
+ {"uuml", "252"}, // ü - lowercase u, umlaut
176
+ {"yacute", "253"}, // ý - lowercase y, acute accent
177
+ {"thorn", "254"}, // þ - lowercase thorn, Icelandic
178
+ {"yuml", "255"}, // ÿ - lowercase y, umlaut
179
+ };
180
+
181
+ // http://www.w3.org/TR/REC-html40/sgml/entities.html
182
+ // package scoped for testing
183
+ static final String[][] HTML40_ARRAY = {
184
+ // <!-- Latin Extended-B -->
185
+ {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech -->
186
+ // <!-- Greek -->
187
+ {"Alpha", "913"}, // greek capital letter alpha, U+0391 -->
188
+ {"Beta", "914"}, // greek capital letter beta, U+0392 -->
189
+ {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 -->
190
+ {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 -->
191
+ {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 -->
192
+ {"Zeta", "918"}, // greek capital letter zeta, U+0396 -->
193
+ {"Eta", "919"}, // greek capital letter eta, U+0397 -->
194
+ {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 -->
195
+ {"Iota", "921"}, // greek capital letter iota, U+0399 -->
196
+ {"Kappa", "922"}, // greek capital letter kappa, U+039A -->
197
+ {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 -->
198
+ {"Mu", "924"}, // greek capital letter mu, U+039C -->
199
+ {"Nu", "925"}, // greek capital letter nu, U+039D -->
200
+ {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 -->
201
+ {"Omicron", "927"}, // greek capital letter omicron, U+039F -->
202
+ {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 -->
203
+ {"Rho", "929"}, // greek capital letter rho, U+03A1 -->
204
+ // <!-- there is no Sigmaf, and no U+03A2 character either -->
205
+ {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 -->
206
+ {"Tau", "932"}, // greek capital letter tau, U+03A4 -->
207
+ {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 -->
208
+ {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 -->
209
+ {"Chi", "935"}, // greek capital letter chi, U+03A7 -->
210
+ {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 -->
211
+ {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 -->
212
+ {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 -->
213
+ {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 -->
214
+ {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 -->
215
+ {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 -->
216
+ {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 -->
217
+ {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 -->
218
+ {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 -->
219
+ {"theta", "952"}, // greek small letter theta,U+03B8 ISOgrk3 -->
220
+ {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 -->
221
+ {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 -->
222
+ {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 -->
223
+ {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 -->
224
+ {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 -->
225
+ {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 -->
226
+ {"omicron", "959"}, // greek small letter omicron, U+03BF NEW -->
227
+ {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 -->
228
+ {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 -->
229
+ {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 -->
230
+ {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 -->
231
+ {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 -->
232
+ {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 -->
233
+ {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 -->
234
+ {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 -->
235
+ {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 -->
236
+ {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 -->
237
+ {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW -->
238
+ {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW -->
239
+ {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 -->
240
+ // <!-- General Punctuation -->
241
+ {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub -->
242
+ // <!-- bullet is NOT the same as bullet operator, U+2219 -->
243
+ {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub -->
244
+ {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech -->
245
+ {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech -->
246
+ {"oline", "8254"}, // overline = spacing overscore,U+203E NEW -->
247
+ {"frasl", "8260"}, // fraction slash, U+2044 NEW -->
248
+ // <!-- Letterlike Symbols -->
249
+ {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso -->
250
+ {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso -->
251
+ {"real", "8476"}, // blackletter capital R = real part symbol,U+211C ISOamso -->
252
+ {"trade", "8482"}, // trade mark sign, U+2122 ISOnum -->
253
+ {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW -->
254
+ // <!-- alef symbol is NOT the same as hebrew letter alef,U+05D0 although the
255
+ // same glyph could be used to depict both characters -->
256
+ // <!-- Arrows -->
257
+ {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum -->
258
+ {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum-->
259
+ {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum -->
260
+ {"darr", "8595"}, // downwards arrow, U+2193 ISOnum -->
261
+ {"harr", "8596"}, // left right arrow, U+2194 ISOamsa -->
262
+ {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW -->
263
+ {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech -->
264
+ // <!-- ISO 10646 does not say that lArr is the same as the 'is implied by'
265
+ // arrow but also does not have any other character for that function.
266
+ // So ? lArr canbe used for 'is implied by' as ISOtech suggests -->
267
+ {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa -->
268
+ {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech -->
269
+ // <!-- ISO 10646 does not say this is the 'implies' character but does not
270
+ // have another character with this function so ?rArr can be used for
271
+ // 'implies' as ISOtech suggests -->
272
+ {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa -->
273
+ {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa -->
274
+ // <!-- Mathematical Operators -->
275
+ {"forall", "8704"}, // for all, U+2200 ISOtech -->
276
+ {"part", "8706"}, // partial differential, U+2202 ISOtech -->
277
+ {"exist", "8707"}, // there exists, U+2203 ISOtech -->
278
+ {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso -->
279
+ {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech -->
280
+ {"isin", "8712"}, // element of, U+2208 ISOtech -->
281
+ {"notin", "8713"}, // not an element of, U+2209 ISOtech -->
282
+ {"ni", "8715"}, // contains as member, U+220B ISOtech -->
283
+ // <!-- should there be a more memorable name than 'ni'? -->
284
+ {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb -->
285
+ // <!-- prod is NOT the same character as U+03A0 'greek capital letter pi'
286
+ // though the same glyph might be used for both -->
287
+ {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb -->
288
+ // <!-- sum is NOT the same character as U+03A3 'greek capital letter sigma'
289
+ // though the same glyph might be used for both -->
290
+ {"minus", "8722"}, // minus sign, U+2212 ISOtech -->
291
+ {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech -->
292
+ {"radic", "8730"}, // square root = radical sign,U+221A ISOtech -->
293
+ {"prop", "8733"}, // proportional to, U+221D ISOtech -->
294
+ {"infin", "8734"}, // infinity, U+221E ISOtech -->
295
+ {"ang", "8736"}, // angle, U+2220 ISOamso -->
296
+ {"and", "8743"}, // logical and = wedge, U+2227 ISOtech -->
297
+ {"or", "8744"}, // logical or = vee, U+2228 ISOtech -->
298
+ {"cap", "8745"}, // intersection = cap, U+2229 ISOtech -->
299
+ {"cup", "8746"}, // union = cup, U+222A ISOtech -->
300
+ {"int", "8747"}, // integral, U+222B ISOtech -->
301
+ {"there4", "8756"}, // therefore, U+2234 ISOtech -->
302
+ {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech -->
303
+ // <!-- tilde operator is NOT the same character as the tilde, U+007E,although
304
+ // the same glyph might be used to represent both -->
305
+ {"cong", "8773"}, // approximately equal to, U+2245 ISOtech -->
306
+ {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr -->
307
+ {"ne", "8800"}, // not equal to, U+2260 ISOtech -->
308
+ {"equiv", "8801"}, // identical to, U+2261 ISOtech -->
309
+ {"le", "8804"}, // less-than or equal to, U+2264 ISOtech -->
310
+ {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech -->
311
+ {"sub", "8834"}, // subset of, U+2282 ISOtech -->
312
+ {"sup", "8835"}, // superset of, U+2283 ISOtech -->
313
+ // <!-- note that nsup, 'not a superset of, U+2283' is not covered by the
314
+ // Symbol font encoding and is not included. Should it be, for symmetry?
315
+ // It is in ISOamsn --> <!ENTITY nsub", "8836"},
316
+ // not a subset of, U+2284 ISOamsn -->
317
+ {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech -->
318
+ {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech -->
319
+ {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb -->
320
+ {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb -->
321
+ {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech -->
322
+ {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb -->
323
+ // <!-- dot operator is NOT the same character as U+00B7 middle dot -->
324
+ // <!-- Miscellaneous Technical -->
325
+ {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc -->
326
+ {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc -->
327
+ {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc -->
328
+ {"rfloor", "8971"}, // right floor, U+230B ISOamsc -->
329
+ {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech -->
330
+ // <!-- lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation
331
+ // mark' -->
332
+ {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech -->
333
+ // <!-- rang is NOT the same character as U+003E 'greater than' or U+203A
334
+ // 'single right-pointing angle quotation mark' -->
335
+ // <!-- Geometric Shapes -->
336
+ {"loz", "9674"}, // lozenge, U+25CA ISOpub -->
337
+ // <!-- Miscellaneous Symbols -->
338
+ {"spades", "9824"}, // black spade suit, U+2660 ISOpub -->
339
+ // <!-- black here seems to mean filled as opposed to hollow -->
340
+ {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub -->
341
+ {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub -->
342
+ {"diams", "9830"}, // black diamond suit, U+2666 ISOpub -->
343
+
344
+ // <!-- Latin Extended-A -->
345
+ {"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 -->
346
+ {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 -->
347
+ // <!-- ligature is a misnomer, this is a separate character in some languages -->
348
+ {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 -->
349
+ {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 -->
350
+ {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 -->
351
+ // <!-- Spacing Modifier Letters -->
352
+ {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub -->
353
+ {"tilde", "732"}, // small tilde, U+02DC ISOdia -->
354
+ // <!-- General Punctuation -->
355
+ {"ensp", "8194"}, // en space, U+2002 ISOpub -->
356
+ {"emsp", "8195"}, // em space, U+2003 ISOpub -->
357
+ {"thinsp", "8201"}, // thin space, U+2009 ISOpub -->
358
+ {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 -->
359
+ {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 -->
360
+ {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 -->
361
+ {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 -->
362
+ {"ndash", "8211"}, // en dash, U+2013 ISOpub -->
363
+ {"mdash", "8212"}, // em dash, U+2014 ISOpub -->
364
+ {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum -->
365
+ {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum -->
366
+ {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW -->
367
+ {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum -->
368
+ {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum -->
369
+ {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW -->
370
+ {"dagger", "8224"}, // dagger, U+2020 ISOpub -->
371
+ {"Dagger", "8225"}, // double dagger, U+2021 ISOpub -->
372
+ {"permil", "8240"}, // per mille sign, U+2030 ISOtech -->
373
+ {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed -->
374
+ // <!-- lsaquo is proposed but not yet ISO standardized -->
375
+ {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed -->
376
+ // <!-- rsaquo is proposed but not yet ISO standardized -->
377
+ {"euro", "8364"}, // -- euro sign, U+20AC NEW -->
378
+ };
379
+
/**
 * <p>
 * Predefined entity set for XML, populated from <code>BASIC_ARRAY</code> and <code>APOS_ARRAY</code>.
 * </p>
 */
public static final Entities XML;

/**
 * <p>
 * Predefined entity set for HTML 3.2, populated from <code>BASIC_ARRAY</code> and
 * <code>ISO8859_1_ARRAY</code>.
 * </p>
 */
public static final Entities HTML32;

/**
 * <p>
 * Predefined entity set for HTML 4.0, populated through {@link #fillWithHtml40Entities(Entities)}.
 * </p>
 */
public static final Entities HTML40;

// Build the three predefined sets in declaration order: XML, HTML32, HTML40.
static {
    Entities xml = new Entities();
    xml.addEntities(BASIC_ARRAY);
    xml.addEntities(APOS_ARRAY);
    XML = xml;

    Entities html32 = new Entities();
    html32.addEntities(BASIC_ARRAY);
    html32.addEntities(ISO8859_1_ARRAY);
    HTML32 = html32;

    Entities html40 = new Entities();
    fillWithHtml40Entities(html40);
    HTML40 = html40;
}
417
+
418
+ /**
419
+ * <p>
420
+ * Fills the specified entities instance with HTML 40 entities.
421
+ * </p>
422
+ *
423
+ * @param entities
424
+ * the instance to be filled.
425
+ */
426
+ static void fillWithHtml40Entities(Entities entities) {
427
+ entities.addEntities(BASIC_ARRAY);
428
+ entities.addEntities(ISO8859_1_ARRAY);
429
+ entities.addEntities(HTML40_ARRAY);
430
+ }
431
+
432
+ static interface EntityMap {
433
+ /**
434
+ * <p>
435
+ * Add an entry to this entity map.
436
+ * </p>
437
+ *
438
+ * @param name
439
+ * the entity name
440
+ * @param value
441
+ * the entity value
442
+ */
443
+ void add(String name, int value);
444
+
445
+ /**
446
+ * <p>
447
+ * Returns the name of the entity identified by the specified value.
448
+ * </p>
449
+ *
450
+ * @param value
451
+ * the value to locate
452
+ * @return entity name associated with the specified value
453
+ */
454
+ String name(int value);
455
+
456
+ /**
457
+ * <p>
458
+ * Returns the value of the entity identified by the specified name.
459
+ * </p>
460
+ *
461
+ * @param name
462
+ * the name to locate
463
+ * @return entity value associated with the specified name
464
+ */
465
+ int value(String name);
466
+ }
467
+
468
+ static class PrimitiveEntityMap implements EntityMap {
469
+ private Map mapNameToValue = new HashMap();
470
+
471
+ private IntHashMap mapValueToName = new IntHashMap();
472
+
473
+ /**
474
+ * {@inheritDoc}
475
+ */
476
+ public void add(String name, int value) {
477
+ mapNameToValue.put(name, new Integer(value));
478
+ mapValueToName.put(value, name);
479
+ }
480
+
481
+ /**
482
+ * {@inheritDoc}
483
+ */
484
+ public String name(int value) {
485
+ return (String) mapValueToName.get(value);
486
+ }
487
+
488
+ /**
489
+ * {@inheritDoc}
490
+ */
491
+ public int value(String name) {
492
+ Object value = mapNameToValue.get(name);
493
+ if (value == null) {
494
+ return -1;
495
+ }
496
+ return ((Integer) value).intValue();
497
+ }
498
+ }
499
+
500
+ static abstract class MapIntMap implements Entities.EntityMap {
501
+ protected Map mapNameToValue;
502
+
503
+ protected Map mapValueToName;
504
+
505
+ /**
506
+ * {@inheritDoc}
507
+ */
508
+ public void add(String name, int value) {
509
+ mapNameToValue.put(name, new Integer(value));
510
+ mapValueToName.put(new Integer(value), name);
511
+ }
512
+
513
+ /**
514
+ * {@inheritDoc}
515
+ */
516
+ public String name(int value) {
517
+ return (String) mapValueToName.get(new Integer(value));
518
+ }
519
+
520
+ /**
521
+ * {@inheritDoc}
522
+ */
523
+ public int value(String name) {
524
+ Object value = mapNameToValue.get(name);
525
+ if (value == null) {
526
+ return -1;
527
+ }
528
+ return ((Integer) value).intValue();
529
+ }
530
+ }
531
+
532
+ static class HashEntityMap extends MapIntMap {
533
+ /**
534
+ * Constructs a new instance of <code>HashEntityMap</code>.
535
+ */
536
+ public HashEntityMap() {
537
+ mapNameToValue = new HashMap();
538
+ mapValueToName = new HashMap();
539
+ }
540
+ }
541
+
542
+ static class TreeEntityMap extends MapIntMap {
543
+ /**
544
+ * Constructs a new instance of <code>TreeEntityMap</code>.
545
+ */
546
+ public TreeEntityMap() {
547
+ mapNameToValue = new TreeMap();
548
+ mapValueToName = new TreeMap();
549
+ }
550
+ }
551
+
552
+ static class LookupEntityMap extends PrimitiveEntityMap {
553
+ private String[] lookupTable;
554
+
555
+ private int LOOKUP_TABLE_SIZE = 256;
556
+
557
+ /**
558
+ * {@inheritDoc}
559
+ */
560
+ public String name(int value) {
561
+ if (value < LOOKUP_TABLE_SIZE) {
562
+ return lookupTable()[value];
563
+ }
564
+ return super.name(value);
565
+ }
566
+
567
+ /**
568
+ * <p>
569
+ * Returns the lookup table for this entity map. The lookup table is created if it has not been previously.
570
+ * </p>
571
+ *
572
+ * @return the lookup table
573
+ */
574
+ private String[] lookupTable() {
575
+ if (lookupTable == null) {
576
+ createLookupTable();
577
+ }
578
+ return lookupTable;
579
+ }
580
+
581
+ /**
582
+ * <p>
583
+ * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names.
584
+ * </p>
585
+ */
586
+ private void createLookupTable() {
587
+ lookupTable = new String[LOOKUP_TABLE_SIZE];
588
+ for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) {
589
+ lookupTable[i] = super.name(i);
590
+ }
591
+ }
592
+ }
593
+
594
+ static class ArrayEntityMap implements EntityMap {
595
+ protected int growBy = 100;
596
+
597
+ protected int size = 0;
598
+
599
+ protected String[] names;
600
+
601
+ protected int[] values;
602
+
603
+ /**
604
+ * Constructs a new instance of <code>ArrayEntityMap</code>.
605
+ */
606
+ public ArrayEntityMap() {
607
+ names = new String[growBy];
608
+ values = new int[growBy];
609
+ }
610
+
611
+ /**
612
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the array should
613
+ * grow.
614
+ *
615
+ * @param growBy
616
+ * array will be initialized to and will grow by this amount
617
+ */
618
+ public ArrayEntityMap(int growBy) {
619
+ this.growBy = growBy;
620
+ names = new String[growBy];
621
+ values = new int[growBy];
622
+ }
623
+
624
+ /**
625
+ * {@inheritDoc}
626
+ */
627
+ public void add(String name, int value) {
628
+ ensureCapacity(size + 1);
629
+ names[size] = name;
630
+ values[size] = value;
631
+ size++;
632
+ }
633
+
634
+ /**
635
+ * Verifies the capacity of the entity array, adjusting the size if necessary.
636
+ *
637
+ * @param capacity
638
+ * size the array should be
639
+ */
640
+ protected void ensureCapacity(int capacity) {
641
+ if (capacity > names.length) {
642
+ int newSize = Math.max(capacity, size + growBy);
643
+ String[] newNames = new String[newSize];
644
+ System.arraycopy(names, 0, newNames, 0, size);
645
+ names = newNames;
646
+ int[] newValues = new int[newSize];
647
+ System.arraycopy(values, 0, newValues, 0, size);
648
+ values = newValues;
649
+ }
650
+ }
651
+
652
+ /**
653
+ * {@inheritDoc}
654
+ */
655
+ public String name(int value) {
656
+ for (int i = 0; i < size; ++i) {
657
+ if (values[i] == value) {
658
+ return names[i];
659
+ }
660
+ }
661
+ return null;
662
+ }
663
+
664
+ /**
665
+ * {@inheritDoc}
666
+ */
667
+ public int value(String name) {
668
+ for (int i = 0; i < size; ++i) {
669
+ if (names[i].equals(name)) {
670
+ return values[i];
671
+ }
672
+ }
673
+ return -1;
674
+ }
675
+ }
676
+
677
+ static class BinaryEntityMap extends ArrayEntityMap {
678
+
679
+ /**
680
+ * Constructs a new instance of <code>BinaryEntityMap</code>.
681
+ */
682
+ public BinaryEntityMap() {
683
+ super();
684
+ }
685
+
686
+ /**
687
+ * Constructs a new instance of <code>ArrayEntityMap</code> specifying the size by which the underlying array
688
+ * should grow.
689
+ *
690
+ * @param growBy
691
+ * array will be initialized to and will grow by this amount
692
+ */
693
+ public BinaryEntityMap(int growBy) {
694
+ super(growBy);
695
+ }
696
+
697
+ /**
698
+ * Performs a binary search of the entity array for the specified key. This method is based on code in
699
+ * {@link java.util.Arrays}.
700
+ *
701
+ * @param key
702
+ * the key to be found
703
+ * @return the index of the entity array matching the specified key
704
+ */
705
+ private int binarySearch(int key) {
706
+ int low = 0;
707
+ int high = size - 1;
708
+
709
+ while (low <= high) {
710
+ int mid = (low + high) >> 1;
711
+ int midVal = values[mid];
712
+
713
+ if (midVal < key) {
714
+ low = mid + 1;
715
+ } else if (midVal > key) {
716
+ high = mid - 1;
717
+ } else {
718
+ return mid; // key found
719
+ }
720
+ }
721
+ return -(low + 1); // key not found.
722
+ }
723
+
724
+ /**
725
+ * {@inheritDoc}
726
+ */
727
+ public void add(String name, int value) {
728
+ ensureCapacity(size + 1);
729
+ int insertAt = binarySearch(value);
730
+ if (insertAt > 0) {
731
+ return; // note: this means you can't insert the same value twice
732
+ }
733
+ insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
734
+ System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
735
+ values[insertAt] = value;
736
+ System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
737
+ names[insertAt] = name;
738
+ size++;
739
+ }
740
+
741
+ /**
742
+ * {@inheritDoc}
743
+ */
744
+ public String name(int value) {
745
+ int index = binarySearch(value);
746
+ if (index < 0) {
747
+ return null;
748
+ }
749
+ return names[index];
750
+ }
751
+ }
752
+
753
+ // package scoped for testing
754
+ EntityMap map = new Entities.LookupEntityMap();
755
+
756
+ /**
757
+ * <p>
758
+ * Adds entities to this entity.
759
+ * </p>
760
+ *
761
+ * @param entityArray
762
+ * array of entities to be added
763
+ */
764
+ public void addEntities(String[][] entityArray) {
765
+ for (int i = 0; i < entityArray.length; ++i) {
766
+ addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1]));
767
+ }
768
+ }
769
+
770
+ /**
771
+ * <p>
772
+ * Add an entity to this entity.
773
+ * </p>
774
+ *
775
+ * @param name
776
+ * name of the entity
777
+ * @param value
778
+ * vale of the entity
779
+ */
780
+ public void addEntity(String name, int value) {
781
+ map.add(name, value);
782
+ }
783
+
784
+ /**
785
+ * <p>
786
+ * Returns the name of the entity identified by the specified value.
787
+ * </p>
788
+ *
789
+ * @param value
790
+ * the value to locate
791
+ * @return entity name associated with the specified value
792
+ */
793
+ public String entityName(int value) {
794
+ return map.name(value);
795
+ }
796
+
797
+ /**
798
+ * <p>
799
+ * Returns the value of the entity identified by the specified name.
800
+ * </p>
801
+ *
802
+ * @param name
803
+ * the name to locate
804
+ * @return entity value associated with the specified name
805
+ */
806
+ public int entityValue(String name) {
807
+ return map.value(name);
808
+ }
809
+
810
+ /**
811
+ * <p>
812
+ * Escapes the characters in a <code>String</code>.
813
+ * </p>
814
+ *
815
+ * <p>
816
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), escape(&quot;\u00A1&quot;) will return
817
+ * &quot;&amp;foo;&quot;
818
+ * </p>
819
+ *
820
+ * @param str
821
+ * The <code>String</code> to escape.
822
+ * @return A new escaped <code>String</code>.
823
+ */
824
+ public String escape(String str) {
825
+ StringWriter stringWriter = createStringWriter(str);
826
+ try {
827
+ this.escape(stringWriter, str);
828
+ } catch (IOException e) {
829
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
830
+ // throw IOExceptions.
831
+ throw new RuntimeException(e);
832
+ }
833
+ return stringWriter.toString();
834
+ }
835
+
836
+ /**
837
+ * <p>
838
+ * Escapes the characters in the <code>String</code> passed and writes the result to the <code>Writer</code>
839
+ * passed.
840
+ * </p>
841
+ *
842
+ * @param writer
843
+ * The <code>Writer</code> to write the results of the escaping to. Assumed to be a non-null value.
844
+ * @param str
845
+ * The <code>String</code> to escape. Assumed to be a non-null value.
846
+ * @throws IOException
847
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
848
+ * methods.
849
+ *
850
+ * @see #escape(String)
851
+ * @see Writer
852
+ */
853
+ public void escape(Writer writer, String str) throws IOException {
854
+ int len = str.length();
855
+ for (int i = 0; i < len; i++) {
856
+ char c = str.charAt(i);
857
+ String entityName = this.entityName(c);
858
+ if (entityName == null) {
859
+ if (c > 0x7F) {
860
+ writer.write("&#");
861
+ writer.write(Integer.toString(c, 10));
862
+ writer.write(';');
863
+ } else {
864
+ writer.write(c);
865
+ }
866
+ } else {
867
+ writer.write('&');
868
+ writer.write(entityName);
869
+ writer.write(';');
870
+ }
871
+ }
872
+ }
873
+
874
+ /**
875
+ * <p>
876
+ * Unescapes the entities in a <code>String</code>.
877
+ * </p>
878
+ *
879
+ * <p>
880
+ * For example, if you have called addEntity(&quot;foo&quot;, 0xA1), unescape(&quot;&amp;foo;&quot;) will return
881
+ * &quot;\u00A1&quot;
882
+ * </p>
883
+ *
884
+ * @param str
885
+ * The <code>String</code> to escape.
886
+ * @return A new escaped <code>String</code>.
887
+ */
888
+ public String unescape(String str) {
889
+ int firstAmp = str.indexOf('&');
890
+ if (firstAmp < 0) {
891
+ return str;
892
+ } else {
893
+ StringWriter stringWriter = createStringWriter(str);
894
+ try {
895
+ this.doUnescape(stringWriter, str, firstAmp);
896
+ } catch (IOException e) {
897
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
898
+ // do not throw IOExceptions.
899
+ throw new RuntimeException(e);
900
+ }
901
+ return stringWriter.toString();
902
+ }
903
+ }
904
+
905
+ /**
906
+ * Make the StringWriter 10% larger than the source String to avoid growing the writer
907
+ *
908
+ * @param str The source string
909
+ * @return A newly created StringWriter
910
+ */
911
+ private StringWriter createStringWriter(String str) {
912
+ return new StringWriter((int) (str.length() + (str.length() * 0.1)));
913
+ }
914
+
915
+ /**
916
+ * <p>
917
+ * Unescapes the escaped entities in the <code>String</code> passed and writes the result to the
918
+ * <code>Writer</code> passed.
919
+ * </p>
920
+ *
921
+ * @param writer
922
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
923
+ * @param str
924
+ * The source <code>String</code> to unescape; assumed to be non-null.
925
+ * @throws IOException
926
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
927
+ * methods.
928
+ *
929
+ * @see #escape(String)
930
+ * @see Writer
931
+ */
932
+ public void unescape(Writer writer, String str) throws IOException {
933
+ int firstAmp = str.indexOf('&');
934
+ if (firstAmp < 0) {
935
+ writer.write(str);
936
+ return;
937
+ } else {
938
+ doUnescape(writer, str, firstAmp);
939
+ }
940
+ }
941
+
942
+ /**
943
+ * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
944
+ *
945
+ * @param writer
946
+ * The <code>Writer</code> to write the results to; assumed to be non-null.
947
+ * @param str
948
+ * The source <code>String</code> to unescape; assumed to be non-null.
949
+ * @param firstAmp
950
+ * The <code>int</code> index of the first ampersand in the source String.
951
+ * @throws IOException
952
+ * when <code>Writer</code> passed throws the exception from calls to the {@link Writer#write(int)}
953
+ * methods.
954
+ */
955
+ private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
956
+ writer.write(str, 0, firstAmp);
957
+ int len = str.length();
958
+ for (int i = firstAmp; i < len; i++) {
959
+ char c = str.charAt(i);
960
+ if (c == '&') {
961
+ int nextIdx = i + 1;
962
+ int semiColonIdx = str.indexOf(';', nextIdx);
963
+ if (semiColonIdx == -1) {
964
+ writer.write(c);
965
+ continue;
966
+ }
967
+ int amphersandIdx = str.indexOf('&', i + 1);
968
+ if (amphersandIdx != -1 && amphersandIdx < semiColonIdx) {
969
+ // Then the text looks like &...&...;
970
+ writer.write(c);
971
+ continue;
972
+ }
973
+ String entityContent = str.substring(nextIdx, semiColonIdx);
974
+ int entityValue = -1;
975
+ int entityContentLen = entityContent.length();
976
+ if (entityContentLen > 0) {
977
+ if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
978
+ // hexidecimal)
979
+ if (entityContentLen > 1) {
980
+ char isHexChar = entityContent.charAt(1);
981
+ try {
982
+ switch (isHexChar) {
983
+ case 'X' :
984
+ case 'x' : {
985
+ entityValue = Integer.parseInt(entityContent.substring(2), 16);
986
+ break;
987
+ }
988
+ default : {
989
+ entityValue = Integer.parseInt(entityContent.substring(1), 10);
990
+ }
991
+ }
992
+ if (entityValue > 0xFFFF) {
993
+ entityValue = -1;
994
+ }
995
+ } catch (NumberFormatException e) {
996
+ entityValue = -1;
997
+ }
998
+ }
999
+ } else { // escaped value content is an entity name
1000
+ entityValue = this.entityValue(entityContent);
1001
+ }
1002
+ }
1003
+
1004
+ if (entityValue == -1) {
1005
+ writer.write('&');
1006
+ writer.write(entityContent);
1007
+ writer.write(';');
1008
+ } else {
1009
+ writer.write(entityValue);
1010
+ }
1011
+ i = semiColonIdx; // move index up to the semi-colon
1012
+ } else {
1013
+ writer.write(c);
1014
+ }
1015
+ }
1016
+ }
1017
+
1018
+ }