nokogiri-backport 1.11.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (239) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -0
  3. data/LICENSE-DEPENDENCIES.md +1682 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +272 -0
  6. data/bin/nokogiri +118 -0
  7. data/dependencies.yml +74 -0
  8. data/ext/java/nokogiri/EncodingHandler.java +124 -0
  9. data/ext/java/nokogiri/HtmlDocument.java +178 -0
  10. data/ext/java/nokogiri/HtmlElementDescription.java +148 -0
  11. data/ext/java/nokogiri/HtmlEntityLookup.java +79 -0
  12. data/ext/java/nokogiri/HtmlSaxParserContext.java +282 -0
  13. data/ext/java/nokogiri/HtmlSaxPushParser.java +222 -0
  14. data/ext/java/nokogiri/NokogiriService.java +597 -0
  15. data/ext/java/nokogiri/XmlAttr.java +162 -0
  16. data/ext/java/nokogiri/XmlAttributeDecl.java +129 -0
  17. data/ext/java/nokogiri/XmlCdata.java +82 -0
  18. data/ext/java/nokogiri/XmlComment.java +97 -0
  19. data/ext/java/nokogiri/XmlDocument.java +633 -0
  20. data/ext/java/nokogiri/XmlDocumentFragment.java +185 -0
  21. data/ext/java/nokogiri/XmlDtd.java +481 -0
  22. data/ext/java/nokogiri/XmlElement.java +68 -0
  23. data/ext/java/nokogiri/XmlElementContent.java +382 -0
  24. data/ext/java/nokogiri/XmlElementDecl.java +147 -0
  25. data/ext/java/nokogiri/XmlEntityDecl.java +157 -0
  26. data/ext/java/nokogiri/XmlEntityReference.java +101 -0
  27. data/ext/java/nokogiri/XmlNamespace.java +199 -0
  28. data/ext/java/nokogiri/XmlNode.java +1684 -0
  29. data/ext/java/nokogiri/XmlNodeSet.java +434 -0
  30. data/ext/java/nokogiri/XmlProcessingInstruction.java +100 -0
  31. data/ext/java/nokogiri/XmlReader.java +531 -0
  32. data/ext/java/nokogiri/XmlRelaxng.java +151 -0
  33. data/ext/java/nokogiri/XmlSaxParserContext.java +374 -0
  34. data/ext/java/nokogiri/XmlSaxPushParser.java +286 -0
  35. data/ext/java/nokogiri/XmlSchema.java +388 -0
  36. data/ext/java/nokogiri/XmlSyntaxError.java +138 -0
  37. data/ext/java/nokogiri/XmlText.java +110 -0
  38. data/ext/java/nokogiri/XmlXpathContext.java +301 -0
  39. data/ext/java/nokogiri/XsltStylesheet.java +347 -0
  40. data/ext/java/nokogiri/internals/ClosedStreamException.java +10 -0
  41. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +252 -0
  42. data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +20 -0
  43. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +151 -0
  44. data/ext/java/nokogiri/internals/NokogiriDomParser.java +116 -0
  45. data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +121 -0
  46. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +69 -0
  47. data/ext/java/nokogiri/internals/NokogiriHandler.java +327 -0
  48. data/ext/java/nokogiri/internals/NokogiriHelpers.java +734 -0
  49. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +217 -0
  50. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +127 -0
  51. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +100 -0
  52. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +121 -0
  53. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +78 -0
  54. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +180 -0
  55. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +72 -0
  56. data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +60 -0
  57. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +87 -0
  58. data/ext/java/nokogiri/internals/ParserContext.java +259 -0
  59. data/ext/java/nokogiri/internals/ReaderNode.java +488 -0
  60. data/ext/java/nokogiri/internals/SaveContextVisitor.java +778 -0
  61. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +73 -0
  62. data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +168 -0
  63. data/ext/java/nokogiri/internals/XmlDeclHandler.java +42 -0
  64. data/ext/java/nokogiri/internals/XmlDomParserContext.java +274 -0
  65. data/ext/java/nokogiri/internals/XmlSaxParser.java +65 -0
  66. data/ext/java/nokogiri/internals/c14n/AttrCompare.java +119 -0
  67. data/ext/java/nokogiri/internals/c14n/C14nHelper.java +159 -0
  68. data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +37 -0
  69. data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +93 -0
  70. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +252 -0
  71. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +639 -0
  72. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +38 -0
  73. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +38 -0
  74. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +367 -0
  75. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +295 -0
  76. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +40 -0
  77. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +44 -0
  78. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +44 -0
  79. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +43 -0
  80. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +630 -0
  81. data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +173 -0
  82. data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +76 -0
  83. data/ext/java/nokogiri/internals/c14n/Constants.java +42 -0
  84. data/ext/java/nokogiri/internals/c14n/ElementProxy.java +293 -0
  85. data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +93 -0
  86. data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +79 -0
  87. data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +166 -0
  88. data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +76 -0
  89. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +402 -0
  90. data/ext/java/nokogiri/internals/c14n/NodeFilter.java +51 -0
  91. data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +179 -0
  92. data/ext/java/nokogiri/internals/c14n/XMLUtils.java +507 -0
  93. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1745 -0
  94. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +685 -0
  95. data/ext/nokogiri/depend +477 -0
  96. data/ext/nokogiri/extconf.rb +836 -0
  97. data/ext/nokogiri/html_document.c +171 -0
  98. data/ext/nokogiri/html_document.h +10 -0
  99. data/ext/nokogiri/html_element_description.c +279 -0
  100. data/ext/nokogiri/html_element_description.h +10 -0
  101. data/ext/nokogiri/html_entity_lookup.c +32 -0
  102. data/ext/nokogiri/html_entity_lookup.h +8 -0
  103. data/ext/nokogiri/html_sax_parser_context.c +116 -0
  104. data/ext/nokogiri/html_sax_parser_context.h +11 -0
  105. data/ext/nokogiri/html_sax_push_parser.c +87 -0
  106. data/ext/nokogiri/html_sax_push_parser.h +9 -0
  107. data/ext/nokogiri/nokogiri.c +135 -0
  108. data/ext/nokogiri/nokogiri.h +130 -0
  109. data/ext/nokogiri/xml_attr.c +103 -0
  110. data/ext/nokogiri/xml_attr.h +9 -0
  111. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  112. data/ext/nokogiri/xml_attribute_decl.h +9 -0
  113. data/ext/nokogiri/xml_cdata.c +62 -0
  114. data/ext/nokogiri/xml_cdata.h +9 -0
  115. data/ext/nokogiri/xml_comment.c +69 -0
  116. data/ext/nokogiri/xml_comment.h +9 -0
  117. data/ext/nokogiri/xml_document.c +622 -0
  118. data/ext/nokogiri/xml_document.h +23 -0
  119. data/ext/nokogiri/xml_document_fragment.c +48 -0
  120. data/ext/nokogiri/xml_document_fragment.h +10 -0
  121. data/ext/nokogiri/xml_dtd.c +202 -0
  122. data/ext/nokogiri/xml_dtd.h +10 -0
  123. data/ext/nokogiri/xml_element_content.c +123 -0
  124. data/ext/nokogiri/xml_element_content.h +10 -0
  125. data/ext/nokogiri/xml_element_decl.c +69 -0
  126. data/ext/nokogiri/xml_element_decl.h +9 -0
  127. data/ext/nokogiri/xml_encoding_handler.c +79 -0
  128. data/ext/nokogiri/xml_encoding_handler.h +8 -0
  129. data/ext/nokogiri/xml_entity_decl.c +110 -0
  130. data/ext/nokogiri/xml_entity_decl.h +10 -0
  131. data/ext/nokogiri/xml_entity_reference.c +52 -0
  132. data/ext/nokogiri/xml_entity_reference.h +9 -0
  133. data/ext/nokogiri/xml_io.c +63 -0
  134. data/ext/nokogiri/xml_io.h +11 -0
  135. data/ext/nokogiri/xml_libxml2_hacks.c +112 -0
  136. data/ext/nokogiri/xml_libxml2_hacks.h +12 -0
  137. data/ext/nokogiri/xml_namespace.c +111 -0
  138. data/ext/nokogiri/xml_namespace.h +14 -0
  139. data/ext/nokogiri/xml_node.c +1773 -0
  140. data/ext/nokogiri/xml_node.h +13 -0
  141. data/ext/nokogiri/xml_node_set.c +486 -0
  142. data/ext/nokogiri/xml_node_set.h +12 -0
  143. data/ext/nokogiri/xml_processing_instruction.c +56 -0
  144. data/ext/nokogiri/xml_processing_instruction.h +9 -0
  145. data/ext/nokogiri/xml_reader.c +657 -0
  146. data/ext/nokogiri/xml_reader.h +10 -0
  147. data/ext/nokogiri/xml_relax_ng.c +179 -0
  148. data/ext/nokogiri/xml_relax_ng.h +9 -0
  149. data/ext/nokogiri/xml_sax_parser.c +305 -0
  150. data/ext/nokogiri/xml_sax_parser.h +39 -0
  151. data/ext/nokogiri/xml_sax_parser_context.c +262 -0
  152. data/ext/nokogiri/xml_sax_parser_context.h +10 -0
  153. data/ext/nokogiri/xml_sax_push_parser.c +159 -0
  154. data/ext/nokogiri/xml_sax_push_parser.h +9 -0
  155. data/ext/nokogiri/xml_schema.c +276 -0
  156. data/ext/nokogiri/xml_schema.h +9 -0
  157. data/ext/nokogiri/xml_syntax_error.c +64 -0
  158. data/ext/nokogiri/xml_syntax_error.h +13 -0
  159. data/ext/nokogiri/xml_text.c +52 -0
  160. data/ext/nokogiri/xml_text.h +9 -0
  161. data/ext/nokogiri/xml_xpath_context.c +374 -0
  162. data/ext/nokogiri/xml_xpath_context.h +10 -0
  163. data/ext/nokogiri/xslt_stylesheet.c +263 -0
  164. data/ext/nokogiri/xslt_stylesheet.h +14 -0
  165. data/lib/isorelax.jar +0 -0
  166. data/lib/jing.jar +0 -0
  167. data/lib/nekodtd.jar +0 -0
  168. data/lib/nekohtml.jar +0 -0
  169. data/lib/nokogiri/css/node.rb +53 -0
  170. data/lib/nokogiri/css/parser.rb +751 -0
  171. data/lib/nokogiri/css/parser.y +272 -0
  172. data/lib/nokogiri/css/parser_extras.rb +94 -0
  173. data/lib/nokogiri/css/syntax_error.rb +8 -0
  174. data/lib/nokogiri/css/tokenizer.rb +154 -0
  175. data/lib/nokogiri/css/tokenizer.rex +55 -0
  176. data/lib/nokogiri/css/xpath_visitor.rb +260 -0
  177. data/lib/nokogiri/css.rb +28 -0
  178. data/lib/nokogiri/decorators/slop.rb +43 -0
  179. data/lib/nokogiri/html/builder.rb +36 -0
  180. data/lib/nokogiri/html/document.rb +322 -0
  181. data/lib/nokogiri/html/document_fragment.rb +50 -0
  182. data/lib/nokogiri/html/element_description.rb +24 -0
  183. data/lib/nokogiri/html/element_description_defaults.rb +672 -0
  184. data/lib/nokogiri/html/entity_lookup.rb +14 -0
  185. data/lib/nokogiri/html/sax/parser.rb +63 -0
  186. data/lib/nokogiri/html/sax/parser_context.rb +17 -0
  187. data/lib/nokogiri/html/sax/push_parser.rb +37 -0
  188. data/lib/nokogiri/html.rb +38 -0
  189. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  190. data/lib/nokogiri/syntax_error.rb +5 -0
  191. data/lib/nokogiri/version/constant.rb +5 -0
  192. data/lib/nokogiri/version/info.rb +182 -0
  193. data/lib/nokogiri/version.rb +3 -0
  194. data/lib/nokogiri/xml/attr.rb +15 -0
  195. data/lib/nokogiri/xml/attribute_decl.rb +19 -0
  196. data/lib/nokogiri/xml/builder.rb +447 -0
  197. data/lib/nokogiri/xml/cdata.rb +12 -0
  198. data/lib/nokogiri/xml/character_data.rb +8 -0
  199. data/lib/nokogiri/xml/document.rb +290 -0
  200. data/lib/nokogiri/xml/document_fragment.rb +159 -0
  201. data/lib/nokogiri/xml/dtd.rb +33 -0
  202. data/lib/nokogiri/xml/element_content.rb +37 -0
  203. data/lib/nokogiri/xml/element_decl.rb +14 -0
  204. data/lib/nokogiri/xml/entity_decl.rb +20 -0
  205. data/lib/nokogiri/xml/entity_reference.rb +19 -0
  206. data/lib/nokogiri/xml/namespace.rb +14 -0
  207. data/lib/nokogiri/xml/node/save_options.rb +62 -0
  208. data/lib/nokogiri/xml/node.rb +1240 -0
  209. data/lib/nokogiri/xml/node_set.rb +372 -0
  210. data/lib/nokogiri/xml/notation.rb +7 -0
  211. data/lib/nokogiri/xml/parse_options.rb +127 -0
  212. data/lib/nokogiri/xml/pp/character_data.rb +19 -0
  213. data/lib/nokogiri/xml/pp/node.rb +57 -0
  214. data/lib/nokogiri/xml/pp.rb +3 -0
  215. data/lib/nokogiri/xml/processing_instruction.rb +9 -0
  216. data/lib/nokogiri/xml/reader.rb +116 -0
  217. data/lib/nokogiri/xml/relax_ng.rb +37 -0
  218. data/lib/nokogiri/xml/sax/document.rb +172 -0
  219. data/lib/nokogiri/xml/sax/parser.rb +123 -0
  220. data/lib/nokogiri/xml/sax/parser_context.rb +17 -0
  221. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  222. data/lib/nokogiri/xml/sax.rb +5 -0
  223. data/lib/nokogiri/xml/schema.rb +72 -0
  224. data/lib/nokogiri/xml/searchable.rb +239 -0
  225. data/lib/nokogiri/xml/syntax_error.rb +71 -0
  226. data/lib/nokogiri/xml/text.rb +10 -0
  227. data/lib/nokogiri/xml/xpath/syntax_error.rb +12 -0
  228. data/lib/nokogiri/xml/xpath.rb +11 -0
  229. data/lib/nokogiri/xml/xpath_context.rb +17 -0
  230. data/lib/nokogiri/xml.rb +76 -0
  231. data/lib/nokogiri/xslt/stylesheet.rb +26 -0
  232. data/lib/nokogiri/xslt.rb +57 -0
  233. data/lib/nokogiri.rb +144 -0
  234. data/lib/serializer.jar +0 -0
  235. data/lib/xalan.jar +0 -0
  236. data/lib/xercesImpl.jar +0 -0
  237. data/lib/xml-apis.jar +0 -0
  238. data/lib/xsd/xmlparser/nokogiri.rb +103 -0
  239. metadata +531 -0
@@ -0,0 +1,778 @@
1
+ /**
2
+ * (The MIT License)
3
+ *
4
+ * Copyright (c) 2008 - 2012:
5
+ *
6
+ * * {Aaron Patterson}[http://tenderlovemaking.com]
7
+ * * {Mike Dalessio}[http://mike.daless.io]
8
+ * * {Charles Nutter}[http://blog.headius.com]
9
+ * * {Sergio Arbeo}[http://www.serabe.com]
10
+ * * {Patrick Mahoney}[http://polycrystal.org]
11
+ * * {Yoko Harada}[http://yokolet.blogspot.com]
12
+ *
13
+ * Permission is hereby granted, free of charge, to any person obtaining
14
+ * a copy of this software and associated documentation files (the
15
+ * 'Software'), to deal in the Software without restriction, including
16
+ * without limitation the rights to use, copy, modify, merge, publish,
17
+ * distribute, sublicense, and/or sell copies of the Software, and to
18
+ * permit persons to whom the Software is furnished to do so, subject to
19
+ * the following conditions:
20
+ *
21
+ * The above copyright notice and this permission notice shall be
22
+ * included in all copies or substantial portions of the Software.
23
+ *
24
+ * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
25
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
27
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
28
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
29
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
30
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31
+ */
32
+
33
+ package nokogiri.internals;
34
+
35
+ import static nokogiri.internals.NokogiriHelpers.canonicalizeWhitespace;
36
+ import static nokogiri.internals.NokogiriHelpers.encodeJavaString;
37
+ import static nokogiri.internals.NokogiriHelpers.isNamespace;
38
+ import static nokogiri.internals.NokogiriHelpers.isBlank;
39
+ import static nokogiri.internals.NokogiriHelpers.shouldEncode;
40
+
41
+ import java.nio.charset.Charset;
42
+ import java.nio.charset.CharsetEncoder;
43
+ import java.util.*;
44
+ import java.util.regex.Matcher;
45
+ import java.util.regex.Pattern;
46
+
47
+ import org.cyberneko.html.HTMLElements;
48
+ import org.w3c.dom.Attr;
49
+ import org.w3c.dom.CDATASection;
50
+ import org.w3c.dom.Comment;
51
+ import org.w3c.dom.Document;
52
+ import org.w3c.dom.DocumentType;
53
+ import org.w3c.dom.Element;
54
+ import org.w3c.dom.Entity;
55
+ import org.w3c.dom.EntityReference;
56
+ import org.w3c.dom.NamedNodeMap;
57
+ import org.w3c.dom.Node;
58
+ import org.w3c.dom.Notation;
59
+ import org.w3c.dom.ProcessingInstruction;
60
+ import org.w3c.dom.Text;
61
+
62
+ /**
63
+ * A class for serializing a document.
64
+ *
65
+ * @author sergio
66
+ * @author Patrick Mahoney <pat@polycrystal.org>
67
+ * @author Yoko Harada <yokolet@gmail.com>
68
+ */
69
+ public class SaveContextVisitor {
70
+
71
+ private final StringBuilder buffer;
72
+ private final Stack<String> indentation;
73
+ private String encoding;
74
+ private final CharSequence indentString;
75
+ private boolean format;
76
+ private final boolean noDecl;
77
+ private final boolean noEmpty;
78
+ private final boolean noXhtml;
79
+ private final boolean asXhtml;
80
+ private boolean asXml;
81
+ private final boolean asHtml;
82
+ private final boolean asBuilder;
83
+ private boolean htmlDoc;
84
+ private final boolean fragment;
85
+ private final boolean canonical, incl_ns, with_comments;
86
+ private boolean subsets;
87
+ private boolean exclusive;
88
+ private final List<Node> c14nNodeList;
89
+ private final Deque<Attr[]> c14nNamespaceStack;
90
+ private final Deque<Attr[]> c14nAttrStack;
91
+ //private List<String> c14nExclusiveInclusivePrefixes = null;
92
+
93
+ /*
94
+ * U can't touch this.
95
+ * http://www.youtube.com/watch?v=WJ2ZFVx6A4Q
96
+ *
97
+ * Taken from libxml save options.
98
+ */
99
+
100
+ public static final int FORMAT = 1;
101
+ public static final int NO_DECL = 2;
102
+ public static final int NO_EMPTY = 4;
103
+ public static final int NO_XHTML = 8;
104
+ public static final int AS_XHTML = 16;
105
+ public static final int AS_XML = 32;
106
+ public static final int AS_HTML = 64;
107
+ public static final int AS_BUILDER = 128;
108
+
109
+ public static final int CANONICAL = 1;
110
+ public static final int INCL_NS = 2;
111
+ public static final int WITH_COMMENTS = 4;
112
+ public static final int SUBSETS = 8;
113
+ public static final int EXCLUSIVE = 16;
114
+
115
/**
 * Creates a visitor that serializes DOM nodes into an internal buffer.
 *
 * @param options       bit mask built from {@link #FORMAT}, {@link #NO_DECL}, {@link #NO_EMPTY},
 *                      {@link #NO_XHTML}, {@link #AS_XHTML}, {@link #AS_XML}, {@link #AS_HTML}
 *                      and {@link #AS_BUILDER}
 * @param indent        string used for one level of indentation; may be {@code null} or empty,
 *                      in which case two spaces are used when formatting or builder output
 *                      is requested
 * @param encoding      output encoding name, or {@code null}
 * @param htmlDoc       whether the serialized node belongs to an HTML document
 * @param fragment      whether a document fragment is being serialized
 * @param canonicalOpts bit mask built from {@link #CANONICAL}, {@link #INCL_NS},
 *                      {@link #WITH_COMMENTS} and {@link #SUBSETS}
 */
public SaveContextVisitor(int options, CharSequence indent, String encoding, boolean htmlDoc, boolean fragment, int canonicalOpts) {
    buffer = new StringBuilder();
    this.encoding = encoding;
    indentation = new Stack<String>();
    indentation.push(""); // top-level elements are not indented
    this.htmlDoc = htmlDoc;
    this.fragment = fragment;
    c14nNodeList = new ArrayList<Node>();
    c14nNamespaceStack = new ArrayDeque<Attr[]>();
    c14nAttrStack = new ArrayDeque<Attr[]>();

    format = (options & FORMAT) == FORMAT;
    noDecl = (options & NO_DECL) == NO_DECL;
    noEmpty = (options & NO_EMPTY) == NO_EMPTY;
    noXhtml = (options & NO_XHTML) == NO_XHTML;
    asXhtml = (options & AS_XHTML) == AS_XHTML;
    asXml = (options & AS_XML) == AS_XML;
    asHtml = (options & AS_HTML) == AS_HTML;
    asBuilder = (options & AS_BUILDER) == AS_BUILDER;

    canonical = (canonicalOpts & CANONICAL) == CANONICAL;
    incl_ns = (canonicalOpts & INCL_NS) == INCL_NS;
    with_comments = (canonicalOpts & WITH_COMMENTS) == WITH_COMMENTS;
    subsets = (canonicalOpts & SUBSETS) == SUBSETS;
    // NOTE(review): EXCLUSIVE is defined but "exclusive" is not derived from
    // canonicalOpts here; confirm whether that is intentional.

    // Formatting requested without a usable indent: fall back to the default of two spaces.
    if (format && (indent == null || indent.length() == 0)) {
        indent = "  ";
    }
    // A non-empty indent implies formatting even when FORMAT was not set.
    if (!format && indent != null && indent.length() > 0) {
        format = true;
    }
    // Builder output is always indented; default to two spaces as well.
    if (asBuilder && (indent == null || indent.length() == 0)) {
        indent = "  ";
    }
    indentString = indent;

    // With no explicit output flavour, serialize as XML.
    if (!asXml && !asHtml && !asXhtml && !asBuilder) {
        asXml = true;
    }
}
145
+
146
+ @Override
147
+ public String toString() {
148
+ return buffer.toString();
149
+ }
150
+
151
+ public StringBuilder getInternalBuffer() { return buffer; }
152
+
153
+ public void setHtmlDoc(boolean htmlDoc) {
154
+ this.htmlDoc = htmlDoc;
155
+ }
156
+
157
+ public void setEncoding(String encoding) {
158
+ this.encoding = encoding;
159
+ }
160
+
161
+ public boolean enter(Node node) {
162
+ if (node instanceof Document) {
163
+ return enter((Document)node);
164
+ }
165
+ if (node instanceof Element) {
166
+ return enter((Element)node);
167
+ }
168
+ if (node instanceof Attr) {
169
+ return enter((Attr)node);
170
+ }
171
+ if (node instanceof Text) {
172
+ return enter((Text)node);
173
+ }
174
+ if (node instanceof CDATASection) {
175
+ return enter((CDATASection)node);
176
+ }
177
+ if (node instanceof Comment) {
178
+ return enter((Comment)node);
179
+ }
180
+ if (node instanceof DocumentType) {
181
+ return enter((DocumentType)node);
182
+ }
183
+ if (node instanceof Entity) {
184
+ return enter((Entity)node);
185
+ }
186
+ if (node instanceof EntityReference) {
187
+ return enter((EntityReference) node);
188
+ }
189
+ if (node instanceof Notation) {
190
+ return enter((Notation)node);
191
+ }
192
+ if (node instanceof ProcessingInstruction) {
193
+ return enter((ProcessingInstruction)node);
194
+ }
195
+ return false;
196
+ }
197
+
198
+ public void leave(Node node) {
199
+ if (node instanceof Document) {
200
+ leave((Document)node);
201
+ return;
202
+ }
203
+ if (node instanceof Element) {
204
+ leave((Element)node);
205
+ return;
206
+ }
207
+ if (node instanceof Attr) {
208
+ leave((Attr)node);
209
+ return;
210
+ }
211
+ if (node instanceof Text) {
212
+ return;
213
+ }
214
+ if (node instanceof CDATASection) {
215
+ leave((CDATASection)node);
216
+ return;
217
+ }
218
+ if (node instanceof Comment) {
219
+ leave((Comment)node);
220
+ return;
221
+ }
222
+ if (node instanceof DocumentType) {
223
+ leave((DocumentType)node);
224
+ return;
225
+ }
226
+ if (node instanceof Entity) {
227
+ leave((Entity)node);
228
+ return;
229
+ }
230
+ if (node instanceof EntityReference) {
231
+ leave((EntityReference) node);
232
+ return;
233
+ }
234
+ if (node instanceof Notation) {
235
+ leave((Notation)node);
236
+ return;
237
+ }
238
+ if (node instanceof ProcessingInstruction) {
239
+ leave((ProcessingInstruction)node);
240
+ return;
241
+ }
242
+ }
243
+
244
+ public boolean enter(String string) {
245
+ buffer.append(string);
246
+ return true;
247
+ }
248
+
249
+ public void leave(String string) {
250
+ // no-op
251
+ }
252
+
253
+ public boolean enter(Attr attr) {
254
+ String name = attr.getName();
255
+ buffer.append(name);
256
+ if (!asHtml || !isHtmlBooleanAttr(name)) {
257
+ buffer.append('=');
258
+ buffer.append('"');
259
+ String value = replaceCharsetIfNecessary(attr);
260
+ buffer.append(serializeAttrTextContent(value, htmlDoc));
261
+ buffer.append('"');
262
+ }
263
+ return true;
264
+ }
265
+
266
+ private static final Pattern CHARSET =
267
+ Pattern.compile("charset(()|\\s+)=(()|\\s+)(\\w|\\_|\\.|\\-)+", Pattern.CASE_INSENSITIVE);
268
+
269
+ private String replaceCharsetIfNecessary(Attr attr) {
270
+ String value = attr.getValue();
271
+ if (encoding == null) return value; // unable to replace in any case
272
+ if (!"content".equals(attr.getName().toLowerCase())) return value; // must be content attr
273
+ if (!"meta".equals(attr.getOwnerElement().getNodeName().toLowerCase())) return value;
274
+ Matcher m = CHARSET.matcher(value);
275
+ if (!m.find()) return value;
276
+ if (value.contains(encoding)) return value; // no need to replace
277
+ return value.replace(m.group(), "charset=" + encoding);
278
+ }
279
+
280
+ static final Set<String> HTML_BOOLEAN_ATTRS;
281
+ static {
282
+ final String[] _HTML_BOOLEAN_ATTRS = {
283
+ "checked", "compact", "declare", "defer", "disabled", "ismap",
284
+ "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
285
+ "selected"
286
+ };
287
+ HTML_BOOLEAN_ATTRS = new HashSet<String>(Arrays.asList(_HTML_BOOLEAN_ATTRS));
288
+ }
289
+
290
+ private static boolean isHtmlBooleanAttr(String name) {
291
+ return HTML_BOOLEAN_ATTRS.contains(name);
292
+ }
293
+
294
+ private static CharSequence serializeAttrTextContent(String str, boolean htmlDoc) {
295
+ if (str == null || str.length() == 0) return "";
296
+
297
+ StringBuilder buffer = new StringBuilder(str.length() + 16);
298
+
299
+ for (int i = 0; i < str.length(); i++) {
300
+ char c; switch (c = str.charAt(i)) {
301
+ case '\n': buffer.append("&#10;"); break;
302
+ case '\r': buffer.append("&#13;"); break;
303
+ case '\t': buffer.append("&#9;"); break;
304
+ case '"': if (htmlDoc) buffer.append("%22");
305
+ else buffer.append("&quot;");
306
+ break;
307
+ case '<': buffer.append("&lt;"); break;
308
+ case '>': buffer.append("&gt;"); break;
309
+ case '&': buffer.append("&amp;"); break;
310
+ default: buffer.append(c);
311
+ }
312
+ }
313
+
314
+ return buffer;
315
+ }
316
+
317
+ public void leave(Attr attr) {
318
+ // no-op
319
+ }
320
+
321
+ public boolean enter(CDATASection cdata) {
322
+ buffer.append("<![CDATA[");
323
+ buffer.append(cdata.getData());
324
+ buffer.append("]]>");
325
+ return true;
326
+ }
327
+
328
+ public void leave(CDATASection cdata) {
329
+ // no-op
330
+ }
331
+
332
+ public boolean enter(Comment comment) {
333
+ if (canonical) {
334
+ c14nNodeList.add(comment);
335
+ if (!with_comments) return true;
336
+ }
337
+ buffer.append("<!--");
338
+ buffer.append(comment.getData());
339
+ buffer.append("-->");
340
+ return true;
341
+ }
342
+
343
+ public void leave(Comment comment) {
344
+ // no-op
345
+ }
346
+
347
+ public boolean enter(Document document) {
348
+ if (!noDecl) {
349
+ buffer.append("<?xml version=\"");
350
+ buffer.append(document.getXmlVersion());
351
+ buffer.append("\"");
352
+
353
+ if (encoding != null) {
354
+ buffer.append(" encoding=\"");
355
+ buffer.append(encoding);
356
+ buffer.append("\"");
357
+ }
358
+ buffer.append("?>\n");
359
+ }
360
+ return true;
361
+ }
362
+
363
+ public void leave(Document document) {
364
+ // no-op
365
+ }
366
+
367
+ public boolean enter(DocumentType docType) {
368
+ if (canonical) {
369
+ c14nNodeList.add(docType);
370
+ return true;
371
+ }
372
+ String name = docType.getName();
373
+ String pubId = docType.getPublicId();
374
+ String sysId = docType.getSystemId();
375
+ String internalSubset = docType.getInternalSubset();
376
+ if (docType.getPreviousSibling() != null) {
377
+ buffer.append('\n');
378
+ }
379
+ buffer.append("<!DOCTYPE ").append(name).append(' ');
380
+ if (pubId != null) {
381
+ buffer.append("PUBLIC \"").append(pubId).append('"');
382
+ if (sysId != null) buffer.append(" \"").append(sysId).append('"');
383
+ } else if (sysId != null) {
384
+ buffer.append("SYSTEM \"").append(sysId).append('"');
385
+ }
386
+ if (internalSubset != null) {
387
+ buffer.append(' ').append('[');
388
+ buffer.append(internalSubset);
389
+ buffer.append(']');
390
+ }
391
+ buffer.append(">\n");
392
+ return true;
393
+ }
394
+
395
+ public void leave(DocumentType docType) {
396
+ // no-op
397
+ }
398
+
399
+ public boolean enter(Element element) {
400
+ if (canonical) {
401
+ c14nNodeList.add(element);
402
+ if (element == element.getOwnerDocument().getDocumentElement()) {
403
+ c14nNodeList.add(element.getOwnerDocument());
404
+ }
405
+ }
406
+ String current = indentation.peek();
407
+ buffer.append(current);
408
+ if (needIndent(element)) {
409
+ indentation.push(current + indentString);
410
+ }
411
+ String name = element.getTagName();
412
+ buffer.append('<').append(name);
413
+ Attr[] attrs = getAttrsAndNamespaces(element);
414
+ for (Attr attr : attrs) {
415
+ if (attr.getSpecified()) {
416
+ buffer.append(' ');
417
+ enter(attr);
418
+ leave(attr);
419
+ }
420
+ }
421
+ if (element.hasChildNodes()) {
422
+ buffer.append('>');
423
+ if (needBreakInOpening(element)) buffer.append('\n');
424
+ return true;
425
+ }
426
+ // no child
427
+ if (asHtml) {
428
+ buffer.append('>');
429
+ } else if (asXml && noEmpty) {
430
+ buffer.append('>');
431
+ } else if (asXhtml) {
432
+ if (isEmpty(name)) {
433
+ buffer.append(" />"); // see http://www.w3.org/TR/xhtml1/#C_2
434
+ } else {
435
+ buffer.append('>');
436
+ }
437
+ } else {
438
+ buffer.append("/>");
439
+ }
440
+ if (needBreakInOpening(element)) {
441
+ buffer.append('\n');
442
+ }
443
+ return true;
444
+ }
445
+
446
+ private boolean needIndent(Element element) {
447
+ if (containsText(element)) return false;
448
+ if (fragment) return false; // a given option might be fragment and format. fragment matters
449
+ if (format || asBuilder) return true;
450
+ return false;
451
+ }
452
+
453
+ private boolean needBreakInOpening(Element element) {
454
+ if (containsText(element)) return false;
455
+ if (fragment) return false;
456
+ if (format) return true;
457
+ if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
458
+ if (format && element.getNextSibling() == null && element.hasChildNodes()) return true;
459
+ return false;
460
+ }
461
+
462
+ private boolean isEmpty(String name) {
463
+ HTMLElements.Element element = HTMLElements.getElement(name);
464
+ return element.isEmpty();
465
+ }
466
+
467
+ private Attr[] getAttrsAndNamespaces(Element element) {
468
+ NamedNodeMap attrs = element.getAttributes();
469
+ if (!canonical) {
470
+ if (attrs == null || attrs.getLength() == 0) return new Attr[0];
471
+ Attr[] attrsAndNamespaces = new Attr[attrs.getLength()];
472
+ for (int i=0; i<attrs.getLength(); i++) {
473
+ attrsAndNamespaces[i] = (Attr) attrs.item(i);
474
+ }
475
+ return attrsAndNamespaces;
476
+ } else {
477
+ List<Attr> namespaces = new ArrayList<Attr>();
478
+ List<Attr> attributes = new ArrayList<Attr>();
479
+ if (subsets) {
480
+ getAttrsOfAncestors(element.getParentNode(), namespaces, attributes);
481
+ Attr[] namespaceOfAncestors = getSortedArray(namespaces);
482
+ Attr[] attributeOfAncestors = getSortedArray(attributes);
483
+ c14nNamespaceStack.push(namespaceOfAncestors);
484
+ c14nAttrStack.push(attributeOfAncestors);
485
+ subsets = false; // namespace propagation should be done only once on top level node.
486
+ }
487
+
488
+ getNamespacesAndAttrs(element, namespaces, attributes);
489
+
490
+ Attr[] namespaceArray = getSortedArray(namespaces);
491
+ Attr[] attributeArray = getSortedArray(attributes);
492
+ Attr[] allAttrs = new Attr[namespaceArray.length + attributeArray.length];
493
+ for (int i=0; i<allAttrs.length; i++) {
494
+ if (i < namespaceArray.length) {
495
+ allAttrs[i] = namespaceArray[i];
496
+ } else {
497
+ allAttrs[i] = attributeArray[i-namespaceArray.length];
498
+ }
499
+ }
500
+ c14nNamespaceStack.push(namespaceArray);
501
+ c14nAttrStack.push(attributeArray);
502
+ return allAttrs;
503
+ }
504
+
505
+ }
506
+
507
+ private void getAttrsOfAncestors(Node parent, List<Attr> namespaces, List<Attr> attributes) {
508
+ if (parent == null) return;
509
+ NamedNodeMap attrs = parent.getAttributes();
510
+ if (attrs == null || attrs.getLength() == 0) return;
511
+ for (int i=0; i < attrs.getLength(); i++) {
512
+ Attr attr = (Attr)attrs.item(i);
513
+ if (isNamespace(attr.getNodeName())) namespaces.add(attr);
514
+ else attributes.add(attr);
515
+ }
516
+ getAttrsOfAncestors(parent.getParentNode(), namespaces, attributes);
517
+ }
518
+
519
+ private void getNamespacesAndAttrs(Node current, List<Attr> namespaces, List<Attr> attributes) {
520
+ NamedNodeMap attrs = current.getAttributes();
521
+ for (int i=0; i<attrs.getLength(); i++) {
522
+ Attr attr = (Attr)attrs.item(i);
523
+ if (isNamespace(attr.getNodeName())) {
524
+ getNamespacesWithPropagated(namespaces, attr);
525
+ } else {
526
+ getAttributesWithPropagated(attributes, attr);
527
+ }
528
+ if (exclusive) {
529
+ verifyXmlSpace(attributes, attrs);
530
+ }
531
+ }
532
+ }
533
+
534
+ private void getNamespacesWithPropagated(List<Attr> namespaces, Attr attr) {
535
+ boolean newNamespace = true;
536
+ Iterator<Attr[]> iter = c14nNamespaceStack.iterator();
537
+ while (iter.hasNext()) {
538
+ Attr[] parentNamespaces = iter.next();
539
+ for (int n=0; n < parentNamespaces.length; n++) {
540
+ if (parentNamespaces[n].getNodeName().equals(attr.getNodeName())) {
541
+ if (parentNamespaces[n].getNodeValue().equals(attr.getNodeValue())) {
542
+ // exactly the same namespace should not be added
543
+ newNamespace = false;
544
+ } else {
545
+ // in case of namespace url change, propagated namespace will be override
546
+ namespaces.remove(parentNamespaces[n]);
547
+ }
548
+ }
549
+ }
550
+ if (newNamespace && !namespaces.contains(attr)) namespaces.add(attr);
551
+ }
552
+ }
553
+
554
+ private void getAttributesWithPropagated(List<Attr> attributes, Attr attr) {
555
+ boolean newAttribute = true;
556
+ Iterator<Attr[]> iter = c14nAttrStack.iterator();
557
+ while (iter.hasNext()) {
558
+ Attr[] parentAttr = iter.next();
559
+ for (int n=0; n < parentAttr.length; n++) {
560
+ if (!parentAttr[n].getNodeName().startsWith("xml:")) continue;
561
+ if (parentAttr[n].getNodeName().equals(attr.getNodeName())) {
562
+ if (parentAttr[n].getNodeValue().equals(attr.getNodeValue())) {
563
+ // exactly the same attribute should not be added
564
+ newAttribute = false;
565
+ } else {
566
+ // in case of attribute value change, propagated attribute will be override
567
+ attributes.remove(parentAttr[n]);
568
+ }
569
+ }
570
+ }
571
+ if (newAttribute) attributes.add(attr);
572
+ }
573
+ }
574
+
575
+ private void verifyXmlSpace(List<Attr> attributes, NamedNodeMap attrs) {
576
+ Attr attr = (Attr) attrs.getNamedItem("xml:space");
577
+ if (attr == null) {
578
+ for (int i=0; i < attributes.size(); i++) {
579
+ if (attributes.get(i).getNodeName().equals("xml:space")) {
580
+ attributes.remove(i);
581
+ break;
582
+ }
583
+ }
584
+ }
585
+ }
586
+
587
+ private Attr[] getSortedArray(List<Attr> attrList) {
588
+ Attr[] attrArray = attrList.toArray(new Attr[0]);
589
+ Arrays.sort(attrArray, new Comparator<Attr>() {
590
+ @Override
591
+ public int compare(Attr attr0, Attr attr1) {
592
+ return attr0.getNodeName().compareTo(attr1.getNodeName());
593
+ }
594
+ });
595
+ return attrArray;
596
+ }
597
+
598
+ public void leave(Element element) {
599
+ if (canonical) {
600
+ c14nNamespaceStack.poll();
601
+ c14nAttrStack.poll();
602
+ }
603
+ String name = element.getTagName();
604
+ if (element.hasChildNodes()) {
605
+ if (needIndentInClosing(element)) {
606
+ indentation.pop();
607
+ buffer.append(indentation.peek());
608
+ } else if (asBuilder) {
609
+ if (!containsText(element)) indentation.pop();
610
+ }
611
+ buffer.append("</").append(name).append('>');
612
+ if (needBreakInClosing(element)) {
613
+ buffer.append('\n');
614
+ }
615
+ return;
616
+ }
617
+ // no child, but HTML might need a closing tag.
618
+ if (asHtml || noEmpty) {
619
+ if (!isEmpty(name) && noEmpty) {
620
+ buffer.append("</").append(name).append('>');
621
+ }
622
+ }
623
+ if (needBreakInClosing(element)) {
624
+ if (!containsText(element)) indentation.pop();
625
+ buffer.append('\n');
626
+ }
627
+ }
628
+
629
+ private boolean needIndentInClosing(Element element) {
630
+ if (containsText(element)) return false;
631
+
632
+ if (fragment) return false; // a given option might be fragment and format. fragment matters
633
+ if (format) return true;
634
+ if (asBuilder && element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.ELEMENT_NODE) return true;
635
+ return false;
636
+ }
637
+
638
+ private boolean needBreakInClosing(Element element) {
639
+ if (fragment) return false;
640
+ if (format || asBuilder) return true;
641
+ return false;
642
+ }
643
+
644
+ private boolean containsText(Element element) {
645
+ return (element.getFirstChild() != null && element.getFirstChild().getNodeType() == Node.TEXT_NODE);
646
+ }
647
+
648
+ public boolean enter(Entity entity) {
649
+ String name = entity.getNodeName();
650
+ String pubId = entity.getPublicId();
651
+ String sysId = entity.getSystemId();
652
+ String notation = entity.getNotationName();
653
+ buffer.append("<!ENTITY ");
654
+ buffer.append(name);
655
+ if (pubId != null) {
656
+ buffer.append(" PUBLIC \"");
657
+ buffer.append(pubId);
658
+ buffer.append("\"");
659
+ }
660
+ if (sysId != null) {
661
+ buffer.append(" SYSTEM \"");
662
+ buffer.append(sysId);
663
+ buffer.append("\"");
664
+ }
665
+ if (notation != null) {
666
+ buffer.append(" NDATA ");
667
+ buffer.append(notation);
668
+ }
669
+ buffer.append(">");
670
+ return true;
671
+ }
672
+
673
+ public void leave(Entity entity) {
674
+ // no-op
675
+ }
676
+
677
+ public boolean enter(EntityReference entityRef) {
678
+ buffer.append('&').append(entityRef.getNodeName()).append(';');
679
+ return true;
680
+ }
681
+ public void leave(EntityReference entityRef) {
682
+ // no-op
683
+ }
684
+
685
+ public boolean enter(Notation notation) {
686
+ String name = notation.getNodeName();
687
+ String pubId = notation.getPublicId();
688
+ String sysId = notation.getSystemId();
689
+ buffer.append("<!NOTATION ");
690
+ buffer.append(name);
691
+ if (pubId != null) {
692
+ buffer.append(" PUBLIC \"");
693
+ buffer.append(pubId);
694
+ buffer.append("\"");
695
+ if (sysId != null) {
696
+ buffer.append(" \"");
697
+ buffer.append(sysId);
698
+ buffer.append("\"");
699
+ }
700
+ } else if (sysId != null) {
701
+ buffer.append(" SYSTEM \"");
702
+ buffer.append(sysId);
703
+ buffer.append("\"");
704
+ }
705
+ buffer.append(">");
706
+ return true;
707
+ }
708
+
709
+ public void leave(Notation notation) {
710
+ // no-op
711
+ }
712
+
713
+ public boolean enter(ProcessingInstruction pi) {
714
+ buffer.append("<?");
715
+ buffer.append(pi.getTarget());
716
+ buffer.append(" ");
717
+ buffer.append(pi.getData());
718
+ if (asHtml) buffer.append(">");
719
+ else buffer.append("?>");
720
+ buffer.append("\n");
721
+ if (canonical) c14nNodeList.add(pi);
722
+ return true;
723
+ }
724
+
725
+ public void leave(ProcessingInstruction pi) {
726
+ // no-op
727
+ }
728
+
729
+ private boolean isHtmlScript(Text text) {
730
+ return htmlDoc && text.getParentNode().getNodeName().equals("script");
731
+ }
732
+
733
+ private boolean isHtmlStyle(Text text) {
734
+ return htmlDoc && text.getParentNode().getNodeName().equals("style");
735
+ }
736
+
737
+ public boolean enter(Text text) {
738
+ CharSequence textContent = text.getNodeValue();
739
+ if (canonical) {
740
+ c14nNodeList.add(text);
741
+ if (isBlank(textContent)) {
742
+ buffer.append(canonicalizeWhitespace(textContent));
743
+ return true;
744
+ }
745
+ }
746
+
747
+ if (shouldEncode(text) && !isHtmlScript(text) && !isHtmlStyle(text)) {
748
+ textContent = encodeJavaString(textContent);
749
+ }
750
+
751
+ textContent = encodeStringToHtmlEntity(textContent);
752
+ buffer.append(textContent);
753
+ return true;
754
+ }
755
+
756
+ private CharSequence encodeStringToHtmlEntity(CharSequence text) {
757
+ if (encoding == null) return text;
758
+
759
+ CharsetEncoder encoder = Charset.forName(encoding).newEncoder();
760
+ StringBuilder sb = new StringBuilder(text.length() + 16);
761
+ // make sure we can handle code points that are higher than 2 bytes
762
+ for ( int i = 0; i < text.length(); ) {
763
+ int code = Character.codePointAt(text, i);
764
+ // TODO not sure about bigger offset then 2 ?!
765
+ int offset = code > 65535 ? 2 : 1;
766
+ CharSequence substr = text.subSequence(i, i + offset);
767
+ boolean canEncode = encoder.canEncode(substr);
768
+ if (canEncode) {
769
+ sb.append(substr);
770
+ }
771
+ else {
772
+ sb.append("&#x").append(Integer.toHexString(code)).append(';');
773
+ }
774
+ i += offset;
775
+ }
776
+ return sb;
777
+ }
778
+ }