nokogiri 1.5.0.beta.1 → 1.5.0.beta.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- data/CHANGELOG.ja.rdoc +28 -8
- data/CHANGELOG.rdoc +23 -0
- data/Manifest.txt +63 -1
- data/README.ja.rdoc +1 -1
- data/README.rdoc +22 -4
- data/Rakefile +6 -2
- data/ext/java/nokogiri/EncodingHandler.java +92 -0
- data/ext/java/nokogiri/HtmlDocument.java +116 -0
- data/ext/java/nokogiri/HtmlElementDescription.java +111 -0
- data/ext/java/nokogiri/HtmlEntityLookup.java +45 -0
- data/ext/java/nokogiri/HtmlSaxParserContext.java +218 -0
- data/ext/java/nokogiri/NokogiriService.java +370 -0
- data/ext/java/nokogiri/XmlAttr.java +147 -0
- data/ext/java/nokogiri/XmlAttributeDecl.java +98 -0
- data/ext/java/nokogiri/XmlCdata.java +50 -0
- data/ext/java/nokogiri/XmlComment.java +47 -0
- data/ext/java/nokogiri/XmlDocument.java +463 -0
- data/ext/java/nokogiri/XmlDocumentFragment.java +207 -0
- data/ext/java/nokogiri/XmlDtd.java +427 -0
- data/ext/java/nokogiri/XmlElement.java +172 -0
- data/ext/java/nokogiri/XmlElementContent.java +350 -0
- data/ext/java/nokogiri/XmlElementDecl.java +115 -0
- data/ext/java/nokogiri/XmlEntityDecl.java +129 -0
- data/ext/java/nokogiri/XmlEntityReference.java +42 -0
- data/ext/java/nokogiri/XmlNamespace.java +77 -0
- data/ext/java/nokogiri/XmlNode.java +1399 -0
- data/ext/java/nokogiri/XmlNodeSet.java +248 -0
- data/ext/java/nokogiri/XmlProcessingInstruction.java +70 -0
- data/ext/java/nokogiri/XmlReader.java +373 -0
- data/ext/java/nokogiri/XmlRelaxng.java +166 -0
- data/ext/java/nokogiri/XmlSaxParserContext.java +308 -0
- data/ext/java/nokogiri/XmlSaxPushParser.java +146 -0
- data/ext/java/nokogiri/XmlSchema.java +142 -0
- data/ext/java/nokogiri/XmlSyntaxError.java +84 -0
- data/ext/java/nokogiri/XmlText.java +96 -0
- data/ext/java/nokogiri/XmlXpathContext.java +130 -0
- data/ext/java/nokogiri/XsltStylesheet.java +126 -0
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +181 -0
- data/ext/java/nokogiri/internals/NokogiriDocumentCache.java +39 -0
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +42 -0
- data/ext/java/nokogiri/internals/NokogiriHandler.java +251 -0
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +526 -0
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +136 -0
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +80 -0
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +37 -0
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +54 -0
- data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +49 -0
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +88 -0
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +23 -0
- data/ext/java/nokogiri/internals/ParserContext.java +235 -0
- data/ext/java/nokogiri/internals/PushInputStream.java +381 -0
- data/ext/java/nokogiri/internals/ReaderNode.java +431 -0
- data/ext/java/nokogiri/internals/SaveContext.java +249 -0
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +35 -0
- data/ext/java/nokogiri/internals/XmlDeclHandler.java +10 -0
- data/ext/java/nokogiri/internals/XmlDomParser.java +45 -0
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +201 -0
- data/ext/java/nokogiri/internals/XmlSaxParser.java +33 -0
- data/ext/nokogiri/depend +32 -0
- data/ext/nokogiri/extconf.rb +61 -32
- data/ext/nokogiri/nokogiri.c +0 -5
- data/ext/nokogiri/nokogiri.h +2 -2
- data/ext/nokogiri/xml_document.c +5 -0
- data/ext/nokogiri/xml_libxml2_hacks.c +112 -0
- data/ext/nokogiri/xml_libxml2_hacks.h +12 -0
- data/ext/nokogiri/xml_node.c +56 -16
- data/ext/nokogiri/xml_node_set.c +7 -7
- data/ext/nokogiri/xml_reader.c +20 -1
- data/ext/nokogiri/xml_relax_ng.c +0 -7
- data/ext/nokogiri/xml_xpath_context.c +2 -0
- data/lib/isorelax.jar +0 -0
- data/lib/jing.jar +0 -0
- data/lib/nekodtd.jar +0 -0
- data/lib/nekohtml.jar +0 -0
- data/lib/nokogiri.rb +1 -2
- data/lib/nokogiri/css/generated_parser.rb +155 -148
- data/lib/nokogiri/css/generated_tokenizer.rb +2 -1
- data/lib/nokogiri/css/parser.y +3 -0
- data/lib/nokogiri/css/xpath_visitor.rb +1 -7
- data/lib/nokogiri/html.rb +2 -2
- data/lib/nokogiri/html/document_fragment.rb +7 -4
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/version.rb +3 -6
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +1 -2
- data/lib/nokogiri/xml/document_fragment.rb +7 -0
- data/lib/nokogiri/xml/node.rb +5 -3
- data/lib/nokogiri/xml/node_set.rb +25 -0
- data/lib/nokogiri/xml/reader.rb +2 -0
- data/lib/nokogiri/xml/sax/document.rb +3 -1
- data/lib/xercesImpl.jar +0 -0
- data/spec/helper.rb +3 -0
- data/spec/xml/reader_spec.rb +307 -0
- data/tasks/test.rb +1 -1
- data/test/css/test_parser.rb +11 -1
- data/test/html/sax/test_parser_context.rb +2 -2
- data/test/html/test_document.rb +2 -2
- data/test/html/test_document_fragment.rb +34 -6
- data/test/test_memory_leak.rb +2 -2
- data/test/test_reader.rb +28 -6
- data/test/test_xslt_transforms.rb +2 -3
- data/test/xml/test_attr.rb +31 -4
- data/test/xml/test_builder.rb +5 -5
- data/test/xml/test_cdata.rb +3 -3
- data/test/xml/test_document.rb +8 -8
- data/test/xml/test_document_fragment.rb +4 -12
- data/test/xml/test_node.rb +1 -1
- data/test/xml/test_node_reparenting.rb +26 -11
- data/test/xml/test_node_set.rb +38 -2
- data/test/xml/test_text.rb +11 -2
- data/test/xml/test_unparented_node.rb +1 -1
- data/test/xml/test_xpath.rb +11 -7
- metadata +68 -5
- data/lib/nokogiri/version_warning.rb +0 -14
@@ -0,0 +1,130 @@
|
|
1
|
+
package nokogiri;
|
2
|
+
|
3
|
+
import java.util.Set;
|
4
|
+
import java.util.regex.Matcher;
|
5
|
+
import java.util.regex.Pattern;
|
6
|
+
|
7
|
+
import javax.xml.xpath.XPath;
|
8
|
+
import javax.xml.xpath.XPathConstants;
|
9
|
+
import javax.xml.xpath.XPathExpression;
|
10
|
+
import javax.xml.xpath.XPathExpressionException;
|
11
|
+
import javax.xml.xpath.XPathFactory;
|
12
|
+
|
13
|
+
import nokogiri.internals.NokogiriNamespaceContext;
|
14
|
+
import nokogiri.internals.NokogiriXPathFunctionResolver;
|
15
|
+
|
16
|
+
import org.jruby.Ruby;
|
17
|
+
import org.jruby.RubyBoolean;
|
18
|
+
import org.jruby.RubyClass;
|
19
|
+
import org.jruby.RubyException;
|
20
|
+
import org.jruby.RubyNumeric;
|
21
|
+
import org.jruby.RubyObject;
|
22
|
+
import org.jruby.RubyString;
|
23
|
+
import org.jruby.anno.JRubyClass;
|
24
|
+
import org.jruby.anno.JRubyMethod;
|
25
|
+
import org.jruby.exceptions.RaiseException;
|
26
|
+
import org.jruby.runtime.ThreadContext;
|
27
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
28
|
+
import org.w3c.dom.NodeList;
|
29
|
+
|
30
|
+
@JRubyClass(name="Nokogiri::XML::XPathContext")
|
31
|
+
public class XmlXpathContext extends RubyObject {
|
32
|
+
private XmlNode context;
|
33
|
+
private XPath xpath;
|
34
|
+
|
35
|
+
public XmlXpathContext(Ruby ruby, RubyClass rubyClass, XmlNode context) {
|
36
|
+
super(ruby, rubyClass);
|
37
|
+
this.context = context;
|
38
|
+
this.xpath = XPathFactory.newInstance().newXPath();
|
39
|
+
this.xpath.setNamespaceContext(new NokogiriNamespaceContext());
|
40
|
+
}
|
41
|
+
|
42
|
+
@JRubyMethod(name = "new", meta = true)
|
43
|
+
public static IRubyObject rbNew(ThreadContext context, IRubyObject cls, IRubyObject node) {
|
44
|
+
XmlNode xmlNode = (XmlNode)node;
|
45
|
+
return new XmlXpathContext(context.getRuntime(), (RubyClass)cls, xmlNode);
|
46
|
+
}
|
47
|
+
|
48
|
+
@JRubyMethod
|
49
|
+
public IRubyObject evaluate(ThreadContext context, IRubyObject expr, IRubyObject handler) {
|
50
|
+
String src = expr.convertToString().asJavaString();
|
51
|
+
try {
|
52
|
+
if(!handler.isNil()) {
|
53
|
+
if (!isContainsPrefix(src)) {
|
54
|
+
Set<String> methodNames = handler.getMetaClass().getMethods().keySet();
|
55
|
+
for (String name : methodNames) {
|
56
|
+
src = src.replaceAll(name, NokogiriNamespaceContext.NOKOGIRI_PREFIX+":"+name);
|
57
|
+
}
|
58
|
+
}
|
59
|
+
xpath.setXPathFunctionResolver(new NokogiriXPathFunctionResolver(handler));
|
60
|
+
}
|
61
|
+
XPathExpression xpathExpression = xpath.compile(src);
|
62
|
+
return node_set(context, xpathExpression);
|
63
|
+
//return new XmlXpath(context.getRuntime(), (RubyClass)context.getRuntime().getClassFromPath("Nokogiri::XML::XPath"), xpathExpression, this.context);
|
64
|
+
} catch (XPathExpressionException xpee) {
|
65
|
+
xpee = new XPathExpressionException(src);
|
66
|
+
RubyException e =
|
67
|
+
XmlSyntaxError.createXPathSyntaxError(getRuntime(), xpee);
|
68
|
+
throw new RaiseException(e);
|
69
|
+
}
|
70
|
+
}
|
71
|
+
|
72
|
+
protected IRubyObject node_set(ThreadContext rbctx, XPathExpression xpathExpression) {
|
73
|
+
XmlNodeSet result = null;
|
74
|
+
try {
|
75
|
+
result = tryGetNodeSet(xpathExpression);
|
76
|
+
// result.relink_namespace(context);
|
77
|
+
result.setDocument(context.document(rbctx));
|
78
|
+
return result;
|
79
|
+
} catch (XPathExpressionException xpee) {
|
80
|
+
try {
|
81
|
+
return tryGetOpaqueValue(xpathExpression);
|
82
|
+
} catch (XPathExpressionException xpee_opaque) {
|
83
|
+
RubyException e = XmlSyntaxError.createXPathSyntaxError(getRuntime(), xpee_opaque);
|
84
|
+
throw new RaiseException(e);
|
85
|
+
}
|
86
|
+
}
|
87
|
+
}
|
88
|
+
|
89
|
+
private XmlNodeSet tryGetNodeSet(XPathExpression xpathExpression) throws XPathExpressionException {
|
90
|
+
NodeList nodes = (NodeList)xpathExpression.evaluate(context.node, XPathConstants.NODESET);
|
91
|
+
return new XmlNodeSet(getRuntime(), nodes);
|
92
|
+
}
|
93
|
+
|
94
|
+
private static Pattern number_pattern = Pattern.compile("\\d.*");
|
95
|
+
private static Pattern boolean_pattern = Pattern.compile("true|false");
|
96
|
+
|
97
|
+
private IRubyObject tryGetOpaqueValue(XPathExpression xpathExpression) throws XPathExpressionException {
|
98
|
+
String string = (String)xpathExpression.evaluate(context.node, XPathConstants.STRING);
|
99
|
+
if (doesMatch(number_pattern, string)) return RubyNumeric.dbl2num(getRuntime(), Double.parseDouble(string));
|
100
|
+
if (doesMatch(boolean_pattern, string)) return RubyBoolean.newBoolean(getRuntime(), Boolean.parseBoolean(string));
|
101
|
+
return RubyString.newString(getRuntime(), string);
|
102
|
+
}
|
103
|
+
|
104
|
+
private boolean doesMatch(Pattern pattern, String string) {
|
105
|
+
Matcher m = pattern.matcher(string);
|
106
|
+
return m.matches();
|
107
|
+
}
|
108
|
+
|
109
|
+
private boolean isContainsPrefix(String str) {
|
110
|
+
Set<String> prefixes = ((NokogiriNamespaceContext)xpath.getNamespaceContext()).getAllPrefixes();
|
111
|
+
for (String prefix : prefixes) {
|
112
|
+
if (str.contains(prefix + ":")) {
|
113
|
+
return true;
|
114
|
+
}
|
115
|
+
}
|
116
|
+
return false;
|
117
|
+
}
|
118
|
+
|
119
|
+
|
120
|
+
@JRubyMethod
|
121
|
+
public IRubyObject evaluate(ThreadContext context, IRubyObject expr) {
|
122
|
+
return this.evaluate(context, expr, context.getRuntime().getNil());
|
123
|
+
}
|
124
|
+
|
125
|
+
@JRubyMethod
|
126
|
+
public IRubyObject register_ns(ThreadContext context, IRubyObject prefix, IRubyObject uri) {
|
127
|
+
((NokogiriNamespaceContext) this.xpath.getNamespaceContext()).registerNamespace(prefix.convertToString().asJavaString(), uri.convertToString().asJavaString());
|
128
|
+
return this;
|
129
|
+
}
|
130
|
+
}
|
@@ -0,0 +1,126 @@
|
|
1
|
+
package nokogiri;
|
2
|
+
|
3
|
+
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
4
|
+
|
5
|
+
import java.util.regex.Matcher;
|
6
|
+
import java.util.regex.Pattern;
|
7
|
+
|
8
|
+
import javax.xml.transform.Templates;
|
9
|
+
import javax.xml.transform.Transformer;
|
10
|
+
import javax.xml.transform.TransformerConfigurationException;
|
11
|
+
import javax.xml.transform.TransformerException;
|
12
|
+
import javax.xml.transform.TransformerFactory;
|
13
|
+
import javax.xml.transform.dom.DOMResult;
|
14
|
+
import javax.xml.transform.dom.DOMSource;
|
15
|
+
|
16
|
+
import org.jruby.Ruby;
|
17
|
+
import org.jruby.RubyArray;
|
18
|
+
import org.jruby.RubyClass;
|
19
|
+
import org.jruby.RubyObject;
|
20
|
+
import org.jruby.anno.JRubyClass;
|
21
|
+
import org.jruby.anno.JRubyMethod;
|
22
|
+
import org.jruby.javasupport.util.RuntimeHelpers;
|
23
|
+
import org.jruby.runtime.ThreadContext;
|
24
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
25
|
+
import org.w3c.dom.Document;
|
26
|
+
|
27
|
+
@JRubyClass(name="Nokogiri::XSLT::Stylesheet")
|
28
|
+
public class XsltStylesheet extends RubyObject {
|
29
|
+
|
30
|
+
private Templates sheet;
|
31
|
+
|
32
|
+
public XsltStylesheet(Ruby ruby, RubyClass rubyClass) {
|
33
|
+
super(ruby, rubyClass);
|
34
|
+
}
|
35
|
+
|
36
|
+
private void addParametersToTransformer(ThreadContext context, Transformer transf, IRubyObject parameters) {
|
37
|
+
Ruby ruby = context.getRuntime();
|
38
|
+
RubyArray params = parameters.convertToArray();
|
39
|
+
int limit = params.getLength();
|
40
|
+
if(limit % 2 == 1) limit--;
|
41
|
+
|
42
|
+
for(int i = 0; i < limit; i+=2) {
|
43
|
+
String name = params.aref(ruby.newFixnum(i)).asJavaString();
|
44
|
+
String value = params.aref(ruby.newFixnum(i+1)).asJavaString();
|
45
|
+
transf.setParameter(name, unparseValue(value));
|
46
|
+
}
|
47
|
+
}
|
48
|
+
|
49
|
+
private Pattern p = Pattern.compile("'.{1,}'");
|
50
|
+
|
51
|
+
private String unparseValue(String orig) {
|
52
|
+
Matcher m = p.matcher(orig);
|
53
|
+
if ((orig.startsWith("\"") && orig.endsWith("\"")) || m.matches()) {
|
54
|
+
orig = orig.substring(1, orig.length()-1);
|
55
|
+
}
|
56
|
+
|
57
|
+
return orig;
|
58
|
+
}
|
59
|
+
|
60
|
+
@JRubyMethod(meta = true)
|
61
|
+
public static IRubyObject parse_stylesheet_doc(ThreadContext context, IRubyObject cls, IRubyObject document) {
|
62
|
+
|
63
|
+
Ruby ruby = context.getRuntime();
|
64
|
+
|
65
|
+
if(!(document instanceof XmlDocument)) {
|
66
|
+
throw ruby.newArgumentError("doc must be a Nokogiri::XML::Document instance");
|
67
|
+
}
|
68
|
+
|
69
|
+
XmlDocument xmlDoc = (XmlDocument) document;
|
70
|
+
|
71
|
+
RubyArray errors = (RubyArray) xmlDoc.getInstanceVariable("@errors");
|
72
|
+
|
73
|
+
if(!errors.isEmpty()) {
|
74
|
+
throw ruby.newRuntimeError(errors.first().asJavaString());
|
75
|
+
}
|
76
|
+
|
77
|
+
Document doc = ((XmlDocument) xmlDoc.dup_implementation(context, true)).getDocument();
|
78
|
+
|
79
|
+
XsltStylesheet xslt = new XsltStylesheet(ruby, (RubyClass) cls);
|
80
|
+
try {
|
81
|
+
xslt.sheet = TransformerFactory.newInstance().newTemplates(new DOMSource(doc));
|
82
|
+
} catch (TransformerConfigurationException ex) {
|
83
|
+
ruby.newRuntimeError("could not parse xslt stylesheet");
|
84
|
+
}
|
85
|
+
|
86
|
+
return xslt;
|
87
|
+
}
|
88
|
+
|
89
|
+
@JRubyMethod
|
90
|
+
public IRubyObject serialize(ThreadContext context, IRubyObject doc) {
|
91
|
+
System.out.println("Serialize called in stylesheet");
|
92
|
+
return RuntimeHelpers.invoke(context,
|
93
|
+
RuntimeHelpers.invoke(context, doc, "root"),
|
94
|
+
"to_s");
|
95
|
+
}
|
96
|
+
|
97
|
+
@JRubyMethod(rest = true, required=1, optional=2)
|
98
|
+
public IRubyObject transform(ThreadContext context, IRubyObject[] args) {
|
99
|
+
Ruby ruby = context.getRuntime();
|
100
|
+
|
101
|
+
DOMSource docSource = new DOMSource(((XmlDocument) args[0]).getDocument());
|
102
|
+
DOMResult result = new DOMResult();
|
103
|
+
|
104
|
+
try{
|
105
|
+
Transformer transf = this.sheet.newTransformer();
|
106
|
+
if(args.length > 1) {
|
107
|
+
addParametersToTransformer(context, transf, args[1]);
|
108
|
+
}
|
109
|
+
transf.transform(docSource, result);
|
110
|
+
} catch(TransformerConfigurationException ex) {
|
111
|
+
throw ruby.newRuntimeError("Could not transform the document.");
|
112
|
+
} catch(TransformerException ex) {
|
113
|
+
throw ruby.newRuntimeError("Could not transform the document.");
|
114
|
+
}
|
115
|
+
|
116
|
+
if ("html".equals(result.getNode().getFirstChild().getNodeName())) {
|
117
|
+
return new HtmlDocument(ruby,
|
118
|
+
getNokogiriClass(ruby, "Nokogiri::HTML::Document"),
|
119
|
+
(Document) result.getNode());
|
120
|
+
} else {
|
121
|
+
return new XmlDocument(ruby,
|
122
|
+
getNokogiriClass(ruby, "Nokogiri::XML::Document"),
|
123
|
+
(Document) result.getNode());
|
124
|
+
}
|
125
|
+
}
|
126
|
+
}
|
@@ -0,0 +1,181 @@
|
|
1
|
+
package nokogiri.internals;
|
2
|
+
|
3
|
+
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
4
|
+
import static nokogiri.internals.NokogiriHelpers.isNamespace;
|
5
|
+
import nokogiri.HtmlDocument;
|
6
|
+
import nokogiri.XmlDocument;
|
7
|
+
|
8
|
+
import org.apache.xerces.parsers.DOMParser;
|
9
|
+
import org.apache.xerces.xni.Augmentations;
|
10
|
+
import org.apache.xerces.xni.QName;
|
11
|
+
import org.apache.xerces.xni.XMLAttributes;
|
12
|
+
import org.apache.xerces.xni.XNIException;
|
13
|
+
import org.apache.xerces.xni.parser.XMLDocumentFilter;
|
14
|
+
import org.apache.xerces.xni.parser.XMLParserConfiguration;
|
15
|
+
import org.cyberneko.html.HTMLConfiguration;
|
16
|
+
import org.cyberneko.html.filters.DefaultFilter;
|
17
|
+
import org.jruby.Ruby;
|
18
|
+
import org.jruby.RubyClass;
|
19
|
+
import org.jruby.runtime.ThreadContext;
|
20
|
+
import org.jruby.runtime.builtin.IRubyObject;
|
21
|
+
import org.w3c.dom.Document;
|
22
|
+
|
23
|
+
/**
|
24
|
+
*
|
25
|
+
* @author sergio
|
26
|
+
*/
|
27
|
+
public class HtmlDomParserContext extends XmlDomParserContext {
|
28
|
+
protected static final String PROPERTY_FILTERS =
|
29
|
+
"http://cyberneko.org/html/properties/filters";
|
30
|
+
protected static final String PROPERTY_ELEM_NAMES =
|
31
|
+
"http://cyberneko.org/html/properties/names/elems";
|
32
|
+
protected static final String PROPERTY_ATTRS_NAMES =
|
33
|
+
"http://cyberneko.org/html/properties/names/attrs";
|
34
|
+
protected static final String FEATURE_DOCUMENT_FRAGMENT =
|
35
|
+
"http://cyberneko.org/html/features/balance-tags/document-fragment";
|
36
|
+
protected static final String FEATURE_REPORT_ERRORS =
|
37
|
+
"http://cyberneko.org/html/features/report-errors";
|
38
|
+
|
39
|
+
public HtmlDomParserContext(Ruby runtime, IRubyObject options) {
|
40
|
+
super(runtime, options);
|
41
|
+
}
|
42
|
+
|
43
|
+
public HtmlDomParserContext(Ruby runtime, IRubyObject encoding, IRubyObject options) {
|
44
|
+
super(runtime, encoding, options);
|
45
|
+
}
|
46
|
+
|
47
|
+
@Override
|
48
|
+
protected void initErrorHandler() {
|
49
|
+
if (continuesOnError()) {
|
50
|
+
errorHandler = new NokogiriNonStrictErrorHandler4NekoHtml();
|
51
|
+
} else if (options.noError) {
|
52
|
+
errorHandler = new NokogiriNonStrictErrorHandler4NekoHtml(options.noError);
|
53
|
+
} else {
|
54
|
+
errorHandler = new NokogiriStrictErrorHandler();
|
55
|
+
}
|
56
|
+
}
|
57
|
+
|
58
|
+
@Override
|
59
|
+
protected void initParser(Ruby runtime) {
|
60
|
+
XMLParserConfiguration config = new HTMLConfiguration();
|
61
|
+
XMLDocumentFilter removeNSAttrsFilter = new RemoveNSAttrsFilter();
|
62
|
+
XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
|
63
|
+
//XMLDocumentFilter[] filters = { removeNSAttrsFilter};
|
64
|
+
XMLDocumentFilter[] filters = { removeNSAttrsFilter, elementValidityCheckFilter};
|
65
|
+
|
66
|
+
config.setErrorHandler(this.errorHandler);
|
67
|
+
parser = new DOMParser(config);
|
68
|
+
|
69
|
+
setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
|
70
|
+
setProperty(PROPERTY_ELEM_NAMES, "lower");
|
71
|
+
setProperty(PROPERTY_ATTRS_NAMES, "lower");
|
72
|
+
setFeature(FEATURE_REPORT_ERRORS, true);
|
73
|
+
setFeature("http://xml.org/sax/features/namespaces", false);
|
74
|
+
setProperty(PROPERTY_FILTERS, filters);
|
75
|
+
}
|
76
|
+
|
77
|
+
/**
|
78
|
+
* Enable NekoHTML feature for balancing tags in a document
|
79
|
+
* fragment.
|
80
|
+
*/
|
81
|
+
public void enableDocumentFragment() {
|
82
|
+
setFeature(FEATURE_DOCUMENT_FRAGMENT, true);
|
83
|
+
}
|
84
|
+
|
85
|
+
@Override
|
86
|
+
protected XmlDocument getNewEmptyDocument(ThreadContext context) {
|
87
|
+
IRubyObject[] args = new IRubyObject[0];
|
88
|
+
return (XmlDocument) XmlDocument.rbNew(context,
|
89
|
+
getNokogiriClass(context.getRuntime(), "Nokogiri::XML::Document"),
|
90
|
+
args);
|
91
|
+
}
|
92
|
+
|
93
|
+
@Override
|
94
|
+
protected XmlDocument wrapDocument(ThreadContext context,
|
95
|
+
RubyClass klass,
|
96
|
+
Document doc) {
|
97
|
+
HtmlDocument htmlDocument = new HtmlDocument(context.getRuntime(), klass, doc);
|
98
|
+
htmlDocument.setEncoding(ruby_encoding);
|
99
|
+
return htmlDocument;
|
100
|
+
}
|
101
|
+
|
102
|
+
/**
|
103
|
+
* Filter to strip out attributes that pertain to XML namespaces.
|
104
|
+
*
|
105
|
+
* @author sergio
|
106
|
+
* @author Patrick Mahoney <pat@polycrystal.org>
|
107
|
+
*/
|
108
|
+
public static class RemoveNSAttrsFilter extends DefaultFilter {
|
109
|
+
@Override
|
110
|
+
public void startElement(QName element, XMLAttributes attrs,
|
111
|
+
Augmentations augs) throws XNIException {
|
112
|
+
int i;
|
113
|
+
for (i = 0; i < attrs.getLength(); ++i) {
|
114
|
+
if (isNamespace(attrs.getQName(i))) {
|
115
|
+
attrs.removeAttributeAt(i);
|
116
|
+
--i;
|
117
|
+
}
|
118
|
+
}
|
119
|
+
|
120
|
+
element.uri = null;
|
121
|
+
super.startElement(element, attrs, augs);
|
122
|
+
}
|
123
|
+
}
|
124
|
+
|
125
|
+
public static class ElementValidityCheckFilter extends DefaultFilter {
|
126
|
+
private NokogiriErrorHandler errorHandler;
|
127
|
+
|
128
|
+
private ElementValidityCheckFilter(NokogiriErrorHandler errorHandler) {
|
129
|
+
this.errorHandler = errorHandler;
|
130
|
+
}
|
131
|
+
|
132
|
+
// element names from xhtml1-strict.dtd
|
133
|
+
private static String[][] element_names = {
|
134
|
+
{"a", "abbr", "acronym", "address", "area"},
|
135
|
+
{"b", "base", "basefont", "bdo", "big", "blockquote", "body", "br", "button"},
|
136
|
+
{"caption", "cite", "code", "col", "colgroup"},
|
137
|
+
{"dd", "del", "dfn", "div", "dl", "dt"},
|
138
|
+
{"em"},
|
139
|
+
{"fieldset", "font", "form", "frame", "frameset"},
|
140
|
+
{}, // g
|
141
|
+
{"h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html"},
|
142
|
+
{"i", "iframe", "img", "input", "ins"},
|
143
|
+
{}, // j
|
144
|
+
{"kbd"},
|
145
|
+
{"label", "legend", "li", "link"},
|
146
|
+
{"map", "meta"},
|
147
|
+
{"noframes", "noscript"},
|
148
|
+
{"object", "ol", "optgroup", "option"},
|
149
|
+
{"p", "param", "pre"},
|
150
|
+
{"q"},
|
151
|
+
{}, // r
|
152
|
+
{"s", "samp", "script", "select", "small", "span", "strike", "strong", "style", "sub", "sup"},
|
153
|
+
{"table", "tbody", "td", "textarea", "tfoot", "th", "thead", "title", "tr", "tt"},
|
154
|
+
{"u", "ul"},
|
155
|
+
{"var"},
|
156
|
+
{}, // w
|
157
|
+
{}, // x
|
158
|
+
{}, // y
|
159
|
+
{} // z
|
160
|
+
};
|
161
|
+
|
162
|
+
private boolean isValid(String testee) {
|
163
|
+
char[] c = testee.toCharArray();
|
164
|
+
int index = new Integer(c[0]) - 97;
|
165
|
+
for (int i=0; i<element_names[index].length; i++) {
|
166
|
+
if (testee.equals(element_names[index][i])) {
|
167
|
+
return true;
|
168
|
+
}
|
169
|
+
}
|
170
|
+
return false;
|
171
|
+
}
|
172
|
+
|
173
|
+
@Override
|
174
|
+
public void startElement(QName name, XMLAttributes attrs, Augmentations augs) throws XNIException {
|
175
|
+
if (!isValid(name.rawname)) {
|
176
|
+
errorHandler.addError(new Exception("Tag " + name.rawname + " invalid"));
|
177
|
+
}
|
178
|
+
super.startElement(name, attrs, augs);
|
179
|
+
}
|
180
|
+
}
|
181
|
+
}
|
@@ -0,0 +1,39 @@
|
|
1
|
+
package nokogiri.internals;
|
2
|
+
|
3
|
+
import java.util.Hashtable;
|
4
|
+
import nokogiri.XmlDocument;
|
5
|
+
import org.w3c.dom.Document;
|
6
|
+
|
7
|
+
/**
|
8
|
+
*
|
9
|
+
* @author sergio
|
10
|
+
*/
|
11
|
+
public class NokogiriDocumentCache {
|
12
|
+
|
13
|
+
private static NokogiriDocumentCache instance;
|
14
|
+
protected Hashtable<Document, XmlDocument> cache;
|
15
|
+
|
16
|
+
private NokogiriDocumentCache() {
|
17
|
+
this.cache = new Hashtable<Document, XmlDocument>();
|
18
|
+
}
|
19
|
+
|
20
|
+
public static NokogiriDocumentCache getInstance() {
|
21
|
+
if(instance == null) {
|
22
|
+
instance = new NokogiriDocumentCache();
|
23
|
+
}
|
24
|
+
return instance;
|
25
|
+
}
|
26
|
+
|
27
|
+
public XmlDocument getXmlDocument(Document doc) {
|
28
|
+
return this.cache.get(doc);
|
29
|
+
}
|
30
|
+
|
31
|
+
public void putDocument(Document doc, XmlDocument xmlDoc) {
|
32
|
+
this.cache.put(doc, xmlDoc);
|
33
|
+
}
|
34
|
+
|
35
|
+
public XmlDocument removeDocument(Document doc) {
|
36
|
+
return this.cache.remove(doc);
|
37
|
+
}
|
38
|
+
|
39
|
+
}
|