nokogiri 1.10.10-java → 1.11.0-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE-DEPENDENCIES.md +1015 -947
- data/README.md +165 -91
- data/ext/java/nokogiri/HtmlDocument.java +34 -46
- data/ext/java/nokogiri/HtmlSaxParserContext.java +88 -58
- data/ext/java/nokogiri/HtmlSaxPushParser.java +1 -1
- data/ext/java/nokogiri/NokogiriService.java +1 -1
- data/ext/java/nokogiri/XmlAttr.java +13 -20
- data/ext/java/nokogiri/XmlAttributeDecl.java +11 -12
- data/ext/java/nokogiri/XmlCdata.java +3 -4
- data/ext/java/nokogiri/XmlComment.java +1 -1
- data/ext/java/nokogiri/XmlDocument.java +148 -175
- data/ext/java/nokogiri/XmlDocumentFragment.java +13 -31
- data/ext/java/nokogiri/XmlDtd.java +5 -8
- data/ext/java/nokogiri/XmlElement.java +1 -20
- data/ext/java/nokogiri/XmlElementDecl.java +23 -28
- data/ext/java/nokogiri/XmlEntityDecl.java +23 -27
- data/ext/java/nokogiri/XmlEntityReference.java +2 -2
- data/ext/java/nokogiri/XmlNamespace.java +72 -89
- data/ext/java/nokogiri/XmlNode.java +303 -406
- data/ext/java/nokogiri/XmlNodeSet.java +70 -76
- data/ext/java/nokogiri/XmlReader.java +12 -13
- data/ext/java/nokogiri/XmlRelaxng.java +10 -3
- data/ext/java/nokogiri/XmlSaxParserContext.java +15 -10
- data/ext/java/nokogiri/XmlSchema.java +87 -27
- data/ext/java/nokogiri/XmlSyntaxError.java +2 -6
- data/ext/java/nokogiri/XmlText.java +12 -9
- data/ext/java/nokogiri/XmlXpathContext.java +55 -25
- data/ext/java/nokogiri/XsltStylesheet.java +7 -15
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +52 -46
- data/ext/java/nokogiri/internals/NokogiriHandler.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +71 -135
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +90 -58
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +9 -2
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +67 -10
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +4 -2
- data/ext/java/nokogiri/internals/ParserContext.java +27 -73
- data/ext/java/nokogiri/internals/ReaderNode.java +2 -4
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +18 -33
- data/ext/nokogiri/depend +476 -357
- data/ext/nokogiri/extconf.rb +507 -357
- data/ext/nokogiri/html_document.c +79 -78
- data/ext/nokogiri/html_sax_parser_context.c +2 -2
- data/ext/nokogiri/nokogiri.c +34 -40
- data/ext/nokogiri/xml_document.c +18 -4
- data/ext/nokogiri/xml_io.c +8 -6
- data/ext/nokogiri/xml_node.c +21 -1
- data/ext/nokogiri/xml_node_set.c +1 -1
- data/ext/nokogiri/xml_reader.c +6 -17
- data/ext/nokogiri/xml_relax_ng.c +29 -11
- data/ext/nokogiri/xml_sax_parser.c +2 -7
- data/ext/nokogiri/xml_sax_parser_context.c +2 -2
- data/ext/nokogiri/xml_schema.c +55 -13
- data/ext/nokogiri/xml_xpath_context.c +80 -4
- data/ext/nokogiri/xslt_stylesheet.c +1 -8
- data/lib/nokogiri.rb +22 -22
- data/lib/nokogiri/css.rb +1 -0
- data/lib/nokogiri/css/node.rb +1 -0
- data/lib/nokogiri/css/parser.rb +63 -62
- data/lib/nokogiri/css/parser.y +2 -2
- data/lib/nokogiri/css/parser_extras.rb +39 -36
- data/lib/nokogiri/css/syntax_error.rb +1 -0
- data/lib/nokogiri/css/tokenizer.rb +1 -0
- data/lib/nokogiri/css/xpath_visitor.rb +73 -43
- data/lib/nokogiri/decorators/slop.rb +1 -0
- data/lib/nokogiri/html.rb +1 -0
- data/lib/nokogiri/html/builder.rb +1 -0
- data/lib/nokogiri/html/document.rb +13 -26
- data/lib/nokogiri/html/document_fragment.rb +1 -0
- data/lib/nokogiri/html/element_description.rb +1 -0
- data/lib/nokogiri/html/element_description_defaults.rb +1 -0
- data/lib/nokogiri/html/entity_lookup.rb +1 -0
- data/lib/nokogiri/html/sax/parser.rb +1 -0
- data/lib/nokogiri/html/sax/parser_context.rb +1 -0
- data/lib/nokogiri/html/sax/push_parser.rb +1 -0
- data/lib/nokogiri/jruby/dependencies.rb +20 -0
- data/lib/nokogiri/nokogiri.jar +0 -0
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version.rb +3 -109
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +182 -0
- data/lib/nokogiri/xml.rb +1 -0
- data/lib/nokogiri/xml/attr.rb +1 -0
- data/lib/nokogiri/xml/attribute_decl.rb +1 -0
- data/lib/nokogiri/xml/builder.rb +3 -2
- data/lib/nokogiri/xml/cdata.rb +1 -0
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +20 -15
- data/lib/nokogiri/xml/document_fragment.rb +5 -6
- data/lib/nokogiri/xml/dtd.rb +1 -0
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +1 -0
- data/lib/nokogiri/xml/entity_decl.rb +1 -0
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +1 -0
- data/lib/nokogiri/xml/node.rb +587 -249
- data/lib/nokogiri/xml/node/save_options.rb +1 -0
- data/lib/nokogiri/xml/node_set.rb +1 -0
- data/lib/nokogiri/xml/notation.rb +1 -0
- data/lib/nokogiri/xml/parse_options.rb +10 -3
- data/lib/nokogiri/xml/pp.rb +1 -0
- data/lib/nokogiri/xml/pp/character_data.rb +1 -0
- data/lib/nokogiri/xml/pp/node.rb +1 -0
- data/lib/nokogiri/xml/processing_instruction.rb +1 -0
- data/lib/nokogiri/xml/reader.rb +7 -3
- data/lib/nokogiri/xml/relax_ng.rb +7 -2
- data/lib/nokogiri/xml/sax.rb +1 -0
- data/lib/nokogiri/xml/sax/document.rb +1 -0
- data/lib/nokogiri/xml/sax/parser.rb +1 -0
- data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
- data/lib/nokogiri/xml/schema.rb +13 -4
- data/lib/nokogiri/xml/searchable.rb +25 -16
- data/lib/nokogiri/xml/syntax_error.rb +1 -0
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -0
- data/lib/nokogiri/xml/xpath_context.rb +1 -0
- data/lib/nokogiri/xslt.rb +1 -0
- data/lib/nokogiri/xslt/stylesheet.rb +1 -0
- data/lib/xsd/xmlparser/nokogiri.rb +1 -0
- metadata +86 -159
- data/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java +0 -107
- data/ext/java/nokogiri/internals/UncloseableInputStream.java +0 -102
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/nokogiri.h +0 -121
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
data/README.md
CHANGED
@@ -1,105 +1,153 @@
|
|
1
|
+
<div><img src="https://nokogiri.org/images/nokogiri-serif-black.png" align="right"/></div>
|
2
|
+
|
1
3
|
# Nokogiri
|
2
4
|
|
3
|
-
|
5
|
+
Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for reading, writing, modifying, and querying documents. It is fast and standards-compliant by relying on native parsers like libxml2 (C) and xerces (Java).
|
6
|
+
|
7
|
+
## Guiding Principles
|
8
|
+
|
9
|
+
Some guiding principles Nokogiri tries to follow:
|
4
10
|
|
5
|
-
|
6
|
-
|
7
|
-
or CSS3 selectors.
|
11
|
+
- be secure-by-default by treating all documents as **untrusted**
|
12
|
+
- be a **thin-as-reasonable layer** on top of the underlying parsers, and don't attempt to fix behavioral differences between the parsers
|
8
13
|
|
9
14
|
|
10
|
-
##
|
15
|
+
## Features Overview
|
11
16
|
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
17
|
+
- DOM Parser for XML and HTML4
|
18
|
+
- SAX Parser for XML and HTML4
|
19
|
+
- Push Parser for XML and HTML4
|
20
|
+
- Document search via XPath 1.0
|
21
|
+
- Document search via CSS3 selectors, with some jquery-like extensions
|
22
|
+
- XSD Schema validation
|
23
|
+
- XSLT transformation
|
24
|
+
- "Builder" DSL for XML and HTML documents
|
19
25
|
|
20
26
|
|
21
27
|
## Status
|
22
28
|
|
23
|
-
[![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/
|
29
|
+
[![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/cruby-2.7/badge)](https://ci.nokogiri.org/teams/nokogiri-core/pipelines/nokogiri)
|
24
30
|
[![Appveyor CI](https://ci.appveyor.com/api/projects/status/xj2pqwvlxwuwgr06/branch/master?svg=true)](https://ci.appveyor.com/project/flavorjones/nokogiri/branch/master)
|
25
31
|
[![Code Climate](https://codeclimate.com/github/sparklemotion/nokogiri.svg)](https://codeclimate.com/github/sparklemotion/nokogiri)
|
26
32
|
[![Test Coverage](https://api.codeclimate.com/v1/badges/59c67b0e8976027a45ad/test_coverage)](https://codeclimate.com/github/sparklemotion/nokogiri/test_coverage)
|
27
33
|
|
28
34
|
[![Gem Version](https://badge.fury.io/rb/nokogiri.svg)](https://rubygems.org/gems/nokogiri)
|
29
|
-
[![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score
|
30
|
-
[![Tidelift dependencies](https://tidelift.com/badges/
|
35
|
+
[![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
|
36
|
+
[![Tidelift dependencies](https://tidelift.com/badges/package/rubygems/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
|
31
37
|
|
32
38
|
|
33
|
-
##
|
39
|
+
## Support and Help
|
34
40
|
|
35
|
-
|
36
|
-
* XML/HTML SAX parser
|
37
|
-
* XML/HTML Push parser
|
38
|
-
* XPath 1.0 support for document searching
|
39
|
-
* CSS3 selector support for document searching
|
40
|
-
* XML/HTML builder
|
41
|
-
* XSLT transformer
|
41
|
+
All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
|
42
42
|
|
43
|
-
|
44
|
-
or Java, depending on your Ruby), which means it's fast and
|
45
|
-
standards-compliant.
|
43
|
+
### Reading
|
46
44
|
|
45
|
+
Your first stops for API documentation should be:
|
47
46
|
|
48
|
-
|
47
|
+
- RDocs: https://nokogiri.org/rdoc/index.html
|
48
|
+
- An excellent community-maintained [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet)
|
49
49
|
|
50
|
-
If this doesn't work:
|
51
50
|
|
52
|
-
|
53
|
-
gem install nokogiri
|
54
|
-
```
|
51
|
+
### Questions
|
55
52
|
|
56
|
-
|
53
|
+
If you'd like to talk to a human:
|
57
54
|
|
58
|
-
|
55
|
+
- The Discord chat channel is `#nokogiri-💎` at https://discord.gg/UyQnKrT
|
56
|
+
- The Gitter chat channel is https://gitter.im/sparklemotion/nokogiri
|
57
|
+
- The IRC chat channel is `#nokogiri` on freenode.
|
58
|
+
- The Nokogiri mailing list is active at https://groups.google.com/group/nokogiri-talk
|
59
|
+
- The Nokogiri bug tracker is at https://github.com/sparklemotion/nokogiri/issues
|
59
60
|
|
60
|
-
|
61
|
-
installation. The vast majority of them are out of date and therefore
|
62
|
-
incorrect. __Please do not use Stack Overflow.__
|
61
|
+
Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
|
63
62
|
|
64
|
-
|
65
|
-
when the above instructions don't work for you. This allows us to both
|
66
|
-
help you directly and improve the documentation.
|
63
|
+
[tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
|
67
64
|
|
68
65
|
|
69
|
-
###
|
66
|
+
### Security and Vulnerability Reporting
|
70
67
|
|
71
|
-
|
68
|
+
Please report vulnerabilities at https://hackerone.com/nokogiri
|
72
69
|
|
73
|
-
|
74
|
-
* SuSE: https://download.opensuse.org/repositories/devel:/languages:/ruby:/extensions/
|
75
|
-
* Fedora: http://s390.koji.fedoraproject.org/koji/packageinfo?packageID=6756
|
70
|
+
A full description of our security policy is in [`SECURITY.md`](SECURITY.md).
|
76
71
|
|
77
72
|
|
78
|
-
|
73
|
+
### Semantic Versioning Policy
|
79
74
|
|
80
|
-
|
75
|
+
Nokogiri follows [Semantic Versioning](https://semver.org/) (since 2017 or so). [![Dependabot's SemVer compatibility score for Nokogiri](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
|
81
76
|
|
82
|
-
|
83
|
-
* The Nokogiri bug tracker is here: https://github.com/sparklemotion/nokogiri/issues
|
84
|
-
* Before filing a bug report, please read our submission guidelines: http://nokogiri.org/tutorials/getting_help.html
|
85
|
-
* The IRC channel is `#nokogiri` on freenode.
|
86
|
-
* The project's GitHub wiki has an excellent community-maintained [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet) which might be useful.
|
77
|
+
We bump `Major.Minor.Patch` versions following this guidance:
|
87
78
|
|
88
|
-
|
79
|
+
`Major`: (we've never done this)
|
89
80
|
|
90
|
-
|
81
|
+
- Significant backwards-incompatible changes to the public API that would require rewriting existing application code.
|
82
|
+
- Some examples of backwards-incompatible changes we might someday consider for a Major release are at [`ROADMAP.md`](ROADMAP.md).
|
91
83
|
|
84
|
+
`Minor`:
|
92
85
|
|
93
|
-
|
86
|
+
- Features and bugfixes.
|
87
|
+
- Updating packaged libraries for non-security-related reasons.
|
88
|
+
- Dropping support for EOLed Ruby versions. [Some folks find this objectionable](https://github.com/sparklemotion/nokogiri/issues/1568), but [SemVer says this is OK if the public API hasn't changed](https://semver.org/#what-should-i-do-if-i-update-my-own-dependencies-without-changing-the-public-api).
|
89
|
+
- Backwards-incompatible changes to internal or private methods and constants. These are detailed in the "Changes" section of each changelog entry.
|
94
90
|
|
95
|
-
|
91
|
+
`Patch`:
|
96
92
|
|
97
|
-
|
93
|
+
- Bugfixes.
|
94
|
+
- Security updates.
|
95
|
+
- Updating packaged libraries for security-related reasons.
|
98
96
|
|
99
97
|
|
100
|
-
##
|
98
|
+
## Installation
|
99
|
+
|
100
|
+
Requirements:
|
101
|
+
|
102
|
+
- Ruby >= 2.5
|
103
|
+
- JRuby >= 9.2.0.0
|
104
|
+
|
105
|
+
|
106
|
+
### Native Gems: Faster, more reliable installation
|
107
|
+
|
108
|
+
"Native gems" contain pre-compiled libraries for a specific machine architecture. On supported platforms, this removes the need for compiling the C extension and the packaged libraries, or for system dependencies to exist. This results in **much faster installation** and **more reliable installation**, which as you probably know are the biggest headaches for Nokogiri users.
|
109
|
+
|
110
|
+
### Supported Platforms
|
111
|
+
|
112
|
+
As of v1.11.0, Nokogiri ships pre-compiled, "native" gems for the following platforms:
|
113
|
+
|
114
|
+
- Linux: `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`), including musl platforms like Alpine
|
115
|
+
- Darwin/MacOS: `x86_64-darwin` and `arm64-darwin`
|
116
|
+
- Windows: `x86-mingw32` and `x64-mingw32`
|
117
|
+
- Java: any platform running JRuby 9.2 or higher
|
118
|
+
|
119
|
+
To determine whether your system supports one of these gems, look at the output of `bundle platform` or `ruby -e 'puts Gem::Platform.local.to_s'`.
|
120
|
+
|
121
|
+
If you're on a supported platform, either `gem install` or `bundle install` should install a native gem without any additional action on your part. This installation should only take a few seconds, and your output should look something like:
|
122
|
+
|
123
|
+
``` sh
|
124
|
+
$ gem install nokogiri
|
125
|
+
Fetching nokogiri-1.11.0-x86_64-linux.gem
|
126
|
+
Successfully installed nokogiri-1.11.0-x86_64-linux
|
127
|
+
1 gem installed
|
128
|
+
```
|
129
|
+
|
130
|
+
|
131
|
+
### Other Installation Options
|
132
|
+
|
133
|
+
Because Nokogiri is a C extension, it requires that you have a C compiler toolchain, Ruby development header files, and some system dependencies installed.
|
134
|
+
|
135
|
+
The following may work for you if you have an appropriately-configured system:
|
136
|
+
|
137
|
+
``` bash
|
138
|
+
gem install nokogiri
|
139
|
+
```
|
101
140
|
|
102
|
-
|
141
|
+
If you have any issues, please visit [Installing Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) for more complete instructions and troubleshooting.
|
142
|
+
|
143
|
+
|
144
|
+
## How To Use Nokogiri
|
145
|
+
|
146
|
+
Nokogiri is a large library, and so it's challenging to briefly summarize it. We've tried to provide long, real-world examples at [Tutorials](https://nokogiri.org/tutorials/toc.html).
|
147
|
+
|
148
|
+
### Parsing and Querying
|
149
|
+
|
150
|
+
Here is example usage for parsing and querying a document:
|
103
151
|
|
104
152
|
```ruby
|
105
153
|
#! /usr/bin/env ruby
|
@@ -108,51 +156,26 @@ require 'nokogiri'
|
|
108
156
|
require 'open-uri'
|
109
157
|
|
110
158
|
# Fetch and parse HTML document
|
111
|
-
doc = Nokogiri::HTML(open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
|
159
|
+
doc = Nokogiri::HTML(URI.open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
|
112
160
|
|
113
|
-
|
161
|
+
# Search for nodes by css
|
114
162
|
doc.css('nav ul.menu li a', 'article h2').each do |link|
|
115
163
|
puts link.content
|
116
164
|
end
|
117
165
|
|
118
|
-
|
166
|
+
# Search for nodes by xpath
|
119
167
|
doc.xpath('//nav//ul//li/a', '//article//h2').each do |link|
|
120
168
|
puts link.content
|
121
169
|
end
|
122
170
|
|
123
|
-
|
171
|
+
# Or mix and match
|
124
172
|
doc.search('nav ul.menu li a', '//article//h2').each do |link|
|
125
173
|
puts link.content
|
126
174
|
end
|
127
175
|
```
|
128
176
|
|
129
177
|
|
130
|
-
|
131
|
-
|
132
|
-
* Ruby 2.3.0 or higher, including any development packages necessary
|
133
|
-
to compile native extensions.
|
134
|
-
|
135
|
-
* In Nokogiri 1.6.0 and later libxml2 and libxslt are bundled with the
|
136
|
-
gem, but if you want to use the system versions:
|
137
|
-
|
138
|
-
* First, check out [the long list](http://www.xmlsoft.org/news.html)
|
139
|
-
of fixes and changes between releases before deciding to use any
|
140
|
-
version older than is bundled with Nokogiri.
|
141
|
-
|
142
|
-
* At install time, set the environment variable
|
143
|
-
`NOKOGIRI_USE_SYSTEM_LIBRARIES` or else use the
|
144
|
-
`--use-system-libraries` argument. (See
|
145
|
-
https://nokogiri.org/tutorials/installing_nokogiri.html#install-with-system-libraries
|
146
|
-
for specifics.)
|
147
|
-
|
148
|
-
* libxml2 >=2.6.21 with iconv support
|
149
|
-
(libxml2-dev/-devel is also required)
|
150
|
-
|
151
|
-
* libxslt, built with and supported by the given libxml2
|
152
|
-
(libxslt-dev/-devel is also required)
|
153
|
-
|
154
|
-
|
155
|
-
## Encoding
|
178
|
+
### Encoding
|
156
179
|
|
157
180
|
Strings are always stored as UTF-8 internally. Methods that return
|
158
181
|
text values will always return UTF-8 encoded strings. Methods that
|
@@ -178,11 +201,43 @@ explicitly setting the encoding to EUC-JP on the parser:
|
|
178
201
|
```
|
179
202
|
|
180
203
|
|
181
|
-
##
|
204
|
+
## Technical Overview
|
205
|
+
|
206
|
+
### Guiding Principles
|
207
|
+
|
208
|
+
As noted above, two guiding principles of the software are:
|
209
|
+
|
210
|
+
- be secure-by-default by treating all documents as **untrusted**
|
211
|
+
- be a **thin-as-reasonable layer** on top of the underlying parsers, and don't attempt to fix behavioral differences between the parsers
|
212
|
+
|
213
|
+
Notably, despite all parsers being standards-compliant, there are behavioral inconsistencies between the parsers used in the CRuby and JRuby implementations, and Nokogiri does not and should not attempt to remove these inconsistencies. Instead, we surface these differences in the test suite when they are important/semantic; or we intentionally write tests to depend only on the important/semantic bits (omitting whitespace from regex matchers on results, for example).
|
214
|
+
|
215
|
+
|
216
|
+
### CRuby
|
217
|
+
|
218
|
+
The Ruby (a.k.a., CRuby, MRI, YARV) implementation is a C extension that depends on libxml2 and libxslt (which in turn depend on zlib and possibly libiconv).
|
219
|
+
|
220
|
+
These dependencies are met by default by Nokogiri's packaged versions of the libxml2 and libxslt source code, but a configuration option `--use-system-libraries` is provided to allow specification of alternative library locations. See [Installing Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) for full documentation.
|
221
|
+
|
222
|
+
We provide native gems by pre-compiling libxml2 and libxslt (and potentially zlib and libiconv) and packaging them into the gem file. In this case, no compilation is necessary at installation time, which leads to faster and more reliable installation.
|
223
|
+
|
224
|
+
See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems.
|
225
|
+
|
226
|
+
|
227
|
+
### JRuby
|
228
|
+
|
229
|
+
The Java (a.k.a. JRuby) implementation is a Java extension that depends primarily on Xerces and NekoHTML for parsing, though additional dependencies are on `isorelax`, `nekodtd`, `jing`, `serializer`, `xalan-j`, and `xml-apis`.
|
230
|
+
|
231
|
+
These dependencies are provided by pre-compiled jar files packaged in the `java` platform gem.
|
232
|
+
|
233
|
+
See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems.
|
234
|
+
|
235
|
+
|
236
|
+
## Contributing
|
182
237
|
|
183
238
|
```bash
|
184
|
-
|
185
|
-
|
239
|
+
bundle install
|
240
|
+
bundle exec rake compile test
|
186
241
|
```
|
187
242
|
|
188
243
|
|
@@ -196,3 +251,22 @@ We've adopted the Contributor Covenant code of conduct, which you can read in fu
|
|
196
251
|
This project is licensed under the terms of the MIT license.
|
197
252
|
|
198
253
|
See this license at [`LICENSE.md`](LICENSE.md).
|
254
|
+
|
255
|
+
|
256
|
+
### Dependencies
|
257
|
+
|
258
|
+
Some additional libraries may be distributed with your version of Nokogiri. Please see [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for a discussion of the variations as well as the licenses thereof.
|
259
|
+
|
260
|
+
|
261
|
+
## Authors
|
262
|
+
|
263
|
+
- Mike Dalessio
|
264
|
+
- Aaron Patterson
|
265
|
+
- Yoko Harada
|
266
|
+
- Akinori MUSHA
|
267
|
+
- John Shahid
|
268
|
+
- Karol Bucek
|
269
|
+
- Lars Kanis
|
270
|
+
- Sergio Arbeo
|
271
|
+
- Timothy Elliott
|
272
|
+
- Nobuyoshi Nakada
|
@@ -36,7 +36,6 @@ import org.jruby.Ruby;
|
|
36
36
|
import org.jruby.RubyClass;
|
37
37
|
import org.jruby.anno.JRubyClass;
|
38
38
|
import org.jruby.anno.JRubyMethod;
|
39
|
-
import org.jruby.runtime.Arity;
|
40
39
|
import org.jruby.runtime.Helpers;
|
41
40
|
import org.jruby.runtime.ThreadContext;
|
42
41
|
import org.jruby.runtime.builtin.IRubyObject;
|
@@ -48,6 +47,8 @@ import org.w3c.dom.NodeList;
|
|
48
47
|
|
49
48
|
import nokogiri.internals.HtmlDomParserContext;
|
50
49
|
|
50
|
+
import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
|
51
|
+
|
51
52
|
/**
|
52
53
|
* Class for Nokogiri::HTML::Document.
|
53
54
|
*
|
@@ -65,21 +66,25 @@ public class HtmlDocument extends XmlDocument {
|
|
65
66
|
public HtmlDocument(Ruby ruby, RubyClass klazz) {
|
66
67
|
super(ruby, klazz);
|
67
68
|
}
|
68
|
-
|
69
|
+
|
70
|
+
public HtmlDocument(Ruby runtime, Document document) {
|
71
|
+
this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document);
|
72
|
+
}
|
73
|
+
|
69
74
|
public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
|
70
75
|
super(ruby, klazz, doc);
|
71
76
|
}
|
72
77
|
|
73
78
|
@JRubyMethod(name="new", meta = true, rest = true, required=0)
|
74
|
-
public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz,
|
75
|
-
|
79
|
+
public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) {
|
80
|
+
final Ruby runtime = context.runtime;
|
76
81
|
HtmlDocument htmlDocument;
|
77
82
|
try {
|
78
|
-
Document docNode = createNewDocument();
|
79
|
-
htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(
|
80
|
-
htmlDocument.setDocumentNode(context, docNode);
|
83
|
+
Document docNode = createNewDocument(runtime);
|
84
|
+
htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz);
|
85
|
+
htmlDocument.setDocumentNode(context.runtime, docNode);
|
81
86
|
} catch (Exception ex) {
|
82
|
-
throw
|
87
|
+
throw asRuntimeError(runtime, "couldn't create document: ", ex);
|
83
88
|
}
|
84
89
|
|
85
90
|
Helpers.invoke(context, htmlDocument, "initialize", args);
|
@@ -109,46 +114,29 @@ public class HtmlDocument extends XmlDocument {
|
|
109
114
|
return internalSubset;
|
110
115
|
}
|
111
116
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
Arity.checkArgumentCount(ruby, args, 4, 4);
|
117
|
-
HtmlDomParserContext ctx =
|
118
|
-
new HtmlDomParserContext(ruby, args[2], args[3]);
|
119
|
-
ctx.setInputSource(context, args[0], args[1]);
|
120
|
-
return ctx.parse(context, klass, args[1]);
|
121
|
-
}
|
122
|
-
|
123
|
-
public void setDocumentNode(ThreadContext context, Node node) {
|
124
|
-
super.setNode(context, node);
|
125
|
-
Ruby runtime = context.getRuntime();
|
126
|
-
if (node != null) {
|
127
|
-
Document document = (Document)node;
|
128
|
-
document.normalize();
|
129
|
-
stabilzeAttrValue(document.getDocumentElement());
|
130
|
-
}
|
117
|
+
@Override
|
118
|
+
void init(Ruby runtime, Document document) {
|
119
|
+
stabilizeTextContent(document);
|
120
|
+
document.normalize();
|
131
121
|
setInstanceVariable("@decorators", runtime.getNil());
|
122
|
+
if (document.getDocumentElement() != null) {
|
123
|
+
stabilizeAttrs(document.getDocumentElement());
|
124
|
+
}
|
132
125
|
}
|
133
|
-
|
134
|
-
private void
|
135
|
-
if (node == null) return;
|
126
|
+
|
127
|
+
private static void stabilizeAttrs(Node node) {
|
136
128
|
if (node.hasAttributes()) {
|
137
129
|
NamedNodeMap nodeMap = node.getAttributes();
|
138
130
|
for (int i=0; i<nodeMap.getLength(); i++) {
|
139
131
|
Node n = nodeMap.item(i);
|
140
132
|
if (n instanceof Attr) {
|
141
|
-
|
142
|
-
String attrName = attr.getName();
|
143
|
-
// not sure, but need to get value always before document is referred.
|
144
|
-
// or lose attribute value
|
145
|
-
String attrValue = attr.getValue(); // don't delete this line
|
133
|
+
stabilizeAttr((Attr) n);
|
146
134
|
}
|
147
135
|
}
|
148
136
|
}
|
149
137
|
NodeList children = node.getChildNodes();
|
150
138
|
for (int i=0; i<children.getLength(); i++) {
|
151
|
-
|
139
|
+
stabilizeAttrs(children.item(i));
|
152
140
|
}
|
153
141
|
}
|
154
142
|
|
@@ -167,11 +155,11 @@ public class HtmlDocument extends XmlDocument {
|
|
167
155
|
* Read the HTML document from +io+ with given +url+, +encoding+,
|
168
156
|
* and +options+. See Nokogiri::HTML.parse
|
169
157
|
*/
|
170
|
-
@JRubyMethod(meta = true,
|
171
|
-
public static IRubyObject read_io(ThreadContext context,
|
172
|
-
|
173
|
-
|
174
|
-
return
|
158
|
+
@JRubyMethod(meta = true, required = 4)
|
159
|
+
public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
|
160
|
+
HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
|
161
|
+
ctx.setIOInputSource(context, args[0], args[1]);
|
162
|
+
return ctx.parse(context, (RubyClass) klass, args[1]);
|
175
163
|
}
|
176
164
|
|
177
165
|
/*
|
@@ -181,10 +169,10 @@ public class HtmlDocument extends XmlDocument {
|
|
181
169
|
* Read the HTML document contained in +string+ with given +url+, +encoding+,
|
182
170
|
* and +options+. See Nokogiri::HTML.parse
|
183
171
|
*/
|
184
|
-
@JRubyMethod(meta = true,
|
185
|
-
public static IRubyObject read_memory(ThreadContext context,
|
186
|
-
|
187
|
-
|
188
|
-
return
|
172
|
+
@JRubyMethod(meta = true, required = 4)
|
173
|
+
public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
|
174
|
+
HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
|
175
|
+
ctx.setStringInputSource(context, args[0], args[1]);
|
176
|
+
return ctx.parse(context, (RubyClass) klass, args[1]);
|
189
177
|
}
|
190
178
|
}
|