nokogiri 1.10.6-java → 1.11.0-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (162)
  1. checksums.yaml +4 -4
  2. data/Gemfile +3 -0
  3. data/LICENSE-DEPENDENCIES.md +1015 -947
  4. data/README.md +165 -91
  5. data/ext/java/nokogiri/HtmlDocument.java +34 -46
  6. data/ext/java/nokogiri/HtmlSaxParserContext.java +88 -58
  7. data/ext/java/nokogiri/HtmlSaxPushParser.java +1 -1
  8. data/ext/java/nokogiri/NokogiriService.java +1 -1
  9. data/ext/java/nokogiri/XmlAttr.java +13 -20
  10. data/ext/java/nokogiri/XmlAttributeDecl.java +11 -12
  11. data/ext/java/nokogiri/XmlCdata.java +3 -4
  12. data/ext/java/nokogiri/XmlComment.java +1 -1
  13. data/ext/java/nokogiri/XmlDocument.java +148 -175
  14. data/ext/java/nokogiri/XmlDocumentFragment.java +13 -31
  15. data/ext/java/nokogiri/XmlDtd.java +5 -8
  16. data/ext/java/nokogiri/XmlElement.java +1 -20
  17. data/ext/java/nokogiri/XmlElementDecl.java +23 -28
  18. data/ext/java/nokogiri/XmlEntityDecl.java +23 -27
  19. data/ext/java/nokogiri/XmlEntityReference.java +2 -2
  20. data/ext/java/nokogiri/XmlNamespace.java +72 -89
  21. data/ext/java/nokogiri/XmlNode.java +303 -406
  22. data/ext/java/nokogiri/XmlNodeSet.java +72 -77
  23. data/ext/java/nokogiri/XmlReader.java +12 -13
  24. data/ext/java/nokogiri/XmlRelaxng.java +10 -3
  25. data/ext/java/nokogiri/XmlSaxParserContext.java +15 -10
  26. data/ext/java/nokogiri/XmlSchema.java +87 -27
  27. data/ext/java/nokogiri/XmlSyntaxError.java +2 -6
  28. data/ext/java/nokogiri/XmlText.java +12 -9
  29. data/ext/java/nokogiri/XmlXpathContext.java +55 -25
  30. data/ext/java/nokogiri/XsltStylesheet.java +7 -15
  31. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +52 -46
  32. data/ext/java/nokogiri/internals/NokogiriHandler.java +1 -1
  33. data/ext/java/nokogiri/internals/NokogiriHelpers.java +71 -135
  34. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +90 -58
  35. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +9 -2
  36. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +67 -10
  37. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +4 -2
  38. data/ext/java/nokogiri/internals/ParserContext.java +27 -73
  39. data/ext/java/nokogiri/internals/ReaderNode.java +2 -4
  40. data/ext/java/nokogiri/internals/XmlDomParserContext.java +18 -33
  41. data/ext/nokogiri/depend +476 -357
  42. data/ext/nokogiri/extconf.rb +507 -357
  43. data/ext/nokogiri/html_document.c +79 -78
  44. data/ext/nokogiri/html_sax_parser_context.c +2 -2
  45. data/ext/nokogiri/nokogiri.c +34 -40
  46. data/ext/nokogiri/xml_document.c +18 -4
  47. data/ext/nokogiri/xml_io.c +8 -6
  48. data/ext/nokogiri/xml_node.c +21 -1
  49. data/ext/nokogiri/xml_node_set.c +1 -1
  50. data/ext/nokogiri/xml_reader.c +6 -17
  51. data/ext/nokogiri/xml_relax_ng.c +29 -11
  52. data/ext/nokogiri/xml_sax_parser.c +2 -7
  53. data/ext/nokogiri/xml_sax_parser_context.c +2 -2
  54. data/ext/nokogiri/xml_schema.c +84 -13
  55. data/ext/nokogiri/xml_xpath_context.c +80 -4
  56. data/ext/nokogiri/xslt_stylesheet.c +1 -8
  57. data/lib/nokogiri.rb +22 -22
  58. data/lib/nokogiri/css.rb +1 -0
  59. data/lib/nokogiri/css/node.rb +1 -0
  60. data/lib/nokogiri/css/parser.rb +63 -62
  61. data/lib/nokogiri/css/parser.y +2 -2
  62. data/lib/nokogiri/css/parser_extras.rb +39 -36
  63. data/lib/nokogiri/css/syntax_error.rb +1 -0
  64. data/lib/nokogiri/css/tokenizer.rb +1 -0
  65. data/lib/nokogiri/css/xpath_visitor.rb +73 -43
  66. data/lib/nokogiri/decorators/slop.rb +1 -0
  67. data/lib/nokogiri/html.rb +1 -0
  68. data/lib/nokogiri/html/builder.rb +1 -0
  69. data/lib/nokogiri/html/document.rb +13 -26
  70. data/lib/nokogiri/html/document_fragment.rb +1 -0
  71. data/lib/nokogiri/html/element_description.rb +1 -0
  72. data/lib/nokogiri/html/element_description_defaults.rb +1 -0
  73. data/lib/nokogiri/html/entity_lookup.rb +1 -0
  74. data/lib/nokogiri/html/sax/parser.rb +1 -0
  75. data/lib/nokogiri/html/sax/parser_context.rb +1 -0
  76. data/lib/nokogiri/html/sax/push_parser.rb +1 -0
  77. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  78. data/lib/nokogiri/nokogiri.jar +0 -0
  79. data/lib/nokogiri/syntax_error.rb +1 -0
  80. data/lib/nokogiri/version.rb +3 -109
  81. data/lib/nokogiri/version/constant.rb +5 -0
  82. data/lib/nokogiri/version/info.rb +182 -0
  83. data/lib/nokogiri/xml.rb +1 -0
  84. data/lib/nokogiri/xml/attr.rb +1 -0
  85. data/lib/nokogiri/xml/attribute_decl.rb +1 -0
  86. data/lib/nokogiri/xml/builder.rb +3 -2
  87. data/lib/nokogiri/xml/cdata.rb +1 -0
  88. data/lib/nokogiri/xml/character_data.rb +1 -0
  89. data/lib/nokogiri/xml/document.rb +20 -15
  90. data/lib/nokogiri/xml/document_fragment.rb +5 -6
  91. data/lib/nokogiri/xml/dtd.rb +1 -0
  92. data/lib/nokogiri/xml/element_content.rb +1 -0
  93. data/lib/nokogiri/xml/element_decl.rb +1 -0
  94. data/lib/nokogiri/xml/entity_decl.rb +1 -0
  95. data/lib/nokogiri/xml/entity_reference.rb +1 -0
  96. data/lib/nokogiri/xml/namespace.rb +1 -0
  97. data/lib/nokogiri/xml/node.rb +587 -249
  98. data/lib/nokogiri/xml/node/save_options.rb +1 -0
  99. data/lib/nokogiri/xml/node_set.rb +1 -0
  100. data/lib/nokogiri/xml/notation.rb +1 -0
  101. data/lib/nokogiri/xml/parse_options.rb +10 -3
  102. data/lib/nokogiri/xml/pp.rb +1 -0
  103. data/lib/nokogiri/xml/pp/character_data.rb +1 -0
  104. data/lib/nokogiri/xml/pp/node.rb +1 -0
  105. data/lib/nokogiri/xml/processing_instruction.rb +1 -0
  106. data/lib/nokogiri/xml/reader.rb +7 -3
  107. data/lib/nokogiri/xml/relax_ng.rb +7 -2
  108. data/lib/nokogiri/xml/sax.rb +1 -0
  109. data/lib/nokogiri/xml/sax/document.rb +1 -0
  110. data/lib/nokogiri/xml/sax/parser.rb +1 -0
  111. data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
  112. data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
  113. data/lib/nokogiri/xml/schema.rb +13 -4
  114. data/lib/nokogiri/xml/searchable.rb +25 -16
  115. data/lib/nokogiri/xml/syntax_error.rb +1 -0
  116. data/lib/nokogiri/xml/text.rb +1 -0
  117. data/lib/nokogiri/xml/xpath.rb +1 -0
  118. data/lib/nokogiri/xml/xpath/syntax_error.rb +1 -0
  119. data/lib/nokogiri/xml/xpath_context.rb +1 -0
  120. data/lib/nokogiri/xslt.rb +1 -0
  121. data/lib/nokogiri/xslt/stylesheet.rb +1 -0
  122. data/lib/xsd/xmlparser/nokogiri.rb +1 -0
  123. metadata +92 -157
  124. data/ext/java/nokogiri/internals/NokogiriEncodingReaderWrapper.java +0 -107
  125. data/ext/java/nokogiri/internals/UncloseableInputStream.java +0 -102
  126. data/ext/nokogiri/html_document.h +0 -10
  127. data/ext/nokogiri/html_element_description.h +0 -10
  128. data/ext/nokogiri/html_entity_lookup.h +0 -8
  129. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  130. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  131. data/ext/nokogiri/nokogiri.h +0 -121
  132. data/ext/nokogiri/xml_attr.h +0 -9
  133. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  134. data/ext/nokogiri/xml_cdata.h +0 -9
  135. data/ext/nokogiri/xml_comment.h +0 -9
  136. data/ext/nokogiri/xml_document.h +0 -23
  137. data/ext/nokogiri/xml_document_fragment.h +0 -10
  138. data/ext/nokogiri/xml_dtd.h +0 -10
  139. data/ext/nokogiri/xml_element_content.h +0 -10
  140. data/ext/nokogiri/xml_element_decl.h +0 -9
  141. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  142. data/ext/nokogiri/xml_entity_decl.h +0 -10
  143. data/ext/nokogiri/xml_entity_reference.h +0 -9
  144. data/ext/nokogiri/xml_io.h +0 -11
  145. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  146. data/ext/nokogiri/xml_namespace.h +0 -14
  147. data/ext/nokogiri/xml_node.h +0 -13
  148. data/ext/nokogiri/xml_node_set.h +0 -12
  149. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  150. data/ext/nokogiri/xml_reader.h +0 -10
  151. data/ext/nokogiri/xml_relax_ng.h +0 -9
  152. data/ext/nokogiri/xml_sax_parser.h +0 -39
  153. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  154. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  155. data/ext/nokogiri/xml_schema.h +0 -9
  156. data/ext/nokogiri/xml_syntax_error.h +0 -13
  157. data/ext/nokogiri/xml_text.h +0 -9
  158. data/ext/nokogiri/xml_xpath_context.h +0 -10
  159. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  160. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  161. data/patches/libxml2/0002-Remove-script-macro-support.patch +0 -40
  162. data/patches/libxml2/0003-Update-entities-to-remove-handling-of-ssi.patch +0 -44
data/README.md CHANGED
@@ -1,105 +1,153 @@
1
+ <div><img src="https://nokogiri.org/images/nokogiri-serif-black.png" align="right"/></div>
2
+
1
3
  # Nokogiri
2
4
 
3
- ## Description
5
+ Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for reading, writing, modifying, and querying documents. It is fast and standards-compliant by relying on native parsers like libxml2 (C) and xerces (Java).
6
+
7
+ ## Guiding Principles
8
+
9
+ Some guiding principles Nokogiri tries to follow:
4
10
 
5
- Nokogiri (鋸) is an HTML, XML, SAX, and Reader parser. Among
6
- Nokogiri's many features is the ability to search documents via XPath
7
- or CSS3 selectors.
11
+ - be secure-by-default by treating all documents as **untrusted** by default
12
+ - be a **thin-as-reasonable layer** on top of the underlying parsers, and don't attempt to fix behavioral differences between the parsers
8
13
 
9
14
 
10
- ## Links
15
+ ## Features Overview
11
16
 
12
- * https://nokogiri.org
13
- * [Installation Help](https://nokogiri.org/tutorials/installing_nokogiri.html)
14
- * [Tutorials](https://nokogiri.org)
15
- * [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet)
16
- * [GitHub](https://github.com/sparklemotion/nokogiri)
17
- * [Mailing List](https://groups.google.com/group/nokogiri-talk)
18
- * [Chat/Gitter](https://gitter.im/sparklemotion/nokogiri)
17
+ - DOM Parser for XML and HTML4
18
+ - SAX Parser for XML and HTML4
19
+ - Push Parser for XML and HTML4
20
+ - Document search via XPath 1.0
21
+ - Document search via CSS3 selectors, with some jquery-like extensions
22
+ - XSD Schema validation
23
+ - XSLT transformation
24
+ - "Builder" DSL for XML and HTML documents
19
25
 
20
26
 
21
27
  ## Status
22
28
 
23
- [![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/ruby-2.4-system/badge)](https://ci.nokogiri.org/teams/nokogiri-core/pipelines/nokogiri)
29
+ [![Concourse CI](https://ci.nokogiri.org/api/v1/teams/nokogiri-core/pipelines/nokogiri/jobs/cruby-2.7/badge)](https://ci.nokogiri.org/teams/nokogiri-core/pipelines/nokogiri)
24
30
  [![Appveyor CI](https://ci.appveyor.com/api/projects/status/xj2pqwvlxwuwgr06/branch/master?svg=true)](https://ci.appveyor.com/project/flavorjones/nokogiri/branch/master)
25
31
  [![Code Climate](https://codeclimate.com/github/sparklemotion/nokogiri.svg)](https://codeclimate.com/github/sparklemotion/nokogiri)
26
32
  [![Test Coverage](https://api.codeclimate.com/v1/badges/59c67b0e8976027a45ad/test_coverage)](https://codeclimate.com/github/sparklemotion/nokogiri/test_coverage)
27
33
 
28
34
  [![Gem Version](https://badge.fury.io/rb/nokogiri.svg)](https://rubygems.org/gems/nokogiri)
29
- [![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score.html?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)
30
- [![Tidelift dependencies](https://tidelift.com/badges/github/sparklemotion/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
35
+ [![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
36
+ [![Tidelift dependencies](https://tidelift.com/badges/package/rubygems/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
31
37
 
32
38
 
33
- ## Features
39
+ ## Support and Help
34
40
 
35
- * XML/HTML DOM parser which handles broken HTML
36
- * XML/HTML SAX parser
37
- * XML/HTML Push parser
38
- * XPath 1.0 support for document searching
39
- * CSS3 selector support for document searching
40
- * XML/HTML builder
41
- * XSLT transformer
41
+ All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
42
42
 
43
- Nokogiri parses and searches XML/HTML using native libraries (either C
44
- or Java, depending on your Ruby), which means it's fast and
45
- standards-compliant.
43
+ ### Reading
46
44
 
45
+ Your first stops for API documentation should be:
47
46
 
48
- ## Installation
47
+ - RDocs: https://nokogiri.org/rdoc/index.html
48
+ - An excellent community-maintained [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet)
49
49
 
50
- If this doesn't work:
51
50
 
52
- ```
53
- gem install nokogiri
54
- ```
51
+ ### Questions
55
52
 
56
- then please start troubleshooting here:
53
+ If you'd like to talk to a human:
57
54
 
58
- > https://nokogiri.org/tutorials/installing_nokogiri.html
55
+ - The Discord chat channel is `#nokogiri-💎` at https://discord.gg/UyQnKrT
56
+ - The Gitter chat channel is https://gitter.im/sparklemotion/nokogiri
57
+ - The IRC chat channel is `#nokogiri` on freenode.
58
+ - The Nokogiri mailing list is active at https://groups.google.com/group/nokogiri-talk
59
+ - The Nokogiri bug tracker is at https://github.com/sparklemotion/nokogiri/issues
59
60
 
60
- There are currently 1,237 Stack Overflow questions about Nokogiri
61
- installation. The vast majority of them are out of date and therefore
62
- incorrect. __Please do not use Stack Overflow.__
61
+ Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
63
62
 
64
- Instead, [tell us](https://nokogiri.org/tutorials/getting_help.html)
65
- when the above instructions don't work for you. This allows us to both
66
- help you directly and improve the documentation.
63
+ [tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
67
64
 
68
65
 
69
- ### Binary packages
66
+ ### Security and Vulnerability Reporting
70
67
 
71
- Binary packages are available for some distributions.
68
+ Please report vulnerabilities at https://hackerone.com/nokogiri
72
69
 
73
- * Debian: https://packages.debian.org/sid/ruby-nokogiri
74
- * SuSE: https://download.opensuse.org/repositories/devel:/languages:/ruby:/extensions/
75
- * Fedora: http://s390.koji.fedoraproject.org/koji/packageinfo?packageID=6756
70
+ Full information and description of our security policy is in [`SECURITY.md`](SECURITY.md)
76
71
 
77
72
 
78
- ## Support
73
+ ### Semantic Versioning Policy
79
74
 
80
- All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
75
+ Nokogiri follows [Semantic Versioning](https://semver.org/) (since 2017 or so). [![Dependabot's SemVer compatibility score for Nokogiri](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
81
76
 
82
- * The Nokogiri mailing list is active: https://groups.google.com/group/nokogiri-talk
83
- * The Nokogiri bug tracker is here: https://github.com/sparklemotion/nokogiri/issues
84
- * Before filing a bug report, please read our submission guidelines: http://nokogiri.org/tutorials/getting_help.html
85
- * The IRC channel is `#nokogiri` on freenode.
86
- * The project's GitHub wiki has an excellent community-maintained [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet) which might be useful.
77
+ We bump `Major.Minor.Patch` versions following this guidance:
87
78
 
88
- Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
79
+ `Major`: (we've never done this)
89
80
 
90
- [tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
81
+ - Significant backwards-incompatible changes to the public API that would require rewriting existing application code.
82
+ - Some examples of backwards-incompatible changes we might someday consider for a Major release are at [`ROADMAP.md`](ROADMAP.md).
91
83
 
84
+ `Minor`:
92
85
 
93
- ## Security and Vulnerability Reporting
86
+ - Features and bugfixes.
87
+ - Updating packaged libraries for non-security-related reasons.
88
+ - Dropping support for EOLed Ruby versions. [Some folks find this objectionable](https://github.com/sparklemotion/nokogiri/issues/1568), but [SemVer says this is OK if the public API hasn't changed](https://semver.org/#what-should-i-do-if-i-update-my-own-dependencies-without-changing-the-public-api).
89
+ - Backwards-incompatible changes to internal or private methods and constants. These are detailed in the "Changes" section of each changelog entry.
94
90
 
95
- Please report vulnerabilities at https://hackerone.com/nokogiri
91
+ `Patch`:
96
92
 
97
- Full information and description of our security policy is in [`SECURITY.md`](SECURITY.md)
93
+ - Bugfixes.
94
+ - Security updates.
95
+ - Updating packaged libraries for security-related reasons.
98
96
 
99
97
 
100
- ## Synopsis
98
+ ## Installation
99
+
100
+ Requirements:
101
+
102
+ - Ruby >= 2.5
103
+ - JRuby >= 9.2.0.0
104
+
105
+
106
+ ### Native Gems: Faster, more reliable installation
107
+
108
+ "Native gems" contain pre-compiled libraries for a specific machine architecture. On supported platforms, this removes the need for compiling the C extension and the packaged libraries, or for system dependencies to exist. This results in **much faster installation** and **more reliable installation**; slow and unreliable installation are, as you probably know, the biggest headaches for Nokogiri users.
109
+
110
+ ### Supported Platforms
111
+
112
+ As of v1.11.0, Nokogiri ships pre-compiled, "native" gems for the following platforms:
113
+
114
+ - Linux: `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`), including musl platforms like Alpine
115
+ - Darwin/MacOS: `x86_64-darwin` and `arm64-darwin`
116
+ - Windows: `x86-mingw32` and `x64-mingw32`
117
+ - Java: any platform running JRuby 9.2 or higher
118
+
119
+ To determine whether your system supports one of these gems, look at the output of `bundle platform` or `ruby -e 'puts Gem::Platform.local.to_s'`.
120
+
121
+ If you're on a supported platform, either `gem install` or `bundle install` should install a native gem without any additional action on your part. This installation should only take a few seconds, and your output should look something like:
122
+
123
+ ``` sh
124
+ $ gem install nokogiri
125
+ Fetching nokogiri-1.11.0-x86_64-linux.gem
126
+ Successfully installed nokogiri-1.11.0-x86_64-linux
127
+ 1 gem installed
128
+ ```
129
+
130
+
131
+ ### Other Installation Options
132
+
133
+ Because Nokogiri is a C extension, it requires that you have a C compiler toolchain, Ruby development header files, and some system dependencies installed.
134
+
135
+ The following may work for you if you have an appropriately-configured system:
136
+
137
+ ``` bash
138
+ gem install nokogiri
139
+ ```
101
140
 
102
- Nokogiri is a large library, but here is example usage for parsing and examining a document:
141
+ If you have any issues, please visit [Installing Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) for more complete instructions and troubleshooting.
142
+
143
+
144
+ ## How To Use Nokogiri
145
+
146
+ Nokogiri is a large library, and so it's challenging to briefly summarize it. We've tried to provide long, real-world examples at [Tutorials](https://nokogiri.org/tutorials/toc.html).
147
+
148
+ ### Parsing and Querying
149
+
150
+ Here is example usage for parsing and querying a document:
103
151
 
104
152
  ```ruby
105
153
  #! /usr/bin/env ruby
@@ -108,51 +156,26 @@ require 'nokogiri'
108
156
  require 'open-uri'
109
157
 
110
158
  # Fetch and parse HTML document
111
- doc = Nokogiri::HTML(open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
159
+ doc = Nokogiri::HTML(URI.open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
112
160
 
113
- puts "### Search for nodes by css"
161
+ # Search for nodes by css
114
162
  doc.css('nav ul.menu li a', 'article h2').each do |link|
115
163
  puts link.content
116
164
  end
117
165
 
118
- puts "### Search for nodes by xpath"
166
+ # Search for nodes by xpath
119
167
  doc.xpath('//nav//ul//li/a', '//article//h2').each do |link|
120
168
  puts link.content
121
169
  end
122
170
 
123
- puts "### Or mix and match."
171
+ # Or mix and match
124
172
  doc.search('nav ul.menu li a', '//article//h2').each do |link|
125
173
  puts link.content
126
174
  end
127
175
  ```
128
176
 
129
177
 
130
- ## Requirements
131
-
132
- * Ruby 2.3.0 or higher, including any development packages necessary
133
- to compile native extensions.
134
-
135
- * In Nokogiri 1.6.0 and later libxml2 and libxslt are bundled with the
136
- gem, but if you want to use the system versions:
137
-
138
- * First, check out [the long list](http://www.xmlsoft.org/news.html)
139
- of fixes and changes between releases before deciding to use any
140
- version older than is bundled with Nokogiri.
141
-
142
- * At install time, set the environment variable
143
- `NOKOGIRI_USE_SYSTEM_LIBRARIES` or else use the
144
- `--use-system-libraries` argument. (See
145
- https://nokogiri.org/tutorials/installing_nokogiri.html#install-with-system-libraries
146
- for specifics.)
147
-
148
- * libxml2 >=2.6.21 with iconv support
149
- (libxml2-dev/-devel is also required)
150
-
151
- * libxslt, built with and supported by the given libxml2
152
- (libxslt-dev/-devel is also required)
153
-
154
-
155
- ## Encoding
178
+ ### Encoding
156
179
 
157
180
  Strings are always stored as UTF-8 internally. Methods that return
158
181
  text values will always return UTF-8 encoded strings. Methods that
@@ -178,11 +201,43 @@ explicitly setting the encoding to EUC-JP on the parser:
178
201
  ```
179
202
 
180
203
 
181
- ## Development
204
+ ## Technical Overview
205
+
206
+ ### Guiding Principles
207
+
208
+ As noted above, two guiding principles of the software are:
209
+
210
+ - be secure-by-default by treating all documents as **untrusted** by default
211
+ - be a **thin-as-reasonable layer** on top of the underlying parsers, and don't attempt to fix behavioral differences between the parsers
212
+
213
+ Notably, despite all parsers being standards-compliant, there are behavioral inconsistencies between the parsers used in the CRuby and JRuby implementations, and Nokogiri does not and should not attempt to remove these inconsistencies. Instead, we surface these differences in the test suite when they are important/semantic; or we intentionally write tests to depend only on the important/semantic bits (omitting whitespace from regex matchers on results, for example).
214
+
215
+
216
+ ### CRuby
217
+
218
+ The Ruby (a.k.a., CRuby, MRI, YARV) implementation is a C extension that depends on libxml2 and libxslt (which in turn depend on zlib and possibly libiconv).
219
+
220
+ These dependencies are met by default by Nokogiri's packaged versions of the libxml2 and libxslt source code, but a configuration option `--use-system-libraries` is provided to allow specification of alternative library locations. See [Installing Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) for full documentation.
221
+
222
+ We provide native gems by pre-compiling libxml2 and libxslt (and potentially zlib and libiconv) and packaging them into the gem file. In this case, no compilation is necessary at installation time, which leads to faster and more reliable installation.
223
+
224
+ See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems.
225
+
226
+
227
+ ### JRuby
228
+
229
+ The Java (a.k.a. JRuby) implementation is a Java extension that depends primarily on Xerces and NekoHTML for parsing; it additionally depends on `isorelax`, `nekodtd`, `jing`, `serializer`, `xalan-j`, and `xml-apis`.
230
+
231
+ These dependencies are provided by pre-compiled jar files packaged in the `java` platform gem.
232
+
233
+ See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems.
234
+
235
+
236
+ ## Contributing
182
237
 
183
238
  ```bash
184
- bundle install
185
- bundle exec rake compile test
239
+ bundle install
240
+ bundle exec rake compile test
186
241
  ```
187
242
 
188
243
 
@@ -196,3 +251,22 @@ We've adopted the Contributor Covenant code of conduct, which you can read in fu
196
251
  This project is licensed under the terms of the MIT license.
197
252
 
198
253
  See this license at [`LICENSE.md`](LICENSE.md).
254
+
255
+
256
+ ### Dependencies
257
+
258
+ Some additional libraries may be distributed with your version of Nokogiri. Please see [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for a discussion of the variations as well as the licenses thereof.
259
+
260
+
261
+ ## Authors
262
+
263
+ - Mike Dalessio
264
+ - Aaron Patterson
265
+ - Yoko Harada
266
+ - Akinori MUSHA
267
+ - John Shahid
268
+ - Karol Bucek
269
+ - Lars Kanis
270
+ - Sergio Arbeo
271
+ - Timothy Elliott
272
+ - Nobuyoshi Nakada
@@ -36,7 +36,6 @@ import org.jruby.Ruby;
36
36
  import org.jruby.RubyClass;
37
37
  import org.jruby.anno.JRubyClass;
38
38
  import org.jruby.anno.JRubyMethod;
39
- import org.jruby.runtime.Arity;
40
39
  import org.jruby.runtime.Helpers;
41
40
  import org.jruby.runtime.ThreadContext;
42
41
  import org.jruby.runtime.builtin.IRubyObject;
@@ -48,6 +47,8 @@ import org.w3c.dom.NodeList;
48
47
 
49
48
  import nokogiri.internals.HtmlDomParserContext;
50
49
 
50
+ import static nokogiri.internals.NokogiriHelpers.getNokogiriClass;
51
+
51
52
  /**
52
53
  * Class for Nokogiri::HTML::Document.
53
54
  *
@@ -65,21 +66,25 @@ public class HtmlDocument extends XmlDocument {
65
66
  public HtmlDocument(Ruby ruby, RubyClass klazz) {
66
67
  super(ruby, klazz);
67
68
  }
68
-
69
+
70
+ public HtmlDocument(Ruby runtime, Document document) {
71
+ this(runtime, getNokogiriClass(runtime, "Nokogiri::XML::Document"), document);
72
+ }
73
+
69
74
  public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
70
75
  super(ruby, klazz, doc);
71
76
  }
72
77
 
73
78
  @JRubyMethod(name="new", meta = true, rest = true, required=0)
74
- public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz,
75
- IRubyObject[] args) {
79
+ public static IRubyObject rbNew(ThreadContext context, IRubyObject klazz, IRubyObject[] args) {
80
+ final Ruby runtime = context.runtime;
76
81
  HtmlDocument htmlDocument;
77
82
  try {
78
- Document docNode = createNewDocument();
79
- htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(context.getRuntime(), (RubyClass) klazz);
80
- htmlDocument.setDocumentNode(context, docNode);
83
+ Document docNode = createNewDocument(runtime);
84
+ htmlDocument = (HtmlDocument) NokogiriService.HTML_DOCUMENT_ALLOCATOR.allocate(runtime, (RubyClass) klazz);
85
+ htmlDocument.setDocumentNode(context.runtime, docNode);
81
86
  } catch (Exception ex) {
82
- throw context.getRuntime().newRuntimeError("couldn't create document: " + ex);
87
+ throw asRuntimeError(runtime, "couldn't create document: ", ex);
83
88
  }
84
89
 
85
90
  Helpers.invoke(context, htmlDocument, "initialize", args);
@@ -109,46 +114,29 @@ public class HtmlDocument extends XmlDocument {
109
114
  return internalSubset;
110
115
  }
111
116
 
112
- public static IRubyObject do_parse(ThreadContext context,
113
- IRubyObject klass,
114
- IRubyObject[] args) {
115
- Ruby ruby = context.getRuntime();
116
- Arity.checkArgumentCount(ruby, args, 4, 4);
117
- HtmlDomParserContext ctx =
118
- new HtmlDomParserContext(ruby, args[2], args[3]);
119
- ctx.setInputSource(context, args[0], args[1]);
120
- return ctx.parse(context, klass, args[1]);
121
- }
122
-
123
- public void setDocumentNode(ThreadContext context, Node node) {
124
- super.setNode(context, node);
125
- Ruby runtime = context.getRuntime();
126
- if (node != null) {
127
- Document document = (Document)node;
128
- document.normalize();
129
- stabilzeAttrValue(document.getDocumentElement());
130
- }
117
+ @Override
118
+ void init(Ruby runtime, Document document) {
119
+ stabilizeTextContent(document);
120
+ document.normalize();
131
121
  setInstanceVariable("@decorators", runtime.getNil());
122
+ if (document.getDocumentElement() != null) {
123
+ stabilizeAttrs(document.getDocumentElement());
124
+ }
132
125
  }
133
-
134
- private void stabilzeAttrValue(Node node) {
135
- if (node == null) return;
126
+
127
+ private static void stabilizeAttrs(Node node) {
136
128
  if (node.hasAttributes()) {
137
129
  NamedNodeMap nodeMap = node.getAttributes();
138
130
  for (int i=0; i<nodeMap.getLength(); i++) {
139
131
  Node n = nodeMap.item(i);
140
132
  if (n instanceof Attr) {
141
- Attr attr = (Attr)n;
142
- String attrName = attr.getName();
143
- // not sure, but need to get value always before document is referred.
144
- // or lose attribute value
145
- String attrValue = attr.getValue(); // don't delete this line
133
+ stabilizeAttr((Attr) n);
146
134
  }
147
135
  }
148
136
  }
149
137
  NodeList children = node.getChildNodes();
150
138
  for (int i=0; i<children.getLength(); i++) {
151
- stabilzeAttrValue(children.item(i));
139
+ stabilizeAttrs(children.item(i));
152
140
  }
153
141
  }
154
142
 
@@ -167,11 +155,11 @@ public class HtmlDocument extends XmlDocument {
167
155
  * Read the HTML document from +io+ with given +url+, +encoding+,
168
156
  * and +options+. See Nokogiri::HTML.parse
169
157
  */
170
- @JRubyMethod(meta = true, rest = true)
171
- public static IRubyObject read_io(ThreadContext context,
172
- IRubyObject cls,
173
- IRubyObject[] args) {
174
- return do_parse(context, cls, args);
158
+ @JRubyMethod(meta = true, required = 4)
159
+ public static IRubyObject read_io(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
160
+ HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
161
+ ctx.setIOInputSource(context, args[0], args[1]);
162
+ return ctx.parse(context, (RubyClass) klass, args[1]);
175
163
  }
176
164
 
177
165
  /*
@@ -181,10 +169,10 @@ public class HtmlDocument extends XmlDocument {
181
169
  * Read the HTML document contained in +string+ with given +url+, +encoding+,
182
170
  * and +options+. See Nokogiri::HTML.parse
183
171
  */
184
- @JRubyMethod(meta = true, rest = true)
185
- public static IRubyObject read_memory(ThreadContext context,
186
- IRubyObject cls,
187
- IRubyObject[] args) {
188
- return do_parse(context, cls, args);
172
+ @JRubyMethod(meta = true, required = 4)
173
+ public static IRubyObject read_memory(ThreadContext context, IRubyObject klass, IRubyObject[] args) {
174
+ HtmlDomParserContext ctx = new HtmlDomParserContext(context.runtime, args[2], args[3]);
175
+ ctx.setStringInputSource(context, args[0], args[1]);
176
+ return ctx.parse(context, (RubyClass) klass, args[1]);
189
177
  }
190
178
  }