nokogiri 1.8.5 → 1.13.6
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +3 -21
- data/LICENSE-DEPENDENCIES.md +1159 -868
- data/LICENSE.md +5 -28
- data/README.md +196 -90
- data/bin/nokogiri +63 -50
- data/dependencies.yml +13 -59
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +750 -420
- data/ext/nokogiri/gumbo.c +584 -0
- data/ext/nokogiri/html4_document.c +166 -0
- data/ext/nokogiri/html4_element_description.c +294 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +119 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +228 -91
- data/ext/nokogiri/nokogiri.h +191 -89
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +41 -36
- data/ext/nokogiri/xml_attribute_decl.c +18 -18
- data/ext/nokogiri/xml_cdata.c +13 -18
- data/ext/nokogiri/xml_comment.c +19 -26
- data/ext/nokogiri/xml_document.c +291 -216
- data/ext/nokogiri/xml_document_fragment.c +12 -16
- data/ext/nokogiri/xml_dtd.c +56 -50
- data/ext/nokogiri/xml_element_content.c +31 -26
- data/ext/nokogiri/xml_element_decl.c +22 -22
- data/ext/nokogiri/xml_encoding_handler.c +43 -18
- data/ext/nokogiri/xml_entity_decl.c +32 -30
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +61 -52
- data/ext/nokogiri/xml_node.c +1044 -616
- data/ext/nokogiri/xml_node_set.c +174 -162
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +226 -175
- data/ext/nokogiri/xml_relax_ng.c +52 -28
- data/ext/nokogiri/xml_sax_parser.c +112 -112
- data/ext/nokogiri/xml_sax_parser_context.c +112 -86
- data/ext/nokogiri/xml_sax_push_parser.c +36 -27
- data/ext/nokogiri/xml_schema.c +112 -33
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +13 -17
- data/ext/nokogiri/xml_xpath_context.c +223 -115
- data/ext/nokogiri/xslt_stylesheet.c +265 -173
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +101 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +626 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/gumbo.h +943 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +4875 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +222 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +169 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +68 -0
- data/gumbo-parser/src/util.h +30 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +10 -8
- data/lib/nokogiri/css/parser.rb +397 -377
- data/lib/nokogiri/css/parser.y +250 -245
- data/lib/nokogiri/css/parser_extras.rb +54 -49
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +107 -104
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +218 -91
- data/lib/nokogiri/css.rb +50 -17
- data/lib/nokogiri/decorators/slop.rb +9 -7
- data/lib/nokogiri/extension.rb +31 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/{html → html4}/document.rb +103 -105
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
- data/lib/nokogiri/html4.rb +46 -0
- data/lib/nokogiri/html5/document.rb +91 -0
- data/lib/nokogiri/html5/document_fragment.rb +83 -0
- data/lib/nokogiri/html5/node.rb +100 -0
- data/lib/nokogiri/html5.rb +478 -0
- data/lib/nokogiri/jruby/dependencies.rb +21 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +222 -0
- data/lib/nokogiri/version.rb +3 -108
- data/lib/nokogiri/xml/attr.rb +6 -3
- data/lib/nokogiri/xml/attribute_decl.rb +3 -1
- data/lib/nokogiri/xml/builder.rb +97 -53
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +224 -86
- data/lib/nokogiri/xml/document_fragment.rb +57 -44
- data/lib/nokogiri/xml/dtd.rb +4 -2
- data/lib/nokogiri/xml/element_content.rb +2 -0
- data/lib/nokogiri/xml/element_decl.rb +3 -1
- data/lib/nokogiri/xml/entity_decl.rb +4 -2
- data/lib/nokogiri/xml/entity_reference.rb +2 -0
- data/lib/nokogiri/xml/namespace.rb +3 -0
- data/lib/nokogiri/xml/node/save_options.rb +10 -5
- data/lib/nokogiri/xml/node.rb +895 -377
- data/lib/nokogiri/xml/node_set.rb +92 -65
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +22 -8
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +25 -26
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +3 -1
- data/lib/nokogiri/xml/reader.rb +23 -28
- data/lib/nokogiri/xml/relax_ng.rb +8 -2
- data/lib/nokogiri/xml/sax/document.rb +45 -49
- data/lib/nokogiri/xml/sax/parser.rb +38 -34
- data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
- data/lib/nokogiri/xml/sax.rb +6 -4
- data/lib/nokogiri/xml/schema.rb +19 -9
- data/lib/nokogiri/xml/searchable.rb +112 -72
- data/lib/nokogiri/xml/syntax_error.rb +6 -4
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +3 -3
- data/lib/nokogiri/xml.rb +38 -37
- data/lib/nokogiri/xslt/stylesheet.rb +3 -1
- data/lib/nokogiri/xslt.rb +29 -20
- data/lib/nokogiri.rb +49 -65
- data/lib/xsd/xmlparser/nokogiri.rb +26 -24
- data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
- data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
- data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +3040 -0
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
- data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
- metadata +220 -266
- data/.autotest +0 -22
- data/.cross_rubies +0 -8
- data/.editorconfig +0 -17
- data/.gemtest +0 -0
- data/.travis.yml +0 -63
- data/CHANGELOG.md +0 -1368
- data/CONTRIBUTING.md +0 -42
- data/C_CODING_STYLE.rdoc +0 -33
- data/Gemfile-libxml-ruby +0 -3
- data/Manifest.txt +0 -370
- data/ROADMAP.md +0 -111
- data/Rakefile +0 -348
- data/SECURITY.md +0 -19
- data/STANDARD_RESPONSES.md +0 -47
- data/Y_U_NO_GEMSPEC.md +0 -155
- data/appveyor.yml +0 -29
- data/build_all +0 -44
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -15
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document_fragment.rb +0 -49
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/patches/libxml2/0002-Fix-nullptr-deref-with-XPath-logic-ops.patch +0 -54
- data/patches/libxml2/0003-Fix-infinite-loop-in-LZMA-decompression.patch +0 -50
- data/patches/sort-patches-by-date +0 -25
- data/ports/archives/libxml2-2.9.8.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.32.tar.gz +0 -0
- data/suppressions/README.txt +0 -1
- data/suppressions/nokogiri_ruby-2.supp +0 -10
- data/tasks/test.rb +0 -100
- data/test/css/test_nthiness.rb +0 -226
- data/test/css/test_parser.rb +0 -386
- data/test/css/test_tokenizer.rb +0 -215
- data/test/css/test_xpath_visitor.rb +0 -96
- data/test/decorators/test_slop.rb +0 -23
- data/test/files/2ch.html +0 -108
- data/test/files/GH_1042.html +0 -18
- data/test/files/address_book.rlx +0 -12
- data/test/files/address_book.xml +0 -10
- data/test/files/atom.xml +0 -344
- data/test/files/bar/bar.xsd +0 -4
- data/test/files/bogus.xml +0 -0
- data/test/files/dont_hurt_em_why.xml +0 -422
- data/test/files/encoding.html +0 -82
- data/test/files/encoding.xhtml +0 -84
- data/test/files/exslt.xml +0 -8
- data/test/files/exslt.xslt +0 -35
- data/test/files/foo/foo.xsd +0 -4
- data/test/files/metacharset.html +0 -10
- data/test/files/namespace_pressure_test.xml +0 -1684
- data/test/files/noencoding.html +0 -47
- data/test/files/po.xml +0 -32
- data/test/files/po.xsd +0 -66
- data/test/files/saml/saml20assertion_schema.xsd +0 -283
- data/test/files/saml/saml20protocol_schema.xsd +0 -302
- data/test/files/saml/xenc_schema.xsd +0 -146
- data/test/files/saml/xmldsig_schema.xsd +0 -318
- data/test/files/shift_jis.html +0 -10
- data/test/files/shift_jis.xml +0 -5
- data/test/files/shift_jis_no_charset.html +0 -9
- data/test/files/slow-xpath.xml +0 -25509
- data/test/files/snuggles.xml +0 -3
- data/test/files/staff.dtd +0 -10
- data/test/files/staff.xml +0 -59
- data/test/files/staff.xslt +0 -32
- data/test/files/test_document_url/bar.xml +0 -2
- data/test/files/test_document_url/document.dtd +0 -4
- data/test/files/test_document_url/document.xml +0 -6
- data/test/files/tlm.html +0 -851
- data/test/files/to_be_xincluded.xml +0 -2
- data/test/files/valid_bar.xml +0 -2
- data/test/files/xinclude.xml +0 -4
- data/test/helper.rb +0 -271
- data/test/html/sax/test_parser.rb +0 -168
- data/test/html/sax/test_parser_context.rb +0 -46
- data/test/html/sax/test_parser_text.rb +0 -163
- data/test/html/sax/test_push_parser.rb +0 -87
- data/test/html/test_attributes.rb +0 -85
- data/test/html/test_builder.rb +0 -164
- data/test/html/test_document.rb +0 -712
- data/test/html/test_document_encoding.rb +0 -143
- data/test/html/test_document_fragment.rb +0 -310
- data/test/html/test_element_description.rb +0 -105
- data/test/html/test_named_characters.rb +0 -14
- data/test/html/test_node.rb +0 -212
- data/test/html/test_node_encoding.rb +0 -91
- data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
- data/test/namespaces/test_namespaces_aliased_default.rb +0 -24
- data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
- data/test/namespaces/test_namespaces_in_cloned_doc.rb +0 -31
- data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
- data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -80
- data/test/namespaces/test_namespaces_preservation.rb +0 -31
- data/test/test_convert_xpath.rb +0 -135
- data/test/test_css_cache.rb +0 -47
- data/test/test_encoding_handler.rb +0 -48
- data/test/test_memory_leak.rb +0 -156
- data/test/test_nokogiri.rb +0 -138
- data/test/test_soap4r_sax.rb +0 -52
- data/test/test_xslt_transforms.rb +0 -314
- data/test/xml/node/test_save_options.rb +0 -28
- data/test/xml/node/test_subclass.rb +0 -44
- data/test/xml/sax/test_parser.rb +0 -402
- data/test/xml/sax/test_parser_context.rb +0 -115
- data/test/xml/sax/test_parser_text.rb +0 -202
- data/test/xml/sax/test_push_parser.rb +0 -265
- data/test/xml/test_attr.rb +0 -74
- data/test/xml/test_attribute_decl.rb +0 -86
- data/test/xml/test_builder.rb +0 -341
- data/test/xml/test_c14n.rb +0 -180
- data/test/xml/test_cdata.rb +0 -54
- data/test/xml/test_comment.rb +0 -40
- data/test/xml/test_document.rb +0 -982
- data/test/xml/test_document_encoding.rb +0 -31
- data/test/xml/test_document_fragment.rb +0 -298
- data/test/xml/test_dtd.rb +0 -187
- data/test/xml/test_dtd_encoding.rb +0 -31
- data/test/xml/test_element_content.rb +0 -56
- data/test/xml/test_element_decl.rb +0 -73
- data/test/xml/test_entity_decl.rb +0 -122
- data/test/xml/test_entity_reference.rb +0 -262
- data/test/xml/test_namespace.rb +0 -96
- data/test/xml/test_node.rb +0 -1325
- data/test/xml/test_node_attributes.rb +0 -115
- data/test/xml/test_node_encoding.rb +0 -75
- data/test/xml/test_node_inheritance.rb +0 -32
- data/test/xml/test_node_reparenting.rb +0 -592
- data/test/xml/test_node_set.rb +0 -809
- data/test/xml/test_parse_options.rb +0 -64
- data/test/xml/test_processing_instruction.rb +0 -30
- data/test/xml/test_reader.rb +0 -620
- data/test/xml/test_reader_encoding.rb +0 -134
- data/test/xml/test_relax_ng.rb +0 -60
- data/test/xml/test_schema.rb +0 -142
- data/test/xml/test_syntax_error.rb +0 -36
- data/test/xml/test_text.rb +0 -60
- data/test/xml/test_unparented_node.rb +0 -483
- data/test/xml/test_xinclude.rb +0 -83
- data/test/xml/test_xpath.rb +0 -470
- data/test/xslt/test_custom_functions.rb +0 -133
- data/test/xslt/test_exception_handling.rb +0 -37
data/lib/nokogiri.rb
CHANGED
@@ -1,96 +1,67 @@
|
|
1
|
-
#
|
2
|
-
#
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
3
|
|
4
|
-
require
|
4
|
+
require "rbconfig"
|
5
5
|
|
6
6
|
if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby"
|
7
|
-
|
8
|
-
# unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
|
9
|
-
#
|
10
|
-
# However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
|
11
|
-
# an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
|
12
|
-
# of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
|
13
|
-
# should skip loading xml jars. This is because those are in WEB-INF/lib and
|
14
|
-
# already set in the classpath.
|
15
|
-
unless $LOAD_PATH.to_s.include?("appengine-rack")
|
16
|
-
require 'stringio'
|
17
|
-
require 'isorelax.jar'
|
18
|
-
require 'jing.jar'
|
19
|
-
require 'nekohtml.jar'
|
20
|
-
require 'nekodtd.jar'
|
21
|
-
require 'xercesImpl.jar'
|
22
|
-
require 'serializer.jar'
|
23
|
-
require 'xalan.jar'
|
24
|
-
require 'xml-apis.jar'
|
25
|
-
end
|
7
|
+
require_relative "nokogiri/jruby/dependencies"
|
26
8
|
end
|
27
9
|
|
28
|
-
|
29
|
-
RUBY_VERSION =~ /(\d+\.\d+)/
|
30
|
-
require "nokogiri/#{$1}/nokogiri"
|
31
|
-
rescue LoadError
|
32
|
-
require 'nokogiri/nokogiri'
|
33
|
-
end
|
34
|
-
require 'nokogiri/version'
|
35
|
-
require 'nokogiri/syntax_error'
|
36
|
-
require 'nokogiri/xml'
|
37
|
-
require 'nokogiri/xslt'
|
38
|
-
require 'nokogiri/html'
|
39
|
-
require 'nokogiri/decorators/slop'
|
40
|
-
require 'nokogiri/css'
|
41
|
-
require 'nokogiri/html/builder'
|
10
|
+
require_relative "nokogiri/extension"
|
42
11
|
|
43
12
|
# Nokogiri parses and searches XML/HTML very quickly, and also has
|
44
13
|
# correctly implemented CSS3 selector support as well as XPath 1.0
|
45
14
|
# support.
|
46
15
|
#
|
47
16
|
# Parsing a document returns either a Nokogiri::XML::Document, or a
|
48
|
-
# Nokogiri::
|
17
|
+
# Nokogiri::HTML4::Document depending on the kind of document you parse.
|
49
18
|
#
|
50
19
|
# Here is an example:
|
51
20
|
#
|
52
|
-
#
|
53
|
-
#
|
21
|
+
# require 'nokogiri'
|
22
|
+
# require 'open-uri'
|
54
23
|
#
|
55
|
-
#
|
24
|
+
# # Get a Nokogiri::HTML4::Document for the page we’re interested in...
|
56
25
|
#
|
57
|
-
#
|
26
|
+
# doc = Nokogiri::HTML4(URI.open('http://www.google.com/search?q=tenderlove'))
|
58
27
|
#
|
59
|
-
#
|
28
|
+
# # Do funky things with it using Nokogiri::XML::Node methods...
|
60
29
|
#
|
61
|
-
#
|
62
|
-
#
|
63
|
-
#
|
64
|
-
#
|
65
|
-
#
|
30
|
+
# ####
|
31
|
+
# # Search for nodes by css
|
32
|
+
# doc.css('h3.r a.l').each do |link|
|
33
|
+
# puts link.content
|
34
|
+
# end
|
66
35
|
#
|
67
|
-
# See
|
68
|
-
#
|
36
|
+
# See also:
|
37
|
+
#
|
38
|
+
# - Nokogiri::XML::Searchable#css for more information about CSS searching
|
39
|
+
# - Nokogiri::XML::Searchable#xpath for more information about XPath searching
|
69
40
|
module Nokogiri
|
70
41
|
class << self
|
71
42
|
###
|
72
43
|
# Parse an HTML or XML document. +string+ contains the document.
|
73
|
-
def parse
|
44
|
+
def parse(string, url = nil, encoding = nil, options = nil)
|
74
45
|
if string.respond_to?(:read) ||
|
75
|
-
/^\s*<(?:!DOCTYPE\s+)?html[\s>]/i
|
46
|
+
/^\s*<(?:!DOCTYPE\s+)?html[\s>]/i.match?(string[0, 512])
|
76
47
|
# Expect an HTML indicator to appear within the first 512
|
77
48
|
# characters of a document. (<?xml ?> + <?xml-stylesheet ?>
|
78
49
|
# shouldn't be that long)
|
79
|
-
Nokogiri.
|
50
|
+
Nokogiri.HTML4(string, url, encoding,
|
80
51
|
options || XML::ParseOptions::DEFAULT_HTML)
|
81
52
|
else
|
82
53
|
Nokogiri.XML(string, url, encoding,
|
83
54
|
options || XML::ParseOptions::DEFAULT_XML)
|
84
|
-
end.tap
|
55
|
+
end.tap do |doc|
|
85
56
|
yield doc if block_given?
|
86
|
-
|
57
|
+
end
|
87
58
|
end
|
88
59
|
|
89
60
|
###
|
90
61
|
# Create a new Nokogiri::XML::DocumentFragment
|
91
|
-
def make
|
62
|
+
def make(input = nil, opts = {}, &blk)
|
92
63
|
if input
|
93
|
-
Nokogiri::
|
64
|
+
Nokogiri::HTML4.fragment(input).children.first
|
94
65
|
else
|
95
66
|
Nokogiri(&blk)
|
96
67
|
end
|
@@ -115,14 +86,15 @@ module Nokogiri
|
|
115
86
|
Nokogiri(*args, &block).slop!
|
116
87
|
end
|
117
88
|
|
89
|
+
# :nodoc:
|
118
90
|
def install_default_aliases
|
119
91
|
# Make sure to support some popular encoding aliases not known by
|
120
92
|
# all iconv implementations.
|
121
93
|
{
|
122
|
-
|
123
|
-
}.each
|
94
|
+
"Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
|
95
|
+
}.each do |alias_name, name|
|
124
96
|
EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
|
125
|
-
|
97
|
+
end
|
126
98
|
end
|
127
99
|
end
|
128
100
|
|
@@ -130,15 +102,27 @@ module Nokogiri
|
|
130
102
|
end
|
131
103
|
|
132
104
|
###
|
133
|
-
#
|
134
|
-
#
|
135
|
-
# Nokogiri.parse
|
105
|
+
# Parse a document contained in +args+. Nokogiri will try to guess what type of document you are
|
106
|
+
# attempting to parse. For more information, see Nokogiri.parse
|
136
107
|
#
|
137
|
-
# To specify the type of document, use Nokogiri.XML or Nokogiri.
|
108
|
+
# To specify the type of document, use {Nokogiri.XML}, {Nokogiri.HTML4}, or {Nokogiri.HTML5}.
|
138
109
|
def Nokogiri(*args, &block)
|
139
|
-
if
|
140
|
-
Nokogiri::
|
110
|
+
if block
|
111
|
+
Nokogiri::HTML4::Builder.new(&block).doc.root
|
141
112
|
else
|
142
113
|
Nokogiri.parse(*args)
|
143
114
|
end
|
144
115
|
end
|
116
|
+
|
117
|
+
require_relative "nokogiri/version"
|
118
|
+
require_relative "nokogiri/class_resolver"
|
119
|
+
require_relative "nokogiri/syntax_error"
|
120
|
+
require_relative "nokogiri/xml"
|
121
|
+
require_relative "nokogiri/xslt"
|
122
|
+
require_relative "nokogiri/html4"
|
123
|
+
require_relative "nokogiri/html"
|
124
|
+
require_relative "nokogiri/decorators/slop"
|
125
|
+
require_relative "nokogiri/css"
|
126
|
+
require_relative "nokogiri/html4/builder"
|
127
|
+
|
128
|
+
require_relative "nokogiri/html5" if Nokogiri.uses_gumbo?
|
@@ -1,7 +1,9 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
4
|
-
|
3
|
+
require "nokogiri"
|
4
|
+
|
5
|
+
module XSD
|
6
|
+
module XMLParser
|
5
7
|
###
|
6
8
|
# Nokogiri XML parser for soap4r.
|
7
9
|
#
|
@@ -26,40 +28,40 @@ module XSD # :nodoc:
|
|
26
28
|
class Nokogiri < XSD::XMLParser::Parser
|
27
29
|
###
|
28
30
|
# Create a new XSD parser with +host+ and +opt+
|
29
|
-
def initialize
|
31
|
+
def initialize(host, opt = {})
|
30
32
|
super
|
31
|
-
@parser = ::Nokogiri::XML::SAX::Parser.new(self, @charset ||
|
33
|
+
@parser = ::Nokogiri::XML::SAX::Parser.new(self, @charset || "UTF-8")
|
32
34
|
end
|
33
35
|
|
34
36
|
###
|
35
37
|
# Start parsing +string_or_readable+
|
36
|
-
def do_parse
|
38
|
+
def do_parse(string_or_readable)
|
37
39
|
@parser.parse(string_or_readable)
|
38
40
|
end
|
39
41
|
|
40
42
|
###
|
41
43
|
# Handle the start_element event with +name+ and +attrs+
|
42
|
-
def start_element
|
44
|
+
def start_element(name, attrs = [])
|
43
45
|
super(name, Hash[*attrs.flatten])
|
44
46
|
end
|
45
47
|
|
46
48
|
###
|
47
49
|
# Handle the end_element event with +name+
|
48
|
-
def end_element
|
50
|
+
def end_element(name)
|
49
51
|
super
|
50
52
|
end
|
51
53
|
|
52
54
|
###
|
53
55
|
# Handle errors with message +msg+
|
54
|
-
def error
|
55
|
-
raise ParseError
|
56
|
+
def error(msg)
|
57
|
+
raise ParseError, msg
|
56
58
|
end
|
57
|
-
|
59
|
+
alias_method :warning, :error
|
58
60
|
|
59
61
|
###
|
60
62
|
# Handle cdata_blocks containing +string+
|
61
|
-
def cdata_block
|
62
|
-
characters
|
63
|
+
def cdata_block(string)
|
64
|
+
characters(string)
|
63
65
|
end
|
64
66
|
|
65
67
|
###
|
@@ -69,16 +71,16 @@ module XSD # :nodoc:
|
|
69
71
|
# +prefix+ is the namespace prefix for the element
|
70
72
|
# +uri+ is the associated namespace URI
|
71
73
|
# +ns+ is a hash of namespace prefix:urls associated with the element
|
72
|
-
def start_element_namespace
|
74
|
+
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
|
73
75
|
###
|
74
76
|
# Deal with SAX v1 interface
|
75
|
-
name = [prefix, name].compact.join(
|
76
|
-
attributes = ns.map
|
77
|
-
[[
|
78
|
-
|
79
|
-
[[attr.prefix, attr.localname].compact.join(
|
80
|
-
|
81
|
-
start_element
|
77
|
+
name = [prefix, name].compact.join(":")
|
78
|
+
attributes = ns.map do |ns_prefix, ns_uri|
|
79
|
+
[["xmlns", ns_prefix].compact.join(":"), ns_uri]
|
80
|
+
end + attrs.map do |attr|
|
81
|
+
[[attr.prefix, attr.localname].compact.join(":"), attr.value]
|
82
|
+
end.flatten
|
83
|
+
start_element(name, attributes)
|
82
84
|
end
|
83
85
|
|
84
86
|
###
|
@@ -86,13 +88,13 @@ module XSD # :nodoc:
|
|
86
88
|
# +name+ is the element's name
|
87
89
|
# +prefix+ is the namespace prefix associated with the element
|
88
90
|
# +uri+ is the associated namespace URI
|
89
|
-
def end_element_namespace
|
91
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
|
90
92
|
###
|
91
93
|
# Deal with SAX v1 interface
|
92
|
-
end_element
|
94
|
+
end_element([prefix, name].compact.join(":"))
|
93
95
|
end
|
94
96
|
|
95
|
-
|
97
|
+
["xmldecl", "start_document", "end_document", "comment"].each do |name|
|
96
98
|
class_eval %{ def #{name}(*args); end }
|
97
99
|
end
|
98
100
|
|
@@ -0,0 +1,40 @@
|
|
1
|
+
From 27e4aa8d885e47a296ea78d114dbbe8fc7aa3508 Mon Sep 17 00:00:00 2001
|
2
|
+
From: Kevin Solorio <soloriok@gmail.com>
|
3
|
+
Date: Fri, 1 Feb 2019 14:32:42 -0800
|
4
|
+
Subject: [PATCH] Revert-support-html-h-b-7-1
|
5
|
+
|
6
|
+
---
|
7
|
+
entities.c | 17 -----------------
|
8
|
+
1 file changed, 17 deletions(-)
|
9
|
+
|
10
|
+
diff --git a/entities.c b/entities.c
|
11
|
+
index 43549bc5..82652f6d 100644
|
12
|
+
--- a/entities.c
|
13
|
+
+++ b/entities.c
|
14
|
+
@@ -623,23 +623,6 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
|
15
|
+
*out++ = 't';
|
16
|
+
*out++ = ';';
|
17
|
+
} else if (*cur == '&') {
|
18
|
+
- /*
|
19
|
+
- * Special handling of &{...} construct from HTML 4, see
|
20
|
+
- * http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
|
21
|
+
- */
|
22
|
+
- if (html && attr && (cur[1] == '{') &&
|
23
|
+
- (strchr((const char *) cur, '}'))) {
|
24
|
+
- while (*cur != '}') {
|
25
|
+
- *out++ = *cur++;
|
26
|
+
- indx = out - buffer;
|
27
|
+
- if (indx + 100 > buffer_size) {
|
28
|
+
- growBufferReentrant();
|
29
|
+
- out = &buffer[indx];
|
30
|
+
- }
|
31
|
+
- }
|
32
|
+
- *out++ = *cur++;
|
33
|
+
- continue;
|
34
|
+
- }
|
35
|
+
*out++ = '&';
|
36
|
+
*out++ = 'a';
|
37
|
+
*out++ = 'm';
|
38
|
+
--
|
39
|
+
2.16.2
|
40
|
+
|
@@ -0,0 +1,44 @@
|
|
1
|
+
From ffc08467744bd2305d41ca882c37fa30adf3a067 Mon Sep 17 00:00:00 2001
|
2
|
+
From: Kevin Solorio <soloriok@gmail.com>
|
3
|
+
Date: Wed, 27 Feb 2019 14:34:17 -0800
|
4
|
+
Subject: [PATCH 2/2] update entities.c to remove handling of ssi
|
5
|
+
|
6
|
+
---
|
7
|
+
entities.c | 21 ---------------------
|
8
|
+
1 file changed, 21 deletions(-)
|
9
|
+
|
10
|
+
diff --git a/entities.c b/entities.c
|
11
|
+
index 43549bc5..5c4a2a60 100644
|
12
|
+
--- a/entities.c
|
13
|
+
+++ b/entities.c
|
14
|
+
@@ -592,27 +592,6 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
|
15
|
+
* By default one have to encode at least '<', '>', '"' and '&' !
|
16
|
+
*/
|
17
|
+
if (*cur == '<') {
|
18
|
+
- const xmlChar *end;
|
19
|
+
-
|
20
|
+
- /*
|
21
|
+
- * Special handling of server side include in HTML attributes
|
22
|
+
- */
|
23
|
+
- if (html && attr &&
|
24
|
+
- (cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
|
25
|
+
- ((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
|
26
|
+
- while (cur != end) {
|
27
|
+
- *out++ = *cur++;
|
28
|
+
- indx = out - buffer;
|
29
|
+
- if (indx + 100 > buffer_size) {
|
30
|
+
- growBufferReentrant();
|
31
|
+
- out = &buffer[indx];
|
32
|
+
- }
|
33
|
+
- }
|
34
|
+
- *out++ = *cur++;
|
35
|
+
- *out++ = *cur++;
|
36
|
+
- *out++ = *cur++;
|
37
|
+
- continue;
|
38
|
+
- }
|
39
|
+
*out++ = '&';
|
40
|
+
*out++ = 'l';
|
41
|
+
*out++ = 't';
|
42
|
+
--
|
43
|
+
2.16.2
|
44
|
+
|
@@ -0,0 +1,25 @@
|
|
1
|
+
From 0b6ae484761fa01242fe8b67b54e3eb2d282d83d Mon Sep 17 00:00:00 2001
|
2
|
+
From: Mike Dalessio <mike.dalessio@gmail.com>
|
3
|
+
Date: Wed, 4 Dec 2019 08:43:51 -0500
|
4
|
+
Subject: [PATCH] fix libxml2.la's path
|
5
|
+
|
6
|
+
---
|
7
|
+
Makefile.in | 2 +-
|
8
|
+
1 file changed, 1 insertion(+), 1 deletion(-)
|
9
|
+
|
10
|
+
diff --git a/Makefile.in b/Makefile.in
|
11
|
+
index cf96d41..1372d8b 100644
|
12
|
+
--- a/Makefile.in
|
13
|
+
+++ b/Makefile.in
|
14
|
+
@@ -1057,7 +1057,7 @@ clean-noinstLTLIBRARIES:
|
15
|
+
rm -f $${locs}; \
|
16
|
+
}
|
17
|
+
|
18
|
+
-libxml2.la: $(libxml2_la_OBJECTS) $(libxml2_la_DEPENDENCIES) $(EXTRA_libxml2_la_DEPENDENCIES)
|
19
|
+
+$(top_builddir)/libxml2.la: $(libxml2_la_OBJECTS) $(libxml2_la_DEPENDENCIES) $(EXTRA_libxml2_la_DEPENDENCIES)
|
20
|
+
$(AM_V_CCLD)$(libxml2_la_LINK) -rpath $(libdir) $(libxml2_la_OBJECTS) $(libxml2_la_LIBADD) $(LIBS)
|
21
|
+
|
22
|
+
testdso.la: $(testdso_la_OBJECTS) $(testdso_la_DEPENDENCIES) $(EXTRA_testdso_la_DEPENDENCIES)
|
23
|
+
--
|
24
|
+
2.17.1
|
25
|
+
|
@@ -0,0 +1,53 @@
|
|
1
|
+
From c94172d2a4451368530db2186190d70be8a1d9e5 Mon Sep 17 00:00:00 2001
|
2
|
+
From: Ilya Zub <ilya@serpapi.com>
|
3
|
+
Date: Wed, 23 Dec 2020 12:45:29 +0200
|
4
|
+
Subject: Use glibc strlen to speed up xmlStrlen
|
5
|
+
MIME-Version: 1.0
|
6
|
+
Content-Type: text/plain; charset=UTF-8
|
7
|
+
Content-Transfer-Encoding: 8bit
|
8
|
+
|
9
|
+
xmlStrlen (entire HTML file): 926171.936981 μs
|
10
|
+
glibc_xmlStrlen (entire HTML file): 36905.903992 μs
|
11
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
|
12
|
+
|
13
|
+
xmlStrlen (average string): 57479.204010 μs
|
14
|
+
glibc_xmlStrlen (average string): 5802.069000 μs
|
15
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
|
16
|
+
|
17
|
+
xmlStrlen (bigger string): 388056.315979 μs
|
18
|
+
glibc_xmlStrlen (bigger string): 12797.856995 μs
|
19
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
|
20
|
+
|
21
|
+
xmlStrlen (smallest string): 15870.046021 μs
|
22
|
+
glibc_xmlStrlen (smallest string): 6282.208984 μs
|
23
|
+
delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
|
24
|
+
|
25
|
+
See https://gitlab.gnome.org/GNOME/libxml2/-/issues/212 for reference.
|
26
|
+
---
|
27
|
+
xmlstring.c | 9 ++-------
|
28
|
+
1 file changed, 2 insertions(+), 7 deletions(-)
|
29
|
+
|
30
|
+
diff --git a/xmlstring.c b/xmlstring.c
|
31
|
+
index e8a1e45d..df247dff 100644
|
32
|
+
--- a/xmlstring.c
|
33
|
+
+++ b/xmlstring.c
|
34
|
+
@@ -423,12 +423,7 @@ xmlStrsub(const xmlChar *str, int start, int len) {
|
35
|
+
|
36
|
+
int
|
37
|
+
xmlStrlen(const xmlChar *str) {
|
38
|
+
- size_t len = 0;
|
39
|
+
-
|
40
|
+
if (str == NULL) return(0);
|
41
|
+
- while (*str != 0) { /* non input consuming */
|
42
|
+
- str++;
|
43
|
+
- len++;
|
44
|
+
- }
|
45
|
+
- return(len > INT_MAX ? 0 : len);
|
46
|
+
+
|
47
|
+
+ return strlen((const char*)str);
|
48
|
+
}
|
49
|
+
|
50
|
+
/**
|
51
|
+
--
|
52
|
+
2.29.2
|
53
|
+
|
@@ -0,0 +1,81 @@
|
|
1
|
+
This patch is a result of rake-compiler-dock using centos 7 (manylinux2014) to cross-compile.
|
2
|
+
|
3
|
+
Centos, for reasons I have not been able to discern, implements `isnan` and `isinf` as a function
|
4
|
+
and not as a macro. Debian knows how to resolve that function at dynamic-link time (despite using a
|
5
|
+
macro at compile time), but musl-based systems (like alpine) do not. Running `nm` on nokogiri.so
|
6
|
+
created on such a centos system shows:
|
7
|
+
|
8
|
+
```
|
9
|
+
U __isinf@@GLIBC_2.2.5
|
10
|
+
U __isnan@@GLIBC_2.2.5
|
11
|
+
```
|
12
|
+
|
13
|
+
(see https://github.com/sparklemotion/nokogiri/pull/2142 for more info)
|
14
|
+
|
15
|
+
This patch avoids using glibc's `isnan` and `isinf` calls, instead using libxml2's fallback
|
16
|
+
implementation. There's history here, see libxml2 commit 8813f39:
|
17
|
+
|
18
|
+
commit 8813f39
|
19
|
+
Author: Nick Wellnhofer <wellnhofer@aevum.de>
|
20
|
+
Date: 2017-09-21 00:11:26 +0200
|
21
|
+
|
22
|
+
Simplify XPath NaN, inf and -0 handling
|
23
|
+
|
24
|
+
Use C99 macros NAN, INFINITY, isnan, isinf. If they're not available:
|
25
|
+
|
26
|
+
- Assume that (0.0 / 0.0) generates a NaN and !(x == x) tests for NaN.
|
27
|
+
- Use C89's HUGE_VAL for INFINITY.
|
28
|
+
|
29
|
+
Remove manual handling of NaN, infinity and negative zero in functions
|
30
|
+
xmlXPathValueFlipSign and xmlXPathDivValues.
|
31
|
+
|
32
|
+
Remove xmlXPathGetSign. All the tests for negative zero can be replaced
|
33
|
+
with a test for negative or positive zero.
|
34
|
+
|
35
|
+
Simplify xmlXPathRoundFunction.
|
36
|
+
|
37
|
+
Remove Trio dependency.
|
38
|
+
|
39
|
+
This should work on IEEE 754 compliant implementations even if the C99
|
40
|
+
macros aren't available, but will likely break some ancient platforms.
|
41
|
+
If problems arise, my plan is to port the relevant trionan.c solution
|
42
|
+
to xpath.c. Note that non-compliant implementations are impossible
|
43
|
+
to fully support, anyway, since XPath requires IEEE 754.
|
44
|
+
|
45
|
+
This patch would be unnecessary if any of the following was true:
|
46
|
+
|
47
|
+
* centos implements these as macros, and doesn't generate an unresolved symbol for either in the shared library
|
48
|
+
* we had a way to ensure `__isinf` and `__isnan` resolve on musl (e.g., we implement them locally)
|
49
|
+
|
50
|
+
diff --git a/xpath.c b/xpath.c
|
51
|
+
index 9f64ab9..5b6d999 100644
|
52
|
+
--- a/xpath.c
|
53
|
+
+++ b/xpath.c
|
54
|
+
@@ -515,11 +515,7 @@ xmlXPathInit(void) {
|
55
|
+
*/
|
56
|
+
int
|
57
|
+
xmlXPathIsNaN(double val) {
|
58
|
+
-#ifdef isnan
|
59
|
+
- return isnan(val);
|
60
|
+
-#else
|
61
|
+
return !(val == val);
|
62
|
+
-#endif
|
63
|
+
}
|
64
|
+
|
65
|
+
/**
|
66
|
+
@@ -530,15 +530,11 @@ xmlXPathIsNaN(double val) {
|
67
|
+
*/
|
68
|
+
int
|
69
|
+
xmlXPathIsInf(double val) {
|
70
|
+
-#ifdef isinf
|
71
|
+
- return isinf(val) ? (val > 0 ? 1 : -1) : 0;
|
72
|
+
-#else
|
73
|
+
if (val >= xmlXPathPINF)
|
74
|
+
return 1;
|
75
|
+
if (val <= -xmlXPathPINF)
|
76
|
+
return -1;
|
77
|
+
return 0;
|
78
|
+
-#endif
|
79
|
+
}
|
80
|
+
|
81
|
+
#endif /* SCHEMAS or XPATH */
|