nokogiri 1.6.0 → 1.13.2
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +7 -0
- data/Gemfile +3 -19
- data/LICENSE-DEPENDENCIES.md +1903 -0
- data/LICENSE.md +9 -0
- data/README.md +280 -0
- data/bin/nokogiri +84 -31
- data/dependencies.yml +23 -4
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +952 -132
- data/ext/nokogiri/gumbo.c +584 -0
- data/ext/nokogiri/html4_document.c +166 -0
- data/ext/nokogiri/html4_element_description.c +294 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +120 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +231 -96
- data/ext/nokogiri/nokogiri.h +188 -129
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +49 -40
- data/ext/nokogiri/xml_attribute_decl.c +18 -18
- data/ext/nokogiri/xml_cdata.c +24 -23
- data/ext/nokogiri/xml_comment.c +29 -21
- data/ext/nokogiri/xml_document.c +327 -223
- data/ext/nokogiri/xml_document_fragment.c +12 -16
- data/ext/nokogiri/xml_dtd.c +56 -50
- data/ext/nokogiri/xml_element_content.c +31 -26
- data/ext/nokogiri/xml_element_decl.c +22 -22
- data/ext/nokogiri/xml_encoding_handler.c +45 -20
- data/ext/nokogiri/xml_entity_decl.c +32 -30
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +74 -32
- data/ext/nokogiri/xml_node.c +1290 -680
- data/ext/nokogiri/xml_node_set.c +239 -208
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +227 -189
- data/ext/nokogiri/xml_relax_ng.c +52 -28
- data/ext/nokogiri/xml_sax_parser.c +123 -125
- data/ext/nokogiri/xml_sax_parser_context.c +138 -79
- data/ext/nokogiri/xml_sax_push_parser.c +88 -35
- data/ext/nokogiri/xml_schema.c +112 -33
- data/ext/nokogiri/xml_syntax_error.c +50 -23
- data/ext/nokogiri/xml_text.c +14 -18
- data/ext/nokogiri/xml_xpath_context.c +227 -140
- data/ext/nokogiri/xslt_stylesheet.c +269 -177
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +101 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +626 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/gumbo.h +943 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +4875 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +222 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +169 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +68 -0
- data/gumbo-parser/src/util.h +30 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +10 -58
- data/lib/nokogiri/css/parser.rb +407 -357
- data/lib/nokogiri/css/parser.y +265 -246
- data/lib/nokogiri/css/parser_extras.rb +52 -49
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +107 -104
- data/lib/nokogiri/css/tokenizer.rex +8 -7
- data/lib/nokogiri/css/xpath_visitor.rb +266 -80
- data/lib/nokogiri/css.rb +50 -17
- data/lib/nokogiri/decorators/slop.rb +17 -8
- data/lib/nokogiri/extension.rb +31 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/html4/document.rb +331 -0
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/{html → html4}/sax/parser.rb +24 -15
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
- data/lib/nokogiri/html4.rb +46 -0
- data/lib/nokogiri/html5/document.rb +88 -0
- data/lib/nokogiri/html5/document_fragment.rb +83 -0
- data/lib/nokogiri/html5/node.rb +96 -0
- data/lib/nokogiri/html5.rb +477 -0
- data/lib/nokogiri/jruby/dependencies.rb +21 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +221 -0
- data/lib/nokogiri/version.rb +3 -105
- data/lib/nokogiri/xml/attr.rb +6 -3
- data/lib/nokogiri/xml/attribute_decl.rb +3 -1
- data/lib/nokogiri/xml/builder.rb +96 -54
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +234 -95
- data/lib/nokogiri/xml/document_fragment.rb +86 -36
- data/lib/nokogiri/xml/dtd.rb +16 -4
- data/lib/nokogiri/xml/element_content.rb +2 -0
- data/lib/nokogiri/xml/element_decl.rb +3 -1
- data/lib/nokogiri/xml/entity_decl.rb +4 -2
- data/lib/nokogiri/xml/entity_reference.rb +20 -0
- data/lib/nokogiri/xml/namespace.rb +3 -0
- data/lib/nokogiri/xml/node/save_options.rb +8 -4
- data/lib/nokogiri/xml/node.rb +947 -502
- data/lib/nokogiri/xml/node_set.rb +168 -159
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +40 -5
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +25 -26
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +3 -1
- data/lib/nokogiri/xml/reader.rb +23 -28
- data/lib/nokogiri/xml/relax_ng.rb +8 -2
- data/lib/nokogiri/xml/sax/document.rb +45 -49
- data/lib/nokogiri/xml/sax/parser.rb +43 -41
- data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
- data/lib/nokogiri/xml/sax.rb +6 -4
- data/lib/nokogiri/xml/schema.rb +19 -9
- data/lib/nokogiri/xml/searchable.rb +270 -0
- data/lib/nokogiri/xml/syntax_error.rb +25 -1
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +3 -3
- data/lib/nokogiri/xml.rb +38 -36
- data/lib/nokogiri/xslt/stylesheet.rb +3 -1
- data/lib/nokogiri/xslt.rb +29 -20
- data/lib/nokogiri.rb +69 -69
- data/lib/xsd/xmlparser/nokogiri.rb +26 -24
- data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
- data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
- data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +3040 -0
- data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
- data/ports/archives/libxml2-2.9.13.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
- metadata +278 -362
- data/.autotest +0 -26
- data/.gemtest +0 -0
- data/.travis.yml +0 -27
- data/CHANGELOG.ja.rdoc +0 -819
- data/CHANGELOG.rdoc +0 -819
- data/C_CODING_STYLE.rdoc +0 -33
- data/Manifest.txt +0 -315
- data/README.ja.rdoc +0 -106
- data/README.rdoc +0 -175
- data/ROADMAP.md +0 -90
- data/Rakefile +0 -246
- data/STANDARD_RESPONSES.md +0 -47
- data/Y_U_NO_GEMSPEC.md +0 -155
- data/build_all +0 -105
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -56
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -13
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -14
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document.rb +0 -254
- data/lib/nokogiri/html/document_fragment.rb +0 -41
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/lib/nokogiri/html/sax/push_parser.rb +0 -16
- data/ports/archives/libxml2-2.8.0.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.26.tar.gz +0 -0
- data/tasks/cross_compile.rb +0 -132
- data/tasks/nokogiri.org.rb +0 -24
- data/tasks/test.rb +0 -95
- data/test/css/test_nthiness.rb +0 -159
- data/test/css/test_parser.rb +0 -341
- data/test/css/test_tokenizer.rb +0 -198
- data/test/css/test_xpath_visitor.rb +0 -91
- data/test/decorators/test_slop.rb +0 -16
- data/test/files/2ch.html +0 -108
- data/test/files/address_book.rlx +0 -12
- data/test/files/address_book.xml +0 -10
- data/test/files/bar/bar.xsd +0 -4
- data/test/files/bogus.xml +0 -0
- data/test/files/dont_hurt_em_why.xml +0 -422
- data/test/files/encoding.html +0 -82
- data/test/files/encoding.xhtml +0 -84
- data/test/files/exslt.xml +0 -8
- data/test/files/exslt.xslt +0 -35
- data/test/files/foo/foo.xsd +0 -4
- data/test/files/metacharset.html +0 -10
- data/test/files/noencoding.html +0 -47
- data/test/files/po.xml +0 -32
- data/test/files/po.xsd +0 -66
- data/test/files/saml/saml20assertion_schema.xsd +0 -283
- data/test/files/saml/saml20protocol_schema.xsd +0 -302
- data/test/files/saml/xenc_schema.xsd +0 -146
- data/test/files/saml/xmldsig_schema.xsd +0 -318
- data/test/files/shift_jis.html +0 -10
- data/test/files/shift_jis.xml +0 -5
- data/test/files/snuggles.xml +0 -3
- data/test/files/staff.dtd +0 -10
- data/test/files/staff.xml +0 -59
- data/test/files/staff.xslt +0 -32
- data/test/files/test_document_url/bar.xml +0 -2
- data/test/files/test_document_url/document.dtd +0 -4
- data/test/files/test_document_url/document.xml +0 -6
- data/test/files/tlm.html +0 -850
- data/test/files/to_be_xincluded.xml +0 -2
- data/test/files/valid_bar.xml +0 -2
- data/test/files/xinclude.xml +0 -4
- data/test/helper.rb +0 -154
- data/test/html/sax/test_parser.rb +0 -141
- data/test/html/sax/test_parser_context.rb +0 -46
- data/test/html/test_builder.rb +0 -164
- data/test/html/test_document.rb +0 -552
- data/test/html/test_document_encoding.rb +0 -138
- data/test/html/test_document_fragment.rb +0 -261
- data/test/html/test_element_description.rb +0 -105
- data/test/html/test_named_characters.rb +0 -14
- data/test/html/test_node.rb +0 -196
- data/test/html/test_node_encoding.rb +0 -27
- data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
- data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
- data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
- data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
- data/test/test_convert_xpath.rb +0 -135
- data/test/test_css_cache.rb +0 -45
- data/test/test_encoding_handler.rb +0 -46
- data/test/test_memory_leak.rb +0 -156
- data/test/test_nokogiri.rb +0 -132
- data/test/test_reader.rb +0 -555
- data/test/test_soap4r_sax.rb +0 -52
- data/test/test_xslt_transforms.rb +0 -254
- data/test/xml/node/test_save_options.rb +0 -28
- data/test/xml/node/test_subclass.rb +0 -44
- data/test/xml/sax/test_parser.rb +0 -366
- data/test/xml/sax/test_parser_context.rb +0 -106
- data/test/xml/sax/test_push_parser.rb +0 -157
- data/test/xml/test_attr.rb +0 -64
- data/test/xml/test_attribute_decl.rb +0 -86
- data/test/xml/test_builder.rb +0 -306
- data/test/xml/test_c14n.rb +0 -151
- data/test/xml/test_cdata.rb +0 -48
- data/test/xml/test_comment.rb +0 -29
- data/test/xml/test_document.rb +0 -828
- data/test/xml/test_document_encoding.rb +0 -28
- data/test/xml/test_document_fragment.rb +0 -223
- data/test/xml/test_dtd.rb +0 -103
- data/test/xml/test_dtd_encoding.rb +0 -33
- data/test/xml/test_element_content.rb +0 -56
- data/test/xml/test_element_decl.rb +0 -73
- data/test/xml/test_entity_decl.rb +0 -122
- data/test/xml/test_entity_reference.rb +0 -245
- data/test/xml/test_namespace.rb +0 -95
- data/test/xml/test_node.rb +0 -1137
- data/test/xml/test_node_attributes.rb +0 -96
- data/test/xml/test_node_encoding.rb +0 -107
- data/test/xml/test_node_inheritance.rb +0 -32
- data/test/xml/test_node_reparenting.rb +0 -374
- data/test/xml/test_node_set.rb +0 -755
- data/test/xml/test_parse_options.rb +0 -64
- data/test/xml/test_processing_instruction.rb +0 -30
- data/test/xml/test_reader_encoding.rb +0 -142
- data/test/xml/test_relax_ng.rb +0 -60
- data/test/xml/test_schema.rb +0 -103
- data/test/xml/test_syntax_error.rb +0 -12
- data/test/xml/test_text.rb +0 -45
- data/test/xml/test_unparented_node.rb +0 -422
- data/test/xml/test_xinclude.rb +0 -83
- data/test/xml/test_xpath.rb +0 -295
- data/test/xslt/test_custom_functions.rb +0 -133
- data/test/xslt/test_exception_handling.rb +0 -37
- data/test_all +0 -81
@@ -1,254 +0,0 @@
|
|
1
|
-
module Nokogiri
|
2
|
-
module HTML
|
3
|
-
class Document < Nokogiri::XML::Document
|
4
|
-
###
|
5
|
-
# Get the meta tag encoding for this document. If there is no meta tag,
|
6
|
-
# then nil is returned.
|
7
|
-
def meta_encoding
|
8
|
-
meta = meta_content_type and
|
9
|
-
match = /charset\s*=\s*([\w-]+)/i.match(meta['content']) and
|
10
|
-
match[1]
|
11
|
-
end
|
12
|
-
|
13
|
-
###
|
14
|
-
# Set the meta tag encoding for this document. If there is no meta
|
15
|
-
# content tag, the encoding is not set.
|
16
|
-
def meta_encoding= encoding
|
17
|
-
meta = meta_content_type and
|
18
|
-
meta['content'] = "text/html; charset=%s" % encoding
|
19
|
-
end
|
20
|
-
|
21
|
-
def meta_content_type
|
22
|
-
css('meta[@http-equiv]').find { |node|
|
23
|
-
node['http-equiv'] =~ /\AContent-Type\z/i and
|
24
|
-
!node['content'].nil? and
|
25
|
-
!node['content'].empty?
|
26
|
-
}
|
27
|
-
end
|
28
|
-
private :meta_content_type
|
29
|
-
|
30
|
-
###
|
31
|
-
# Get the title string of this document. Return nil if there is
|
32
|
-
# no title tag.
|
33
|
-
def title
|
34
|
-
title = at('title') and title.inner_text
|
35
|
-
end
|
36
|
-
|
37
|
-
###
|
38
|
-
# Set the title string of this document. If there is no head
|
39
|
-
# element, the title is not set.
|
40
|
-
def title=(text)
|
41
|
-
unless title = at('title')
|
42
|
-
head = at('head') or return nil
|
43
|
-
title = Nokogiri::XML::Node.new('title', self)
|
44
|
-
head << title
|
45
|
-
end
|
46
|
-
title.children = XML::Text.new(text, self)
|
47
|
-
end
|
48
|
-
|
49
|
-
####
|
50
|
-
# Serialize Node using +options+. Save options can also be set using a
|
51
|
-
# block. See SaveOptions.
|
52
|
-
#
|
53
|
-
# These two statements are equivalent:
|
54
|
-
#
|
55
|
-
# node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
|
56
|
-
#
|
57
|
-
# or
|
58
|
-
#
|
59
|
-
# node.serialize(:encoding => 'UTF-8') do |config|
|
60
|
-
# config.format.as_xml
|
61
|
-
# end
|
62
|
-
#
|
63
|
-
def serialize options = {}
|
64
|
-
options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
|
65
|
-
super
|
66
|
-
end
|
67
|
-
|
68
|
-
####
|
69
|
-
# Create a Nokogiri::XML::DocumentFragment from +tags+
|
70
|
-
def fragment tags = nil
|
71
|
-
DocumentFragment.new(self, tags, self.root)
|
72
|
-
end
|
73
|
-
|
74
|
-
class << self
|
75
|
-
###
|
76
|
-
# Parse HTML. +string_or_io+ may be a String, or any object that
|
77
|
-
# responds to _read_ and _close_ such as an IO, or StringIO.
|
78
|
-
# +url+ is resource where this document is located. +encoding+ is the
|
79
|
-
# encoding that should be used when processing the document. +options+
|
80
|
-
# is a number that sets options in the parser, such as
|
81
|
-
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
|
82
|
-
# Nokogiri::XML::ParseOptions.
|
83
|
-
def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
|
84
|
-
|
85
|
-
options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
|
86
|
-
# Give the options to the user
|
87
|
-
yield options if block_given?
|
88
|
-
|
89
|
-
if string_or_io.respond_to?(:encoding)
|
90
|
-
unless string_or_io.encoding.name == "ASCII-8BIT"
|
91
|
-
encoding ||= string_or_io.encoding.name
|
92
|
-
end
|
93
|
-
end
|
94
|
-
|
95
|
-
if string_or_io.respond_to?(:read)
|
96
|
-
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
97
|
-
if !encoding
|
98
|
-
# Libxml2's parser has poor support for encoding
|
99
|
-
# detection. First, it does not recognize the HTML5
|
100
|
-
# style meta charset declaration. Secondly, even if it
|
101
|
-
# successfully detects an encoding hint, it does not
|
102
|
-
# re-decode or re-parse the preceding part which may be
|
103
|
-
# garbled.
|
104
|
-
#
|
105
|
-
# EncodingReader aims to perform advanced encoding
|
106
|
-
# detection beyond what Libxml2 does, and to emulate
|
107
|
-
# rewinding of a stream and make Libxml2 redo parsing
|
108
|
-
# from the start when an encoding hint is found.
|
109
|
-
string_or_io = EncodingReader.new(string_or_io)
|
110
|
-
begin
|
111
|
-
return read_io(string_or_io, url, encoding, options.to_i)
|
112
|
-
rescue EncodingFound => e
|
113
|
-
encoding = e.found_encoding
|
114
|
-
end
|
115
|
-
end
|
116
|
-
return read_io(string_or_io, url, encoding, options.to_i)
|
117
|
-
end
|
118
|
-
|
119
|
-
# read_memory pukes on empty docs
|
120
|
-
return new if string_or_io.nil? or string_or_io.empty?
|
121
|
-
|
122
|
-
encoding ||= EncodingReader.detect_encoding(string_or_io)
|
123
|
-
|
124
|
-
read_memory(string_or_io, url, encoding, options.to_i)
|
125
|
-
end
|
126
|
-
end
|
127
|
-
|
128
|
-
class EncodingFound < StandardError # :nodoc:
|
129
|
-
attr_reader :found_encoding
|
130
|
-
|
131
|
-
def initialize(encoding)
|
132
|
-
@found_encoding = encoding
|
133
|
-
super("encoding found: %s" % encoding)
|
134
|
-
end
|
135
|
-
end
|
136
|
-
|
137
|
-
class EncodingReader # :nodoc:
|
138
|
-
class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
|
139
|
-
attr_reader :encoding
|
140
|
-
|
141
|
-
def initialize
|
142
|
-
@encoding = nil
|
143
|
-
super()
|
144
|
-
end
|
145
|
-
|
146
|
-
def start_element(name, attrs = [])
|
147
|
-
return unless name == 'meta'
|
148
|
-
attr = Hash[attrs]
|
149
|
-
charset = attr['charset'] and
|
150
|
-
@encoding = charset
|
151
|
-
http_equiv = attr['http-equiv'] and
|
152
|
-
http_equiv.match(/\AContent-Type\z/i) and
|
153
|
-
content = attr['content'] and
|
154
|
-
m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
|
155
|
-
@encoding = m[1]
|
156
|
-
end
|
157
|
-
end
|
158
|
-
|
159
|
-
class JumpSAXHandler < SAXHandler
|
160
|
-
def initialize(jumptag)
|
161
|
-
@jumptag = jumptag
|
162
|
-
super()
|
163
|
-
end
|
164
|
-
|
165
|
-
def start_element(name, attrs = [])
|
166
|
-
super
|
167
|
-
throw @jumptag, @encoding if @encoding
|
168
|
-
throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
|
169
|
-
end
|
170
|
-
end
|
171
|
-
|
172
|
-
def self.detect_encoding(chunk)
|
173
|
-
if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
|
174
|
-
return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
|
175
|
-
end
|
176
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
177
|
-
return Nokogiri.XML(m[1]).encoding
|
178
|
-
|
179
|
-
if Nokogiri.jruby?
|
180
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
|
181
|
-
return m[4]
|
182
|
-
catch(:encoding_found) {
|
183
|
-
Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
184
|
-
nil
|
185
|
-
}
|
186
|
-
else
|
187
|
-
handler = SAXHandler.new
|
188
|
-
parser = Nokogiri::HTML::SAX::PushParser.new(handler)
|
189
|
-
parser << chunk rescue Nokogiri::SyntaxError
|
190
|
-
handler.encoding
|
191
|
-
end
|
192
|
-
end
|
193
|
-
|
194
|
-
def self.is_jruby_without_fix?
|
195
|
-
JRUBY_VERSION.split('.').join.to_i < 165
|
196
|
-
end
|
197
|
-
|
198
|
-
def self.detect_encoding_for_jruby_without_fix(chunk)
|
199
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
200
|
-
return Nokogiri.XML(m[1]).encoding
|
201
|
-
|
202
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
|
203
|
-
return m[4]
|
204
|
-
|
205
|
-
catch(:encoding_found) {
|
206
|
-
Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
|
207
|
-
nil
|
208
|
-
}
|
209
|
-
rescue Nokogiri::SyntaxError, RuntimeError
|
210
|
-
# Ignore parser errors that nokogiri may raise
|
211
|
-
nil
|
212
|
-
end
|
213
|
-
|
214
|
-
def initialize(io)
|
215
|
-
@io = io
|
216
|
-
@firstchunk = nil
|
217
|
-
@encoding_found = nil
|
218
|
-
end
|
219
|
-
|
220
|
-
# This method is used by the C extension so that
|
221
|
-
# Nokogiri::HTML::Document#read_io() does not leak memory when
|
222
|
-
# EncodingFound is raised.
|
223
|
-
attr_reader :encoding_found
|
224
|
-
|
225
|
-
def read(len)
|
226
|
-
# no support for a call without len
|
227
|
-
|
228
|
-
if !@firstchunk
|
229
|
-
@firstchunk = @io.read(len) or return nil
|
230
|
-
|
231
|
-
# This implementation expects that the first call from
|
232
|
-
# htmlReadIO() is made with a length long enough (~1KB) to
|
233
|
-
# achieve advanced encoding detection.
|
234
|
-
if encoding = EncodingReader.detect_encoding(@firstchunk)
|
235
|
-
# The first chunk is stored for the next read in retry.
|
236
|
-
raise @encoding_found = EncodingFound.new(encoding)
|
237
|
-
end
|
238
|
-
end
|
239
|
-
@encoding_found = nil
|
240
|
-
|
241
|
-
ret = @firstchunk.slice!(0, len)
|
242
|
-
if (len -= ret.length) > 0
|
243
|
-
rest = @io.read(len) and ret << rest
|
244
|
-
end
|
245
|
-
if ret.empty?
|
246
|
-
nil
|
247
|
-
else
|
248
|
-
ret
|
249
|
-
end
|
250
|
-
end
|
251
|
-
end
|
252
|
-
end
|
253
|
-
end
|
254
|
-
end
|
@@ -1,41 +0,0 @@
|
|
1
|
-
module Nokogiri
|
2
|
-
module HTML
|
3
|
-
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
4
|
-
attr_accessor :errors
|
5
|
-
|
6
|
-
####
|
7
|
-
# Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
|
8
|
-
def self.parse tags, encoding = nil
|
9
|
-
doc = HTML::Document.new
|
10
|
-
|
11
|
-
encoding ||= tags.respond_to?(:encoding) ? tags.encoding.name : 'UTF-8'
|
12
|
-
doc.encoding = encoding
|
13
|
-
|
14
|
-
new(doc, tags)
|
15
|
-
end
|
16
|
-
|
17
|
-
def initialize document, tags = nil, ctx = nil
|
18
|
-
return self unless tags
|
19
|
-
|
20
|
-
if ctx
|
21
|
-
preexisting_errors = document.errors.dup
|
22
|
-
node_set = ctx.parse("<div>#{tags}</div>")
|
23
|
-
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
|
24
|
-
self.errors = document.errors - preexisting_errors
|
25
|
-
else
|
26
|
-
# This is a horrible hack, but I don't care
|
27
|
-
if tags.strip =~ /^<body/i
|
28
|
-
path = "/html/body"
|
29
|
-
else
|
30
|
-
path = "/html/body/node()"
|
31
|
-
end
|
32
|
-
|
33
|
-
temp_doc = HTML::Document.parse "<html><body>#{tags}", nil, document.encoding
|
34
|
-
temp_doc.xpath(path).each { |child| child.parent = self }
|
35
|
-
self.errors = temp_doc.errors
|
36
|
-
end
|
37
|
-
children
|
38
|
-
end
|
39
|
-
end
|
40
|
-
end
|
41
|
-
end
|