RubyGems - nokogumbo - Versions diffs - 1.5.0 → 2.0.0.pre.alpha - Mend

nokogumbo 1.5.0 → 2.0.0.pre.alpha

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +56 -0
data/README.md +146 -22
data/ext/nokogumbo/extconf.rb +116 -0
data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
data/gumbo-parser/src/ascii.c +33 -0
data/gumbo-parser/src/ascii.h +31 -0
data/gumbo-parser/src/attribute.c +26 -28
data/gumbo-parser/src/attribute.h +3 -23
data/gumbo-parser/src/char_ref.c +135 -2351
data/gumbo-parser/src/char_ref.h +13 -29
data/gumbo-parser/src/error.c +215 -133
data/gumbo-parser/src/error.h +34 -49
data/gumbo-parser/src/foreign_attrs.c +104 -0
data/gumbo-parser/src/gumbo.h +506 -304
data/gumbo-parser/src/insertion_mode.h +4 -28
data/gumbo-parser/src/macros.h +91 -0
data/gumbo-parser/src/parser.c +1989 -1431
data/gumbo-parser/src/parser.h +6 -22
data/gumbo-parser/src/replacement.h +33 -0
data/gumbo-parser/src/string_buffer.c +43 -50
data/gumbo-parser/src/string_buffer.h +24 -40
data/gumbo-parser/src/string_piece.c +39 -39
data/gumbo-parser/src/svg_attrs.c +174 -0
data/gumbo-parser/src/svg_tags.c +137 -0
data/gumbo-parser/src/tag.c +186 -59
data/gumbo-parser/src/tag_lookup.c +382 -0
data/gumbo-parser/src/tag_lookup.h +13 -0
data/gumbo-parser/src/token_type.h +1 -25
data/gumbo-parser/src/tokenizer.c +899 -495
data/gumbo-parser/src/tokenizer.h +37 -37
data/gumbo-parser/src/tokenizer_states.h +6 -22
data/gumbo-parser/src/utf8.c +103 -86
data/gumbo-parser/src/utf8.h +37 -41
data/gumbo-parser/src/util.c +48 -38
data/gumbo-parser/src/util.h +10 -40
data/gumbo-parser/src/vector.c +45 -57
data/gumbo-parser/src/vector.h +17 -39
data/lib/nokogumbo.rb +10 -174
data/lib/nokogumbo/html5.rb +250 -0
data/lib/nokogumbo/html5/document.rb +37 -0
data/lib/nokogumbo/html5/document_fragment.rb +46 -0
data/lib/nokogumbo/version.rb +3 -0
data/lib/nokogumbo/xml/node.rb +57 -0
metadata +32 -19
data/ext/nokogumboc/extconf.rb +0 -60
data/gumbo-parser/src/char_ref.rl +0 -2554
data/gumbo-parser/src/string_piece.h +0 -38
data/gumbo-parser/src/tag.in +0 -150
data/gumbo-parser/src/tag_enum.h +0 -153
data/gumbo-parser/src/tag_gperf.h +0 -105
data/gumbo-parser/src/tag_sizes.h +0 -4
data/gumbo-parser/src/tag_strings.h +0 -153
data/gumbo-parser/visualc/include/strings.h +0 -4
data/test-nokogumbo.rb +0 -190

data/lib/nokogumbo/html5.rb ADDED Viewed

@@ -0,0 +1,250 @@
+require 'nokogumbo/html5/document'
+require 'nokogumbo/html5/document_fragment'
+module Nokogiri
+  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
+  def self.HTML5(string_or_io, url = nil, encoding = nil, **options, &block)
+    Nokogiri::HTML5::Document.parse(string_or_io, url, encoding, **options, &block)
+  end
+  module HTML5
+    # HTML uses the XHTML namespace.
+    HTML_NAMESPACE = 'http://www.w3.org/1999/xhtml'.freeze
+    MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML'.freeze
+    SVG_NAMESPACE = 'http://www.w3.org/2000/svg'.freeze
+    XLINK_NAMESPACE = 'http://www.w3.org/1999/xlink'.freeze
+    XML_NAMESPACE = 'http://www.w3.org/XML/1998/namespace'.freeze
+    XMLNS_NAMESPACE = 'http://www.w3.org/2000/xmlns/'.freeze
+    # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
+    def self.parse(string, url = nil, encoding = nil, **options, &block)
+      Document.parse(string, url, encoding, options, &block)
+    end
+    # Parse a fragment from +string+. Convenience method for
+    # Nokogiri::HTML5::DocumentFragment.parse.
+    def self.fragment(string, encoding = nil, **options)
+      DocumentFragment.parse(string, encoding, options)
+    end
+    # Fetch and parse a HTML document from the web, following redirects,
+    # handling https, and determining the character encoding using HTML5
+    # rules.  +uri+ may be a +String+ or a +URI+.  +options+ contains
+    # http headers and special options.  Everything which is not a
+    # special option is considered a header.  Special options include:
+    #  * :follow_limit => number of redirects which are followed
+    #  * :basic_auth => [username, password]
+    def self.get(uri, options={})
+      headers = options.clone
+      headers = {:follow_limit => headers} if Numeric === headers # deprecated
+      limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
+      require 'net/http'
+      uri = URI(uri) unless URI === uri
+      http = Net::HTTP.new(uri.host, uri.port)
+      # TLS / SSL support
+      http.use_ssl = true if uri.scheme == 'https'
+      # Pass through Net::HTTP override values, which currently include:
+      #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
+      #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
+      #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
+      #   :verify_callback, :verify_depth, :verify_mode
+      options.each do |key, value|
+        http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
+      end
+      request = Net::HTTP::Get.new(uri.request_uri)
+      # basic authentication
+      auth = headers.delete(:basic_auth)
+      auth ||= [uri.user, uri.password] if uri.user && uri.password
+      request.basic_auth auth.first, auth.last if auth
+      # remaining options are treated as headers
+      headers.each {|key, value| request[key.to_s] = value.to_s}
+      response = http.request(request)
+      case response
+      when Net::HTTPSuccess
+        doc = parse(reencode(response.body, response['content-type']), options)
+        doc.instance_variable_set('@response', response)
+        doc.class.send(:attr_reader, :response)
+        doc
+      when Net::HTTPRedirection
+        response.value if limit <= 1
+        location = URI.join(uri, response['location'])
+        get(location, options.merge(:follow_limit => limit-1))
+      else
+        response.value
+      end
+    end
+    private
+    def self.read_and_encode(string, encoding)
+      # Read the string with the given encoding.
+      if string.respond_to?(:read)
+        if encoding.nil?
+          string = string.read
+        else
+        string = string.read(encoding: encoding)
+        end
+      else
+        # Otherwise the string has the given encoding.
+        if encoding && string.respond_to?(:force_encoding)
+          string = string.dup
+          string.force_encoding(encoding)
+        end
+      end
+      # convert to UTF-8 (Ruby 1.9+)
+      if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
+        string = reencode(string.dup)
+      end
+      string
+    end
+    # Charset sniffing is a complex and controversial topic that understandably
+    # isn't done _by default_ by the Ruby Net::HTTP library.  This being said,
+    # it is a very real problem for consumers of HTML as the default for HTML
+    # is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
+    # *only* supports utf-8.
+    #
+    # Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
+    # detection.  Following this lead, Nokogiri::HTML5 attempts to do likewise,
+    # while attempting to more closely follow the HTML5 standard.
+    #
+    # http://bugs.ruby-lang.org/issues/2567
+    # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
+    #
+    def self.reencode(body, content_type=nil)
+      return body unless body.respond_to? :encoding
+      if body.encoding == Encoding::ASCII_8BIT
+        encoding = nil
+        # look for a Byte Order Mark (BOM)
+        if body[0..1] == "\xFE\xFF"
+          encoding = 'utf-16be'
+        elsif body[0..1] == "\xFF\xFE"
+          encoding = 'utf-16le'
+        elsif body[0..2] == "\xEF\xBB\xBF"
+          encoding = 'utf-8'
+        end
+        # look for a charset in a content-encoding header
+        if content_type
+          encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
+        end
+        # look for a charset in a meta tag in the first 1024 bytes
+        if not encoding
+          data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
+          data.scan(/<meta.*?>/m).each do |meta|
+            encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
+          end
+        end
+        # if all else fails, default to the official default encoding for HTML
+        encoding ||= Encoding::ISO_8859_1
+        # change the encoding to match the detected or inferred encoding
+        begin
+          body.force_encoding(encoding)
+        rescue ArgumentError
+          body.force_encoding(Encoding::ISO_8859_1)
+        end
+      end
+      body.encode(Encoding::UTF_8)
+    end
+    def self.serialize_node_internal(current_node, io, encoding, options)
+      case current_node.type
+      when XML::Node::ELEMENT_NODE
+        ns = current_node.namespace
+        ns_uri = ns.nil? ? nil : ns.uri
+        # XXX(sfc): attach namespaces to all nodes, even html?
+        if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
+          tagname = current_node.name
+        else
+          tagname = "#{ns.prefix}:#{current_node.name}"
+        end
+        io << '<' << tagname
+        current_node.attribute_nodes.each do |attr|
+          attr_ns = attr.namespace
+          if attr_ns.nil?
+            attr_name = attr.name
+          else
+            ns_uri = attr_ns.href
+            if ns_uri == XML_NAMESPACE
+              attr_name = 'xml:' + attr.name.sub(/^[^:]*:/, '')
+            elsif ns_uri == XMLNS_NAMESPACE && attr.name.sub(/^[^:]*:/, '') == 'xmlns'
+              attr_name = 'xmlns'
+            elsif ns_uri == XMLNS_NAMESPACE
+              attr_name = 'xmlns:' + attr.name.sub(/^[^:]*:/, '')
+            elsif ns_uri == XLINK_NAMESPACE
+              attr_name = 'xlink:' + attr.name.sub(/^[^:]*:/, '')
+            else
+              attr_name = "#{attr_ns.prefix}:#{attr.name}"
+            end
+          end
+          io << ' ' << attr_name << '="' << escape_text(attr.content, encoding, true) << '"'
+        end
+        io << '>'
+        if !%w[area base basefont bgsound br col embed frame hr img input keygen
+               link meta param source track wbr].include?(current_node.name)
+          io << "\n" if options[:preserve_newline] && prepend_newline?(current_node)
+          current_node.children.each do |child|
+            # XXX(sfc): Templates handled specially?
+            serialize_node_internal(child, io, encoding, options)
+          end
+          io << '</' << tagname << '>'
+        end
+      when XML::Node::TEXT_NODE
+        parent = current_node.parent
+        if parent.element? && %w[style script xmp iframe noembed noframes plaintext noscript].include?(parent.name)
+          io << current_node.content
+        else
+          io << escape_text(current_node.content, encoding, false)
+        end
+      when XML::Node::CDATA_SECTION_NODE
+        io << '<![CDATA[' << current_node.content << ']]>'
+      when XML::Node::COMMENT_NODE
+        io << '<!--' << current_node.content << '-->'
+      when XML::Node::PI_NODE
+        io << '<?' << current_node.content << '>'
+      when XML::Node::DOCUMENT_TYPE_NODE, XML::Node::DTD_NODE
+          io << '<!DOCTYPE ' << current_node.name << '>'
+      when XML::Node::HTML_DOCUMENT_NODE, XML::Node::DOCUMENT_FRAG_NODE
+        current_node.children.each do |child|
+          serialize_node_internal(child, io, encoding, options)
+        end
+      else
+        raise "Unexpected node '#{current_node.name}' of type #{current_node.type}"
+      end
+    end
+    def self.escape_text(text, encoding, attribute_mode)
+      if attribute_mode
+        text = text.gsub(/[&\u00a0"]/,
+                           '&' => '&amp;', "\u00a0" => '&nbsp;', '"' => '&quot;')
+      else
+        text = text.gsub(/[&\u00a0<>]/,
+                           '&' => '&amp;', "\u00a0" => '&nbsp;',  '<' => '&lt;', '>' => '&gt;')
+      end
+      # Not part of the standard
+      text.encode(encoding, fallback: lambda { |c| "&\#x#{c.ord.to_s(16)};" })
+    end
+    def self.prepend_newline?(node)
+      return false unless %w[pre textarea listing].include?(node.name) && !node.children.empty?
+      first_child = node.children[0]
+      first_child.text? && first_child.content.start_with?("\n")
+    end
+  end
+end

data/lib/nokogumbo/html5/document.rb ADDED Viewed

@@ -0,0 +1,37 @@
+module Nokogiri
+  module HTML5
+    class Document < Nokogiri::HTML::Document
+      def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
+        yield options if block_given?
+        if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
+          encoding ||= string_or_io.encoding.name
+        end
+        if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
+          url ||= string_or_io.path
+        end
+        do_parse(string_or_io, url, encoding, options)
+      end
+      def self.read_io(io, url = nil, encoding = nil, **options)
+        raise ArgumentError.new("io object doesn't respond to :read") unless io.respon_to?(:read)
+        do_parse(io, url, encoding, options)
+      end
+      def self.read_memory(string, url = nil, encoding = nil, **options)
+        do_parse(string.to_s, url, encoding, options)
+      end
+      private
+      def self.do_parse(string_or_io, url, encoding, options)
+        string = HTML5.read_and_encode(string_or_io, encoding)
+        max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
+        max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
+        doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
+        doc.encoding = 'UTF-8'
+        doc
+      end
+    end
+  end
+end

data/lib/nokogumbo/html5/document_fragment.rb ADDED Viewed

@@ -0,0 +1,46 @@
+require 'nokogiri'
+module Nokogiri
+  module HTML5
+    class DocumentFragment < Nokogiri::HTML::DocumentFragment
+      # Create a document fragment.
+      def initialize(doc, tags = nil, ctx = nil, options = {})
+        return self unless tags
+        if ctx
+          raise Argument.new("Fragment parsing with context not supported")
+        else
+          tags = Nokogiri::HTML5.read_and_encode(tags, nil)
+          # Copied from Nokogiri's document_fragment.rb and labled "a horrible
+          # hack."
+          if tags.strip =~ /^<body/i
+            path = "/html/body"
+          else
+            path = "/html/body/node()"
+          end
+          # Add 2 for <html> and <body>.
+          max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
+          options = options.dup
+          options[:max_tree_depth] = max_depth
+          temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
+          temp_doc.xpath(path).each { |child| child.parent = self }
+        self.errors = temp_doc.errors
+        end
+      end
+      def serialize(options = {}, &block)
+        # Bypass XML::Document.serialize which doesn't support options even
+        # though XML::Node.serialize does!
+        XML::Node.instance_method(:serialize).bind(self).call(options, &block)
+      end
+      # Parse a document fragment from +tags+, returning a Nodeset.
+      def self.parse(tags, encoding = nil, options = {})
+        doc = HTML5::Document.new
+        tags = HTML5.read_and_encode(tags, encoding)
+        doc.encoding = 'UTF-8'
+        new(doc, tags, nil, options)
+      end
+    end
+  end
+end

data/lib/nokogumbo/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module Nokogumbo
+  VERSION = "2.0.0-alpha"
+end

data/lib/nokogumbo/xml/node.rb ADDED Viewed

@@ -0,0 +1,57 @@
+require 'nokogiri'
+module Nokogiri
+  # Monkey patch
+  module XML
+    class Node
+      # HTML elements can have attributes that contain colons.
+      # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
+      # and tries to create an attribute in a namespace. This is especially
+      # annoying with attribute names like xml:lang since libxml2 will
+      # actually create the xml namespace if it doesn't exist already.
+      define_method(:add_child_node_and_reparent_attrs) do |node|
+        add_child_node(node)
+        node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
+          attr.remove
+          node[attr.name] = attr.value
+        end
+      end
+      def inner_html(options = {})
+        result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
+        result << children.map { |child| child.to_html(options) }.join
+        result
+      end
+      def write_to(io, *options)
+        options = options.first.is_a?(Hash) ? options.shift : {}
+        encoding = options[:encoding] || options[0]
+        if Nokogiri.jruby?
+          save_options = options[:save_with] || options[1]
+          indent_times = options[:indent] || 0
+        else
+          save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
+          indent_times = options[:indent] || 2
+        end
+        indent_string = (options[:indent_text] || ' ') * indent_times
+        config = SaveOptions.new(save_options.to_i)
+        yield config if block_given?
+        config_options = config.options
+        if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0) || !document.is_a?(HTML5::Document)
+          # Use Nokogiri's serializing code.
+          native_write_to(io, encoding, indent_string, config_options)
+        else
+          # Serialize including the current node.
+          encoding ||= document.encoding || Encoding::UTF_8
+          internal_ops = {
+            trailing_nl: config_options & SaveOptions::FORMAT != 0,
+            preserve_newline: options[:preserve_newline] || false
+          }
+          HTML5.serialize_node_internal(self, io, encoding, options)
+        end
+      end
+    end
+  end
+end

metadata CHANGED Viewed

@@ -1,14 +1,15 @@
 --- !ruby/object:Gem::Specification
 name: nokogumbo
 version: !ruby/object:Gem::Version
-  version: 1.5.0
+  version: 2.0.0.pre.alpha
 platform: ruby
 authors:
 - Sam Ruby
+- Stephen Checkoway
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-01-27 00:00:00.000000000 Z
+date: 2018-08-31 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -26,37 +27,42 @@ dependencies:
         version: '0'
 description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
   access the result as a Nokogiri parsed document.
-email: rubys@intertwingly.net
+email:
+- rubys@intertwingly.net
+- s@pahtak.org
 executables: []
 extensions:
-- ext/nokogumboc/extconf.rb
+- ext/nokogumbo/extconf.rb
 extra_rdoc_files: []
 files:
+- CHANGELOG.md
 - LICENSE.txt
 - README.md
-- ext/nokogumboc/extconf.rb
-- ext/nokogumboc/nokogumbo.c
+- ext/nokogumbo/extconf.rb
+- ext/nokogumbo/nokogumbo.c
+- gumbo-parser/src/ascii.c
+- gumbo-parser/src/ascii.h
 - gumbo-parser/src/attribute.c
 - gumbo-parser/src/attribute.h
 - gumbo-parser/src/char_ref.c
 - gumbo-parser/src/char_ref.h
-- gumbo-parser/src/char_ref.rl
 - gumbo-parser/src/error.c
 - gumbo-parser/src/error.h
+- gumbo-parser/src/foreign_attrs.c
 - gumbo-parser/src/gumbo.h
 - gumbo-parser/src/insertion_mode.h
+- gumbo-parser/src/macros.h
 - gumbo-parser/src/parser.c
 - gumbo-parser/src/parser.h
+- gumbo-parser/src/replacement.h
 - gumbo-parser/src/string_buffer.c
 - gumbo-parser/src/string_buffer.h
 - gumbo-parser/src/string_piece.c
-- gumbo-parser/src/string_piece.h
+- gumbo-parser/src/svg_attrs.c
+- gumbo-parser/src/svg_tags.c
 - gumbo-parser/src/tag.c
-- gumbo-parser/src/tag.in
-- gumbo-parser/src/tag_enum.h
-- gumbo-parser/src/tag_gperf.h
-- gumbo-parser/src/tag_sizes.h
-- gumbo-parser/src/tag_strings.h
+- gumbo-parser/src/tag_lookup.c
+- gumbo-parser/src/tag_lookup.h
 - gumbo-parser/src/token_type.h
 - gumbo-parser/src/tokenizer.c
 - gumbo-parser/src/tokenizer.h
@@ -67,13 +73,20 @@ files:
 - gumbo-parser/src/util.h
 - gumbo-parser/src/vector.c
 - gumbo-parser/src/vector.h
-- gumbo-parser/visualc/include/strings.h
 - lib/nokogumbo.rb
-- test-nokogumbo.rb
+- lib/nokogumbo/html5.rb
+- lib/nokogumbo/html5/document.rb
+- lib/nokogumbo/html5/document_fragment.rb
+- lib/nokogumbo/version.rb
+- lib/nokogumbo/xml/node.rb
 homepage: https://github.com/rubys/nokogumbo/#readme
 licenses:
 - Apache-2.0
-metadata: {}
+metadata:
+  bug_tracker_uri: https://github.com/rubys/nokogumbo/issues
+  changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
+  homepage_uri: https://github.com/rubys/nokogumbo/#readme
+  source_code_uri: https://github.com/rubys/nokogumbo
 post_install_message:
 rdoc_options: []
 require_paths:
@@ -85,12 +98,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">="
+  - - ">"
     - !ruby/object:Gem::Version
-      version: '0'
+      version: 1.3.1
 requirements: []
 rubyforge_project:
-rubygems_version: 2.7.4
+rubygems_version: 2.7.6
 signing_key:
 specification_version: 4
 summary: Nokogiri interface to the Gumbo HTML5 parser