RubyGems - nokogumbo - Versions diffs - 2.0.0.pre.alpha → 2.0.0 - Mend

nokogumbo 2.0.0.pre.alpha → 2.0.0

Files changed (29) hide show

checksums.yaml +4 -4
data/README.md +81 -10
data/ext/nokogumbo/extconf.rb +6 -1
data/ext/nokogumbo/nokogumbo.c +579 -233
data/gumbo-parser/src/ascii.c +42 -0
data/gumbo-parser/src/ascii.h +91 -7
data/gumbo-parser/src/char_ref.c +5973 -4601
data/gumbo-parser/src/char_ref.h +13 -28
data/gumbo-parser/src/error.c +376 -120
data/gumbo-parser/src/error.h +63 -125
data/gumbo-parser/src/gumbo.h +47 -4
data/gumbo-parser/src/parser.c +849 -709
data/gumbo-parser/src/string_buffer.c +1 -1
data/gumbo-parser/src/string_buffer.h +1 -1
data/gumbo-parser/src/token_buffer.c +79 -0
data/gumbo-parser/src/token_buffer.h +71 -0
data/gumbo-parser/src/tokenizer.c +1426 -1261
data/gumbo-parser/src/tokenizer.h +5 -5
data/gumbo-parser/src/tokenizer_states.h +275 -23
data/gumbo-parser/src/utf8.c +12 -59
data/gumbo-parser/src/utf8.h +51 -16
data/lib/nokogumbo.rb +0 -1
data/lib/nokogumbo/html5.rb +2 -1
data/lib/nokogumbo/html5/document.rb +12 -1
data/lib/nokogumbo/html5/document_fragment.rb +35 -20
data/lib/nokogumbo/{xml → html5}/node.rb +28 -13
data/lib/nokogumbo/version.rb +1 -1
metadata +16 -9
data/CHANGELOG.md +0 -56

@@ -30,7 +30,8 @@ struct GumboInternalError;
 struct GumboInternalParser;
 // Unicode replacement char.
-extern const int kUtf8ReplacementChar;
+#define kUtf8ReplacementChar 0xFFFD
+#define kUtf8MaxChar 0x10FFFF
 typedef struct GumboInternalUtf8Iterator {
   // Points at the start of the code point most recently read into 'current'.
@@ -60,9 +61,23 @@ typedef struct GumboInternalUtf8Iterator {
   struct GumboInternalParser* _parser;
 } Utf8Iterator;
-// Returns true if this Unicode code point is in the list of characters
-// forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
-bool utf8_is_invalid_code_point(int c) CONST_FN;
+// Returns true if this Unicode code point is a surrogate.
+CONST_FN static inline bool utf8_is_surrogate(int c) {
+  return c >= 0xD800 && c <= 0xDFFF;
+}
+// Returns true if this Unicode code point is a noncharacter.
+CONST_FN static inline bool utf8_is_noncharacter(int c) {
+  return
+    (c >= 0xFDD0 && c <= 0xFDEF)
+    || ((c & 0xFFFF) == 0xFFFE)
+    || ((c & 0xFFFF) == 0xFFFF);
+}
+// Returns true if this Unicode code point is a control.
+CONST_FN static inline bool utf8_is_control(int c) {
+  return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
+}
 // Initializes a new Utf8Iterator from the given byte buffer. The source does
 // not have to be NUL-terminated, but the length must be passed in explicitly.
@@ -77,20 +92,47 @@ void utf8iterator_init (
 void utf8iterator_next(Utf8Iterator* iter);
 // Returns the current code point as an integer.
-int utf8iterator_current(const Utf8Iterator* iter);
+static inline int utf8iterator_current(const Utf8Iterator* iter) {
+  return iter->_current;
+}
 // Retrieves and fills the output parameter with the current source position.
-void utf8iterator_get_position(
-    const Utf8Iterator* iter, GumboSourcePosition* output);
+static inline void utf8iterator_get_position (
+  const Utf8Iterator* iter,
+  GumboSourcePosition* output
+) {
+  *output = iter->_pos;
+}
+// Retrieves the marked position.
+static inline GumboSourcePosition utf8iterator_get_mark_position (
+  const Utf8Iterator* iter
+) {
+  return iter->_mark_pos;
+}
 // Retrieves a character pointer to the start of the current character.
-const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
+static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
+  return iter->_start;
+}
+// Retrieves the width of the current character.
+static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
+  return iter->_width;
+}
 // Retrieves a character pointer to 1 past the end of the buffer. This is
 // necessary for certain state machines and string comparisons that would like
 // to look directly for ASCII text in the buffer without going through the
 // decoder.
-const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
+static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
+  return iter->_end;
+}
+// Retrieves a character pointer to the marked position.
+static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
+  return iter->_mark;
+}
 // If the upcoming text in the buffer matches the specified prefix (which has
 // length 'length'), consume it and return true. Otherwise, return false with
@@ -114,13 +156,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
 // Returns the current input stream position to the mark.
 void utf8iterator_reset(Utf8Iterator* iter);
-// Sets the position and original text fields of an error to the value at the
-// mark.
-void utf8iterator_fill_error_at_mark (
-  Utf8Iterator* iter,
-  struct GumboInternalError* error
-);
 #ifdef __cplusplus
 }
 #endif

data/lib/nokogumbo.rb CHANGED

@@ -1,7 +1,6 @@
 require 'nokogiri'
 require 'nokogumbo/version'
 require 'nokogumbo/html5'
-require 'nokogumbo/xml/node.rb'
 require 'nokogumbo/nokogumbo'

data/lib/nokogumbo/html5.rb CHANGED

@@ -1,5 +1,6 @@
 require 'nokogumbo/html5/document'
 require 'nokogumbo/html5/document_fragment'
+require 'nokogumbo/html5/node'
 module Nokogiri
   # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
@@ -167,7 +168,7 @@ module Nokogiri
       case current_node.type
       when XML::Node::ELEMENT_NODE
         ns = current_node.namespace
-        ns_uri = ns.nil? ? nil : ns.uri
+        ns_uri = ns.nil? ? nil : ns.href
         # XXX(sfc): attach namespaces to all nodes, even html?
         if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
           tagname = current_node.name

data/lib/nokogumbo/html5/document.rb CHANGED

@@ -3,6 +3,7 @@ module Nokogiri
     class Document < Nokogiri::HTML::Document
       def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
         yield options if block_given?
+	string_or_io = '' unless string_or_io
         if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
           encoding ||= string_or_io.encoding.name
@@ -15,7 +16,7 @@ module Nokogiri
       end
       def self.read_io(io, url = nil, encoding = nil, **options)
-        raise ArgumentError.new("io object doesn't respond to :read") unless io.respon_to?(:read)
+        raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
         do_parse(io, url, encoding, options)
       end
@@ -23,6 +24,16 @@ module Nokogiri
         do_parse(string.to_s, url, encoding, options)
       end
+      def fragment(tags = nil)
+        DocumentFragment.new(self, tags, self.root)
+      end
+      def to_xml(options = {}, &block)
+        # Bypass XML::Document#to_xml which doesn't add
+        # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
+        XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
+      end
       private
       def self.do_parse(string_or_io, url, encoding, options)
         string = HTML5.read_and_encode(string_or_io, encoding)

data/lib/nokogumbo/html5/document_fragment.rb CHANGED

@@ -3,29 +3,19 @@ require 'nokogiri'
 module Nokogiri
   module HTML5
     class DocumentFragment < Nokogiri::HTML::DocumentFragment
+      attr_accessor :document
+      attr_accessor :errors
       # Create a document fragment.
       def initialize(doc, tags = nil, ctx = nil, options = {})
+        self.document = doc
+        self.errors = []
         return self unless tags
-        if ctx
-          raise Argument.new("Fragment parsing with context not supported")
-        else
-          tags = Nokogiri::HTML5.read_and_encode(tags, nil)
-          # Copied from Nokogiri's document_fragment.rb and labled "a horrible
-          # hack."
-          if tags.strip =~ /^<body/i
-            path = "/html/body"
-          else
-            path = "/html/body/node()"
-          end
-          # Add 2 for <html> and <body>.
-          max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
-          options = options.dup
-          options[:max_tree_depth] = max_depth
-          temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
-          temp_doc.xpath(path).each { |child| child.parent = self }
-        self.errors = temp_doc.errors
-        end
+        max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
+        max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
+        tags = Nokogiri::HTML5.read_and_encode(tags, nil)
+        Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
       end
       def serialize(options = {}, &block)
@@ -41,6 +31,31 @@ module Nokogiri
         doc.encoding = 'UTF-8'
         new(doc, tags, nil, options)
       end
+      def extract_params params # :nodoc:
+        handler = params.find do |param|
+          ![Hash, String, Symbol].include?(param.class)
+        end
+        params -= [handler] if handler
+        hashes = []
+        while Hash === params.last || params.last.nil?
+          hashes << params.pop
+          break if params.empty?
+        end
+        ns, binds = hashes.reverse
+        ns ||=
+          begin
+            ns = Hash.new
+            children.each { |child| ns.merge!(child.namespaces) }
+            ns
+          end
+        [params, handler, ns, binds]
+      end
     end
   end
 end
+# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:

data/lib/nokogumbo/{xml → html5}/node.rb RENAMED

@@ -1,57 +1,72 @@
 require 'nokogiri'
 module Nokogiri
-  # Monkey patch
-  module XML
-    class Node
+  module HTML5
+    module Node
       # HTML elements can have attributes that contain colons.
       # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
       # and tries to create an attribute in a namespace. This is especially
       # annoying with attribute names like xml:lang since libxml2 will
       # actually create the xml namespace if it doesn't exist already.
-      define_method(:add_child_node_and_reparent_attrs) do |node|
+      def add_child_node_and_reparent_attrs(node)
+        return super(node) unless document.is_a?(HTML5::Document)
+        # I'm not sure what this method is supposed to do. Reparenting
+        # namespaces is handled by libxml2, including child namespaces which
+        # this method wouldn't handle.
+        # https://github.com/sparklemotion/nokogiri/issues/1790
         add_child_node(node)
-        node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
-          attr.remove
-          node[attr.name] = attr.value
-        end
+        #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
+        #  attr.remove
+        #  ns = attr.namespace
+        #  a["#{ns.prefix}:#{attr.name}"] = attr.value
+        #end
       end
       def inner_html(options = {})
+        return super(options) unless document.is_a?(HTML5::Document)
         result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
         result << children.map { |child| child.to_html(options) }.join
         result
       end
       def write_to(io, *options)
+        return super(io, *options) unless document.is_a?(HTML5::Document)
         options = options.first.is_a?(Hash) ? options.shift : {}
         encoding = options[:encoding] || options[0]
         if Nokogiri.jruby?
           save_options = options[:save_with] || options[1]
           indent_times = options[:indent] || 0
         else
-          save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
+          save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
           indent_times = options[:indent] || 2
         end
         indent_string = (options[:indent_text] || ' ') * indent_times
-        config = SaveOptions.new(save_options.to_i)
+        config = XML::Node::SaveOptions.new(save_options.to_i)
         yield config if block_given?
         config_options = config.options
-        if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0) || !document.is_a?(HTML5::Document)
+        if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
           # Use Nokogiri's serializing code.
           native_write_to(io, encoding, indent_string, config_options)
         else
           # Serialize including the current node.
           encoding ||= document.encoding || Encoding::UTF_8
           internal_ops = {
-            trailing_nl: config_options & SaveOptions::FORMAT != 0,
             preserve_newline: options[:preserve_newline] || false
           }
-          HTML5.serialize_node_internal(self, io, encoding, options)
+          HTML5.serialize_node_internal(self, io, encoding, internal_ops)
         end
       end
+      def fragment(tags)
+        return super(tags) unless document.is_a?(HTML5::Document)
+        DocumentFragment.new(document, tags, self)
+      end
     end
+    # Monkey patch
+    XML::Node.prepend(HTML5::Node)
   end
 end
+# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:

data/lib/nokogumbo/version.rb CHANGED

@@ -1,3 +1,3 @@
 module Nokogumbo
-  VERSION = "2.0.0-alpha"
+  VERSION = "2.0.0"
 end

metadata CHANGED

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: nokogumbo
 version: !ruby/object:Gem::Version
-  version: 2.0.0.pre.alpha
+  version: 2.0.0
 platform: ruby
 authors:
 - Sam Ruby
@@ -9,22 +9,28 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-08-31 00:00:00.000000000 Z
+date: 2018-10-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
     - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.8.4
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.8'
     - - ">="
       - !ruby/object:Gem::Version
-        version: '0'
+        version: 1.8.4
 description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
   access the result as a Nokogiri parsed document.
 email:
@@ -35,7 +41,6 @@ extensions:
 - ext/nokogumbo/extconf.rb
 extra_rdoc_files: []
 files:
-- CHANGELOG.md
 - LICENSE.txt
 - README.md
 - ext/nokogumbo/extconf.rb
@@ -63,6 +68,8 @@ files:
 - gumbo-parser/src/tag.c
 - gumbo-parser/src/tag_lookup.c
 - gumbo-parser/src/tag_lookup.h
+- gumbo-parser/src/token_buffer.c
+- gumbo-parser/src/token_buffer.h
 - gumbo-parser/src/token_type.h
 - gumbo-parser/src/tokenizer.c
 - gumbo-parser/src/tokenizer.h
@@ -77,8 +84,8 @@ files:
 - lib/nokogumbo/html5.rb
 - lib/nokogumbo/html5/document.rb
 - lib/nokogumbo/html5/document_fragment.rb
+- lib/nokogumbo/html5/node.rb
 - lib/nokogumbo/version.rb
-- lib/nokogumbo/xml/node.rb
 homepage: https://github.com/rubys/nokogumbo/#readme
 licenses:
 - Apache-2.0
@@ -95,12 +102,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '0'
+      version: '2.1'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
-  - - ">"
+  - - ">="
     - !ruby/object:Gem::Version
-      version: 1.3.1
+      version: '0'
 requirements: []
 rubyforge_project:
 rubygems_version: 2.7.6

data/CHANGELOG.md DELETED

@@ -1,56 +0,0 @@
-# Changelog
-All notable changes to Nokogumbo will be documented in this file.
-The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
-and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## [Unreleased]
-### Added
-- Experimental support for errors (it was supported in 1.5.0 but
-  undocumented).
-- Added proper HTML5 serialization.
-- Added option `:max_tree_depth` to control the maximum parse tree depth.
-### Changed
-- Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
-  Nokogumbo. A system version will not be used.
-- The undocumented (but publicly mentioned) `:max_parse_errors` option was renamed to
-  `:max_errors`; `:max_parse_errors` is deprecated and will go away.
-- The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
-  `Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
-  rather than `Nokogiri::HTML::Document` and
-  `Nokogiri::HTML::DocumentFragment`.
-- Changed the top-level API to more closely match Nokogiri's while maintaining
-  backwards compatibility. The new APIs are
-  * `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
-  * `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
-  * `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
-  * `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
-  * `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
-  In all cases, `html` can be a string or an `IO` object (something that
-  responds to `#read`). The `url` parameter is entirely for error reporting,
-  as in Nokogiri. The `encoding` parameter only signals what encoding `html`
-  should have on input; the output `Document` or `DocumentFragment` will be in
-  UTF-8. Currently, the only option supported is `:max_errors`, which controls
-  the maximum number of errors reported by `#errors`.
-### Deprecated
-- `:max_parse_errors`; use `:max_errors`
-### Removed
-### Fixed
-- Fixed documents failing to serialize (via `to_html`) if they contain certain
-  `meta` elements that set the `charset`.
-- Documents are now properly marked as UTF-8 after parsing.
-- Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
-  `<!DOCTYPE html>`.
-- Fixed crash when input contains U+0000 NULL bytes and error reporting is
-  enabled.
-### Security
-- The most recent, released version of Gumbo has a [potential security
-  issue](https://github.com/google/gumbo-parser/pull/375) that could result in
-  a cross-site scripting vulnerability. This has been fixed by integrating
-  Gumbo into Nokogumbo.