nokogumbo 2.0.0.pre.alpha → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,8 @@ struct GumboInternalError;
30
30
  struct GumboInternalParser;
31
31
 
32
32
  // Unicode replacement char.
33
- extern const int kUtf8ReplacementChar;
33
+ #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8MaxChar 0x10FFFF
34
35
 
35
36
  typedef struct GumboInternalUtf8Iterator {
36
37
  // Points at the start of the code point most recently read into 'current'.
@@ -60,9 +61,23 @@ typedef struct GumboInternalUtf8Iterator {
60
61
  struct GumboInternalParser* _parser;
61
62
  } Utf8Iterator;
62
63
 
63
- // Returns true if this Unicode code point is in the list of characters
64
- // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
65
- bool utf8_is_invalid_code_point(int c) CONST_FN;
64
+ // Returns true if this Unicode code point is a surrogate.
65
+ CONST_FN static inline bool utf8_is_surrogate(int c) {
66
+ return c >= 0xD800 && c <= 0xDFFF;
67
+ }
68
+
69
+ // Returns true if this Unicode code point is a noncharacter.
70
+ CONST_FN static inline bool utf8_is_noncharacter(int c) {
71
+ return
72
+ (c >= 0xFDD0 && c <= 0xFDEF)
73
+ || ((c & 0xFFFF) == 0xFFFE)
74
+ || ((c & 0xFFFF) == 0xFFFF);
75
+ }
76
+
77
+ // Returns true if this Unicode code point is a control (U+0000..U+001F or U+007F..U+009F, inclusive).
78
+ CONST_FN static inline bool utf8_is_control(int c) {
79
+ return ((unsigned int)c <= 0x1Fu) || (c >= 0x7F && c <= 0x9F);
80
+ }
66
81
 
67
82
  // Initializes a new Utf8Iterator from the given byte buffer. The source does
68
83
  // not have to be NUL-terminated, but the length must be passed in explicitly.
@@ -77,20 +92,47 @@ void utf8iterator_init (
77
92
  void utf8iterator_next(Utf8Iterator* iter);
78
93
 
79
94
  // Returns the current code point as an integer.
80
- int utf8iterator_current(const Utf8Iterator* iter);
95
+ static inline int utf8iterator_current(const Utf8Iterator* iter) {
96
+ return iter->_current;
97
+ }
81
98
 
82
99
  // Retrieves and fills the output parameter with the current source position.
83
- void utf8iterator_get_position(
84
- const Utf8Iterator* iter, GumboSourcePosition* output);
100
+ static inline void utf8iterator_get_position (
101
+ const Utf8Iterator* iter,
102
+ GumboSourcePosition* output
103
+ ) {
104
+ *output = iter->_pos;
105
+ }
106
+
107
+ // Retrieves the marked position.
108
+ static inline GumboSourcePosition utf8iterator_get_mark_position (
109
+ const Utf8Iterator* iter
110
+ ) {
111
+ return iter->_mark_pos;
112
+ }
85
113
 
86
114
  // Retrieves a character pointer to the start of the current character.
87
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
115
+ static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
116
+ return iter->_start;
117
+ }
118
+
119
+ // Retrieves the width of the current character.
120
+ static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
121
+ return iter->_width;
122
+ }
88
123
 
89
124
  // Retrieves a character pointer to 1 past the end of the buffer. This is
90
125
  // necessary for certain state machines and string comparisons that would like
91
126
  // to look directly for ASCII text in the buffer without going through the
92
127
  // decoder.
93
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
128
+ static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
129
+ return iter->_end;
130
+ }
131
+
132
+ // Retrieves a character pointer to the marked position.
133
+ static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
134
+ return iter->_mark;
135
+ }
94
136
 
95
137
  // If the upcoming text in the buffer matches the specified prefix (which has
96
138
  // length 'length'), consume it and return true. Otherwise, return false with
@@ -114,13 +156,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
114
156
  // Returns the current input stream position to the mark.
115
157
  void utf8iterator_reset(Utf8Iterator* iter);
116
158
 
117
- // Sets the position and original text fields of an error to the value at the
118
- // mark.
119
- void utf8iterator_fill_error_at_mark (
120
- Utf8Iterator* iter,
121
- struct GumboInternalError* error
122
- );
123
-
124
159
  #ifdef __cplusplus
125
160
  }
126
161
  #endif
@@ -1,7 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'nokogumbo/version'
3
3
  require 'nokogumbo/html5'
4
- require 'nokogumbo/xml/node.rb'
5
4
 
6
5
  require 'nokogumbo/nokogumbo'
7
6
 
@@ -1,5 +1,6 @@
1
1
  require 'nokogumbo/html5/document'
2
2
  require 'nokogumbo/html5/document_fragment'
3
+ require 'nokogumbo/html5/node'
3
4
 
4
5
  module Nokogiri
5
6
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
@@ -167,7 +168,7 @@ module Nokogiri
167
168
  case current_node.type
168
169
  when XML::Node::ELEMENT_NODE
169
170
  ns = current_node.namespace
170
- ns_uri = ns.nil? ? nil : ns.uri
171
+ ns_uri = ns.nil? ? nil : ns.href
171
172
  # XXX(sfc): attach namespaces to all nodes, even html?
172
173
  if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
173
174
  tagname = current_node.name
@@ -3,6 +3,7 @@ module Nokogiri
3
3
  class Document < Nokogiri::HTML::Document
4
4
  def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
5
5
  yield options if block_given?
6
+ string_or_io = '' unless string_or_io
6
7
 
7
8
  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
8
9
  encoding ||= string_or_io.encoding.name
@@ -15,7 +16,7 @@ module Nokogiri
15
16
  end
16
17
 
17
18
  def self.read_io(io, url = nil, encoding = nil, **options)
18
- raise ArgumentError.new("io object doesn't respond to :read") unless io.respon_to?(:read)
19
+ raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
19
20
  do_parse(io, url, encoding, options)
20
21
  end
21
22
 
@@ -23,6 +24,16 @@ module Nokogiri
23
24
  do_parse(string.to_s, url, encoding, options)
24
25
  end
25
26
 
27
+ def fragment(tags = nil)
28
+ DocumentFragment.new(self, tags, self.root)
29
+ end
30
+
31
+ def to_xml(options = {}, &block)
32
+ # Bypass XML::Document#to_xml which doesn't add
33
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
34
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
35
+ end
36
+
26
37
  private
27
38
  def self.do_parse(string_or_io, url, encoding, options)
28
39
  string = HTML5.read_and_encode(string_or_io, encoding)
@@ -3,29 +3,19 @@ require 'nokogiri'
3
3
  module Nokogiri
4
4
  module HTML5
5
5
  class DocumentFragment < Nokogiri::HTML::DocumentFragment
6
+ attr_accessor :document
7
+ attr_accessor :errors
8
+
6
9
  # Create a document fragment.
7
10
  def initialize(doc, tags = nil, ctx = nil, options = {})
11
+ self.document = doc
12
+ self.errors = []
8
13
  return self unless tags
9
- if ctx
10
- raise Argument.new("Fragment parsing with context not supported")
11
- else
12
- tags = Nokogiri::HTML5.read_and_encode(tags, nil)
13
-
14
- # Copied from Nokogiri's document_fragment.rb and labled "a horrible
15
- # hack."
16
- if tags.strip =~ /^<body/i
17
- path = "/html/body"
18
- else
19
- path = "/html/body/node()"
20
- end
21
- # Add 2 for <html> and <body>.
22
- max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
23
- options = options.dup
24
- options[:max_tree_depth] = max_depth
25
- temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
26
- temp_doc.xpath(path).each { |child| child.parent = self }
27
- self.errors = temp_doc.errors
28
- end
14
+
15
+ max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
16
+ max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
17
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
18
+ Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
29
19
  end
30
20
 
31
21
  def serialize(options = {}, &block)
@@ -41,6 +31,31 @@ module Nokogiri
41
31
  doc.encoding = 'UTF-8'
42
32
  new(doc, tags, nil, options)
43
33
  end
34
+
35
+ def extract_params params # :nodoc:
36
+ handler = params.find do |param|
37
+ ![Hash, String, Symbol].include?(param.class)
38
+ end
39
+ params -= [handler] if handler
40
+
41
+ hashes = []
42
+ while Hash === params.last || params.last.nil?
43
+ hashes << params.pop
44
+ break if params.empty?
45
+ end
46
+ ns, binds = hashes.reverse
47
+
48
+ ns ||=
49
+ begin
50
+ ns = Hash.new
51
+ children.each { |child| ns.merge!(child.namespaces) }
52
+ ns
53
+ end
54
+
55
+ [params, handler, ns, binds]
56
+ end
57
+
44
58
  end
45
59
  end
46
60
  end
61
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,57 +1,72 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Nokogiri
4
- # Monkey patch
5
- module XML
6
- class Node
4
+ module HTML5
5
+ module Node
7
6
  # HTML elements can have attributes that contain colons.
8
7
  # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
9
8
  # and tries to create an attribute in a namespace. This is especially
10
9
  # annoying with attribute names like xml:lang since libxml2 will
11
10
  # actually create the xml namespace if it doesn't exist already.
12
- define_method(:add_child_node_and_reparent_attrs) do |node|
11
+ def add_child_node_and_reparent_attrs(node)
12
+ return super(node) unless document.is_a?(HTML5::Document)
13
+ # I'm not sure what this method is supposed to do. Reparenting
14
+ # namespaces is handled by libxml2, including child namespaces which
15
+ # this method wouldn't handle.
16
+ # https://github.com/sparklemotion/nokogiri/issues/1790
13
17
  add_child_node(node)
14
- node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
15
- attr.remove
16
- node[attr.name] = attr.value
17
- end
18
+ #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
19
+ # attr.remove
20
+ # ns = attr.namespace
21
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
22
+ #end
18
23
  end
19
24
 
20
25
  def inner_html(options = {})
26
+ return super(options) unless document.is_a?(HTML5::Document)
21
27
  result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
22
28
  result << children.map { |child| child.to_html(options) }.join
23
29
  result
24
30
  end
25
31
 
26
32
  def write_to(io, *options)
33
+ return super(io, *options) unless document.is_a?(HTML5::Document)
27
34
  options = options.first.is_a?(Hash) ? options.shift : {}
28
35
  encoding = options[:encoding] || options[0]
29
36
  if Nokogiri.jruby?
30
37
  save_options = options[:save_with] || options[1]
31
38
  indent_times = options[:indent] || 0
32
39
  else
33
- save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
40
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
34
41
  indent_times = options[:indent] || 2
35
42
  end
36
43
  indent_string = (options[:indent_text] || ' ') * indent_times
37
44
 
38
- config = SaveOptions.new(save_options.to_i)
45
+ config = XML::Node::SaveOptions.new(save_options.to_i)
39
46
  yield config if block_given?
40
47
 
41
48
  config_options = config.options
42
- if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0) || !document.is_a?(HTML5::Document)
49
+ if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
43
50
  # Use Nokogiri's serializing code.
44
51
  native_write_to(io, encoding, indent_string, config_options)
45
52
  else
46
53
  # Serialize including the current node.
47
54
  encoding ||= document.encoding || Encoding::UTF_8
48
55
  internal_ops = {
49
- trailing_nl: config_options & SaveOptions::FORMAT != 0,
50
56
  preserve_newline: options[:preserve_newline] || false
51
57
  }
52
- HTML5.serialize_node_internal(self, io, encoding, options)
58
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
53
59
  end
54
60
  end
61
+
62
+ def fragment(tags)
63
+ return super(tags) unless document.is_a?(HTML5::Document)
64
+ DocumentFragment.new(document, tags, self)
65
+ end
55
66
  end
67
+ # Monkey patch
68
+ XML::Node.prepend(HTML5::Node)
56
69
  end
57
70
  end
71
+
72
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,3 +1,3 @@
1
1
  module Nokogumbo
2
- VERSION = "2.0.0-alpha"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0.pre.alpha
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Ruby
@@ -9,22 +9,28 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-08-31 00:00:00.000000000 Z
12
+ date: 2018-10-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
18
21
  - - ">="
19
22
  - !ruby/object:Gem::Version
20
- version: '0'
23
+ version: 1.8.4
21
24
  type: :runtime
22
25
  prerelease: false
23
26
  version_requirements: !ruby/object:Gem::Requirement
24
27
  requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '1.8'
25
31
  - - ">="
26
32
  - !ruby/object:Gem::Version
27
- version: '0'
33
+ version: 1.8.4
28
34
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
29
35
  access the result as a Nokogiri parsed document.
30
36
  email:
@@ -35,7 +41,6 @@ extensions:
35
41
  - ext/nokogumbo/extconf.rb
36
42
  extra_rdoc_files: []
37
43
  files:
38
- - CHANGELOG.md
39
44
  - LICENSE.txt
40
45
  - README.md
41
46
  - ext/nokogumbo/extconf.rb
@@ -63,6 +68,8 @@ files:
63
68
  - gumbo-parser/src/tag.c
64
69
  - gumbo-parser/src/tag_lookup.c
65
70
  - gumbo-parser/src/tag_lookup.h
71
+ - gumbo-parser/src/token_buffer.c
72
+ - gumbo-parser/src/token_buffer.h
66
73
  - gumbo-parser/src/token_type.h
67
74
  - gumbo-parser/src/tokenizer.c
68
75
  - gumbo-parser/src/tokenizer.h
@@ -77,8 +84,8 @@ files:
77
84
  - lib/nokogumbo/html5.rb
78
85
  - lib/nokogumbo/html5/document.rb
79
86
  - lib/nokogumbo/html5/document_fragment.rb
87
+ - lib/nokogumbo/html5/node.rb
80
88
  - lib/nokogumbo/version.rb
81
- - lib/nokogumbo/xml/node.rb
82
89
  homepage: https://github.com/rubys/nokogumbo/#readme
83
90
  licenses:
84
91
  - Apache-2.0
@@ -95,12 +102,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
95
102
  requirements:
96
103
  - - ">="
97
104
  - !ruby/object:Gem::Version
98
- version: '0'
105
+ version: '2.1'
99
106
  required_rubygems_version: !ruby/object:Gem::Requirement
100
107
  requirements:
101
- - - ">"
108
+ - - ">="
102
109
  - !ruby/object:Gem::Version
103
- version: 1.3.1
110
+ version: '0'
104
111
  requirements: []
105
112
  rubyforge_project:
106
113
  rubygems_version: 2.7.6
@@ -1,56 +0,0 @@
1
- # Changelog
2
-
3
- All notable changes to Nokogumbo will be documented in this file.
4
-
5
- The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6
- and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7
-
8
- ## [Unreleased]
9
- ### Added
10
- - Experimental support for errors (it was supported in 1.5.0 but
11
- undocumented).
12
- - Added proper HTML5 serialization.
13
- - Added option `:max_tree_depth` to control the maximum parse tree depth.
14
-
15
- ### Changed
16
- - Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
17
- Nokogumbo. A system version will not be used.
18
- The undocumented (but publicly mentioned) `:max_parse_errors` was renamed to `:max_errors`;
19
- `:max_parse_errors` is deprecated and will go away.
20
- - The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
21
- `Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
22
- rather than `Nokogiri::HTML::Document` and
23
- `Nokogiri::HTML::DocumentFragment`.
24
- - Changed the top-level API to more closely match Nokogiri's while maintaining
25
- backwards compatibility. The new APIs are
26
- * `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
27
- * `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
28
- * `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
29
- * `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
30
- * `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
31
- In all cases, `html` can be a string or an `IO` object (something that
32
- responds to `#read`). The `url` parameter is entirely for error reporting,
33
- as in Nokogiri. The `encoding` parameter only signals what encoding `html`
34
- should have on input; the output `Document` or `DocumentFragment` will be in
35
- UTF-8. Currently, the only option supported is `:max_errors`, which controls
36
- the maximum number of errors reported by `#errors`.
37
-
38
- ### Deprecated
39
- - `:max_parse_errors`; use `:max_errors`
40
-
41
- ### Removed
42
-
43
- ### Fixed
44
- - Fixed documents failing to serialize (via `to_html`) if they contain certain
45
- `meta` elements that set the `charset`.
46
- - Documents are now properly marked as UTF-8 after parsing.
47
- - Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
48
- `<!DOCTYPE html>`.
49
- - Fixed crash when input contains U+0000 NULL bytes and error reporting is
50
- enabled.
51
-
52
- ### Security
53
- - The most recent, released version of Gumbo has a [potential security
54
- issue](https://github.com/google/gumbo-parser/pull/375) that could result in
55
- a cross-site scripting vulnerability. This has been fixed by integrating
56
- Gumbo into Nokogumbo.