nokogumbo 2.0.0.pre.alpha → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -30,7 +30,8 @@ struct GumboInternalError;
30
30
  struct GumboInternalParser;
31
31
 
32
32
  // Unicode replacement char.
33
- extern const int kUtf8ReplacementChar;
33
+ #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8MaxChar 0x10FFFF
34
35
 
35
36
  typedef struct GumboInternalUtf8Iterator {
36
37
  // Points at the start of the code point most recently read into 'current'.
@@ -60,9 +61,23 @@ typedef struct GumboInternalUtf8Iterator {
60
61
  struct GumboInternalParser* _parser;
61
62
  } Utf8Iterator;
62
63
 
63
- // Returns true if this Unicode code point is in the list of characters
64
- // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
65
- bool utf8_is_invalid_code_point(int c) CONST_FN;
64
+ // Returns true if this Unicode code point is a surrogate.
65
+ CONST_FN static inline bool utf8_is_surrogate(int c) {
66
+ return c >= 0xD800 && c <= 0xDFFF;
67
+ }
68
+
69
+ // Returns true if this Unicode code point is a noncharacter.
70
+ CONST_FN static inline bool utf8_is_noncharacter(int c) {
71
+ return
72
+ (c >= 0xFDD0 && c <= 0xFDEF)
73
+ || ((c & 0xFFFF) == 0xFFFE)
74
+ || ((c & 0xFFFF) == 0xFFFF);
75
+ }
76
+
77
+ // Returns true if this Unicode code point is a control.
78
+ CONST_FN static inline bool utf8_is_control(int c) {
79
+ return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
80
+ }
66
81
 
67
82
  // Initializes a new Utf8Iterator from the given byte buffer. The source does
68
83
  // not have to be NUL-terminated, but the length must be passed in explicitly.
@@ -77,20 +92,47 @@ void utf8iterator_init (
77
92
  void utf8iterator_next(Utf8Iterator* iter);
78
93
 
79
94
  // Returns the current code point as an integer.
80
- int utf8iterator_current(const Utf8Iterator* iter);
95
+ static inline int utf8iterator_current(const Utf8Iterator* iter) {
96
+ return iter->_current;
97
+ }
81
98
 
82
99
  // Retrieves and fills the output parameter with the current source position.
83
- void utf8iterator_get_position(
84
- const Utf8Iterator* iter, GumboSourcePosition* output);
100
+ static inline void utf8iterator_get_position (
101
+ const Utf8Iterator* iter,
102
+ GumboSourcePosition* output
103
+ ) {
104
+ *output = iter->_pos;
105
+ }
106
+
107
+ // Retrieves the marked position.
108
+ static inline GumboSourcePosition utf8iterator_get_mark_position (
109
+ const Utf8Iterator* iter
110
+ ) {
111
+ return iter->_mark_pos;
112
+ }
85
113
 
86
114
  // Retrieves a character pointer to the start of the current character.
87
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
115
+ static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
116
+ return iter->_start;
117
+ }
118
+
119
+ // Retrieves the width of the current character.
120
+ static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
121
+ return iter->_width;
122
+ }
88
123
 
89
124
  // Retrieves a character pointer to 1 past the end of the buffer. This is
90
125
  // necessary for certain state machines and string comparisons that would like
91
126
  // to look directly for ASCII text in the buffer without going through the
92
127
  // decoder.
93
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
128
+ static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
129
+ return iter->_end;
130
+ }
131
+
132
+ // Retrieves a character pointer to the marked position.
133
+ static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
134
+ return iter->_mark;
135
+ }
94
136
 
95
137
  // If the upcoming text in the buffer matches the specified prefix (which has
96
138
  // length 'length'), consume it and return true. Otherwise, return false with
@@ -114,13 +156,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
114
156
  // Returns the current input stream position to the mark.
115
157
  void utf8iterator_reset(Utf8Iterator* iter);
116
158
 
117
- // Sets the position and original text fields of an error to the value at the
118
- // mark.
119
- void utf8iterator_fill_error_at_mark (
120
- Utf8Iterator* iter,
121
- struct GumboInternalError* error
122
- );
123
-
124
159
  #ifdef __cplusplus
125
160
  }
126
161
  #endif
@@ -1,7 +1,6 @@
1
1
  require 'nokogiri'
2
2
  require 'nokogumbo/version'
3
3
  require 'nokogumbo/html5'
4
- require 'nokogumbo/xml/node.rb'
5
4
 
6
5
  require 'nokogumbo/nokogumbo'
7
6
 
@@ -1,5 +1,6 @@
1
1
  require 'nokogumbo/html5/document'
2
2
  require 'nokogumbo/html5/document_fragment'
3
+ require 'nokogumbo/html5/node'
3
4
 
4
5
  module Nokogiri
5
6
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
@@ -167,7 +168,7 @@ module Nokogiri
167
168
  case current_node.type
168
169
  when XML::Node::ELEMENT_NODE
169
170
  ns = current_node.namespace
170
- ns_uri = ns.nil? ? nil : ns.uri
171
+ ns_uri = ns.nil? ? nil : ns.href
171
172
  # XXX(sfc): attach namespaces to all nodes, even html?
172
173
  if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
173
174
  tagname = current_node.name
@@ -3,6 +3,7 @@ module Nokogiri
3
3
  class Document < Nokogiri::HTML::Document
4
4
  def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
5
5
  yield options if block_given?
6
+ string_or_io = '' unless string_or_io
6
7
 
7
8
  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
8
9
  encoding ||= string_or_io.encoding.name
@@ -15,7 +16,7 @@ module Nokogiri
15
16
  end
16
17
 
17
18
  def self.read_io(io, url = nil, encoding = nil, **options)
18
- raise ArgumentError.new("io object doesn't respond to :read") unless io.respon_to?(:read)
19
+ raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
19
20
  do_parse(io, url, encoding, options)
20
21
  end
21
22
 
@@ -23,6 +24,16 @@ module Nokogiri
23
24
  do_parse(string.to_s, url, encoding, options)
24
25
  end
25
26
 
27
+ def fragment(tags = nil)
28
+ DocumentFragment.new(self, tags, self.root)
29
+ end
30
+
31
+ def to_xml(options = {}, &block)
32
+ # Bypass XML::Document#to_xml which doesn't add
33
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
34
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
35
+ end
36
+
26
37
  private
27
38
  def self.do_parse(string_or_io, url, encoding, options)
28
39
  string = HTML5.read_and_encode(string_or_io, encoding)
@@ -3,29 +3,19 @@ require 'nokogiri'
3
3
  module Nokogiri
4
4
  module HTML5
5
5
  class DocumentFragment < Nokogiri::HTML::DocumentFragment
6
+ attr_accessor :document
7
+ attr_accessor :errors
8
+
6
9
  # Create a document fragment.
7
10
  def initialize(doc, tags = nil, ctx = nil, options = {})
11
+ self.document = doc
12
+ self.errors = []
8
13
  return self unless tags
9
- if ctx
10
- raise Argument.new("Fragment parsing with context not supported")
11
- else
12
- tags = Nokogiri::HTML5.read_and_encode(tags, nil)
13
-
14
- # Copied from Nokogiri's document_fragment.rb and labeled "a horrible
15
- # hack."
16
- if tags.strip =~ /^<body/i
17
- path = "/html/body"
18
- else
19
- path = "/html/body/node()"
20
- end
21
- # Add 2 for <html> and <body>.
22
- max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
23
- options = options.dup
24
- options[:max_tree_depth] = max_depth
25
- temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
26
- temp_doc.xpath(path).each { |child| child.parent = self }
27
- self.errors = temp_doc.errors
28
- end
14
+
15
+ max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
16
+ max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
17
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
18
+ Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
29
19
  end
30
20
 
31
21
  def serialize(options = {}, &block)
@@ -41,6 +31,31 @@ module Nokogiri
41
31
  doc.encoding = 'UTF-8'
42
32
  new(doc, tags, nil, options)
43
33
  end
34
+
35
+ def extract_params params # :nodoc:
36
+ handler = params.find do |param|
37
+ ![Hash, String, Symbol].include?(param.class)
38
+ end
39
+ params -= [handler] if handler
40
+
41
+ hashes = []
42
+ while Hash === params.last || params.last.nil?
43
+ hashes << params.pop
44
+ break if params.empty?
45
+ end
46
+ ns, binds = hashes.reverse
47
+
48
+ ns ||=
49
+ begin
50
+ ns = Hash.new
51
+ children.each { |child| ns.merge!(child.namespaces) }
52
+ ns
53
+ end
54
+
55
+ [params, handler, ns, binds]
56
+ end
57
+
44
58
  end
45
59
  end
46
60
  end
61
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,57 +1,72 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Nokogiri
4
- # Monkey patch
5
- module XML
6
- class Node
4
+ module HTML5
5
+ module Node
7
6
  # HTML elements can have attributes that contain colons.
8
7
  # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
9
8
  # and tries to create an attribute in a namespace. This is especially
10
9
  # annoying with attribute names like xml:lang since libxml2 will
11
10
  # actually create the xml namespace if it doesn't exist already.
12
- define_method(:add_child_node_and_reparent_attrs) do |node|
11
+ def add_child_node_and_reparent_attrs(node)
12
+ return super(node) unless document.is_a?(HTML5::Document)
13
+ # I'm not sure what this method is supposed to do. Reparenting
14
+ # namespaces is handled by libxml2, including child namespaces which
15
+ # this method wouldn't handle.
16
+ # https://github.com/sparklemotion/nokogiri/issues/1790
13
17
  add_child_node(node)
14
- node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
15
- attr.remove
16
- node[attr.name] = attr.value
17
- end
18
+ #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
19
+ # attr.remove
20
+ # ns = attr.namespace
21
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
22
+ #end
18
23
  end
19
24
 
20
25
  def inner_html(options = {})
26
+ return super(options) unless document.is_a?(HTML5::Document)
21
27
  result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
22
28
  result << children.map { |child| child.to_html(options) }.join
23
29
  result
24
30
  end
25
31
 
26
32
  def write_to(io, *options)
33
+ return super(io, *options) unless document.is_a?(HTML5::Document)
27
34
  options = options.first.is_a?(Hash) ? options.shift : {}
28
35
  encoding = options[:encoding] || options[0]
29
36
  if Nokogiri.jruby?
30
37
  save_options = options[:save_with] || options[1]
31
38
  indent_times = options[:indent] || 0
32
39
  else
33
- save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
40
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
34
41
  indent_times = options[:indent] || 2
35
42
  end
36
43
  indent_string = (options[:indent_text] || ' ') * indent_times
37
44
 
38
- config = SaveOptions.new(save_options.to_i)
45
+ config = XML::Node::SaveOptions.new(save_options.to_i)
39
46
  yield config if block_given?
40
47
 
41
48
  config_options = config.options
42
- if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0) || !document.is_a?(HTML5::Document)
49
+ if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
43
50
  # Use Nokogiri's serializing code.
44
51
  native_write_to(io, encoding, indent_string, config_options)
45
52
  else
46
53
  # Serialize including the current node.
47
54
  encoding ||= document.encoding || Encoding::UTF_8
48
55
  internal_ops = {
49
- trailing_nl: config_options & SaveOptions::FORMAT != 0,
50
56
  preserve_newline: options[:preserve_newline] || false
51
57
  }
52
- HTML5.serialize_node_internal(self, io, encoding, options)
58
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
53
59
  end
54
60
  end
61
+
62
+ def fragment(tags)
63
+ return super(tags) unless document.is_a?(HTML5::Document)
64
+ DocumentFragment.new(document, tags, self)
65
+ end
55
66
  end
67
+ # Monkey patch
68
+ XML::Node.prepend(HTML5::Node)
56
69
  end
57
70
  end
71
+
72
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,3 +1,3 @@
1
1
  module Nokogumbo
2
- VERSION = "2.0.0-alpha"
2
+ VERSION = "2.0.0"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0.pre.alpha
4
+ version: 2.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Ruby
@@ -9,22 +9,28 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-08-31 00:00:00.000000000 Z
12
+ date: 2018-10-04 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
18
21
  - - ">="
19
22
  - !ruby/object:Gem::Version
20
- version: '0'
23
+ version: 1.8.4
21
24
  type: :runtime
22
25
  prerelease: false
23
26
  version_requirements: !ruby/object:Gem::Requirement
24
27
  requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '1.8'
25
31
  - - ">="
26
32
  - !ruby/object:Gem::Version
27
- version: '0'
33
+ version: 1.8.4
28
34
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
29
35
  access the result as a Nokogiri parsed document.
30
36
  email:
@@ -35,7 +41,6 @@ extensions:
35
41
  - ext/nokogumbo/extconf.rb
36
42
  extra_rdoc_files: []
37
43
  files:
38
- - CHANGELOG.md
39
44
  - LICENSE.txt
40
45
  - README.md
41
46
  - ext/nokogumbo/extconf.rb
@@ -63,6 +68,8 @@ files:
63
68
  - gumbo-parser/src/tag.c
64
69
  - gumbo-parser/src/tag_lookup.c
65
70
  - gumbo-parser/src/tag_lookup.h
71
+ - gumbo-parser/src/token_buffer.c
72
+ - gumbo-parser/src/token_buffer.h
66
73
  - gumbo-parser/src/token_type.h
67
74
  - gumbo-parser/src/tokenizer.c
68
75
  - gumbo-parser/src/tokenizer.h
@@ -77,8 +84,8 @@ files:
77
84
  - lib/nokogumbo/html5.rb
78
85
  - lib/nokogumbo/html5/document.rb
79
86
  - lib/nokogumbo/html5/document_fragment.rb
87
+ - lib/nokogumbo/html5/node.rb
80
88
  - lib/nokogumbo/version.rb
81
- - lib/nokogumbo/xml/node.rb
82
89
  homepage: https://github.com/rubys/nokogumbo/#readme
83
90
  licenses:
84
91
  - Apache-2.0
@@ -95,12 +102,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
95
102
  requirements:
96
103
  - - ">="
97
104
  - !ruby/object:Gem::Version
98
- version: '0'
105
+ version: '2.1'
99
106
  required_rubygems_version: !ruby/object:Gem::Requirement
100
107
  requirements:
101
- - - ">"
108
+ - - ">="
102
109
  - !ruby/object:Gem::Version
103
- version: 1.3.1
110
+ version: '0'
104
111
  requirements: []
105
112
  rubyforge_project:
106
113
  rubygems_version: 2.7.6
@@ -1,56 +0,0 @@
1
- # Changelog
2
-
3
- All notable changes to Nokogumbo will be documented in this file.
4
-
5
- The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
6
- and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
7
-
8
- ## [Unreleased]
9
- ### Added
10
- - Experimental support for errors (it was supported in 1.5.0 but
11
- undocumented).
12
- - Added proper HTML5 serialization.
13
- - Added option `:max_tree_depth` to control the maximum parse tree depth.
14
-
15
- ### Changed
16
- - Integrated [Gumbo parser](https://github.com/google/gumbo-parser) into
17
- Nokogumbo. A system version will not be used.
18
- - The undocumented (but publicly mentioned) `:max_parse_errors` was renamed to `:max_errors`;
19
- `:max_parse_errors` is deprecated and will go away.
20
- - The various `#parse` and `#fragment` (and `Nokogiri.HTML5`) methods return
21
- `Nokogiri::HTML5::Document` and `Nokogiri::HTML5::DocumentFragment` classes
22
- rather than `Nokogiri::HTML::Document` and
23
- `Nokogiri::HTML::DocumentFragment`.
24
- - Changed the top-level API to more closely match Nokogiri's while maintaining
25
- backwards compatibility. The new APIs are
26
- * `Nokogiri::HTML5(html, url = nil, encoding = nil, **options, &block)`
27
- * `Nokogiri::HTML5.parse(html, url = nil, encoding = nil, **options, &block)`
28
- * `Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, **options, &block)`
29
- * `Nokogiri::HTML5.fragment(html, encoding = nil, **options)`
30
- * `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, **options)`
31
- In all cases, `html` can be a string or an `IO` object (something that
32
- responds to `#read`). The `url` parameter is entirely for error reporting,
33
- as in Nokogiri. The `encoding` parameter only signals what encoding `html`
34
- should have on input; the output `Document` or `DocumentFragment` will be in
35
- UTF-8. Currently, the only option supported is `:max_errors`, which controls
37
- the maximum number of errors reported by `#errors`.
37
-
38
- ### Deprecated
39
- - `:max_parse_errors`; use `:max_errors`
40
-
41
- ### Removed
42
-
43
- ### Fixed
44
- - Fixed documents failing to serialize (via `to_html`) if they contain certain
45
- `meta` elements that set the `charset`.
46
- - Documents are now properly marked as UTF-8 after parsing.
47
- - Fixed `Nokogiri::HTML5.fragment` reporting an error due to a missing
48
- `<!DOCTYPE html>`.
49
- - Fixed crash when input contains U+0000 NULL bytes and error reporting is
50
- enabled.
51
-
52
- ### Security
53
- - The most recent, released version of Gumbo has a [potential security
54
- issue](https://github.com/google/gumbo-parser/pull/375) that could result in
55
- a cross-site scripting vulnerability. This has been fixed by integrating
56
- Gumbo into Nokogumbo.