nokogumbo 2.0.0.pre.alpha → 2.0.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -30,7 +30,9 @@ struct GumboInternalError;
30
30
  struct GumboInternalParser;
31
31
 
32
32
  // Unicode replacement char.
33
- extern const int kUtf8ReplacementChar;
33
+ #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8BomChar 0xFEFF
35
+ #define kUtf8MaxChar 0x10FFFF
34
36
 
35
37
  typedef struct GumboInternalUtf8Iterator {
36
38
  // Points at the start of the code point most recently read into 'current'.
@@ -60,9 +62,23 @@ typedef struct GumboInternalUtf8Iterator {
60
62
  struct GumboInternalParser* _parser;
61
63
  } Utf8Iterator;
62
64
 
63
- // Returns true if this Unicode code point is in the list of characters
64
- // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
65
- bool utf8_is_invalid_code_point(int c) CONST_FN;
65
+ // Returns true if this Unicode code point is a surrogate (U+D800..U+DFFF).
66
+ CONST_FN static inline bool utf8_is_surrogate(int c) {
67
+ return c >= 0xD800 && c <= 0xDFFF;
68
+ }
69
+
70
+ // Returns true if this Unicode code point is a noncharacter: U+FDD0..U+FDEF, or any code point whose low 16 bits are 0xFFFE or 0xFFFF.
71
+ CONST_FN static inline bool utf8_is_noncharacter(int c) {
72
+ return
73
+ (c >= 0xFDD0 && c <= 0xFDEF)
74
+ || ((c & 0xFFFF) == 0xFFFE)
75
+ || ((c & 0xFFFF) == 0xFFFF);
76
+ }
77
+
78
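+ // Returns true if this Unicode code point is a control character: a non-negative value below 0x1F, or a value in 0x7F..0x9F. NOTE(review): the `< 0x1Fu` test excludes U+001F itself, although the C0 control range is U+0000..U+001F inclusive; confirm whether `<=` was intended.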
79
+ CONST_FN static inline bool utf8_is_control(int c) {
80
+ return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
81
+ }
66
82
 
67
83
  // Initializes a new Utf8Iterator from the given byte buffer. The source does
68
84
  // not have to be NUL-terminated, but the length must be passed in explicitly.
@@ -77,20 +93,47 @@ void utf8iterator_init (
77
93
  void utf8iterator_next(Utf8Iterator* iter);
78
94
 
79
95
  // Returns the current code point as an integer.
80
- int utf8iterator_current(const Utf8Iterator* iter);
96
+ static inline int utf8iterator_current(const Utf8Iterator* iter) {
97
+ return iter->_current;
98
+ }
81
99
 
82
100
  // Retrieves and fills the output parameter with the current source position.
83
- void utf8iterator_get_position(
84
- const Utf8Iterator* iter, GumboSourcePosition* output);
101
+ static inline void utf8iterator_get_position (
102
+ const Utf8Iterator* iter,
103
+ GumboSourcePosition* output
104
+ ) {
105
+ *output = iter->_pos;
106
+ }
107
+
108
+ // Retrieves the source position stored for the mark (the iterator's _mark_pos).
109
+ static inline GumboSourcePosition utf8iterator_get_mark_position (
110
+ const Utf8Iterator* iter
111
+ ) {
112
+ return iter->_mark_pos;
113
+ }
85
114
 
86
115
  // Retrieves a character pointer to the start of the current character.
87
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
116
+ static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
117
+ return iter->_start;
118
+ }
119
+
120
+ // Retrieves the width of the current character.
121
+ static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
122
+ return iter->_width;
123
+ }
88
124
 
89
125
  // Retrieves a character pointer to 1 past the end of the buffer. This is
90
126
  // necessary for certain state machines and string comparisons that would like
91
127
  // to look directly for ASCII text in the buffer without going through the
92
128
  // decoder.
93
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
129
+ static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
130
+ return iter->_end;
131
+ }
132
+
133
+ // Retrieves a character pointer to the marked position.
134
+ static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
135
+ return iter->_mark;
136
+ }
94
137
 
95
138
  // If the upcoming text in the buffer matches the specified prefix (which has
96
139
  // length 'length'), consume it and return true. Otherwise, return false with
@@ -114,13 +157,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
114
157
  // Returns the current input stream position to the mark.
115
158
  void utf8iterator_reset(Utf8Iterator* iter);
116
159
 
117
- // Sets the position and original text fields of an error to the value at the
118
- // mark.
119
- void utf8iterator_fill_error_at_mark (
120
- Utf8Iterator* iter,
121
- struct GumboInternalError* error
122
- );
123
-
124
160
  #ifdef __cplusplus
125
161
  }
126
162
  #endif
@@ -1,11 +1,13 @@
1
1
  require 'nokogiri'
2
2
  require 'nokogumbo/version'
3
3
  require 'nokogumbo/html5'
4
- require 'nokogumbo/xml/node.rb'
5
4
 
6
5
  require 'nokogumbo/nokogumbo'
7
6
 
8
7
  module Nokogumbo
8
+ # The default maximum number of attributes per element.
9
+ DEFAULT_MAX_ATTRIBUTES = 400
10
+
9
11
  # The default maximum number of errors for parsing a document or a fragment.
10
12
  DEFAULT_MAX_ERRORS = 0
11
13
 
@@ -1,5 +1,6 @@
1
1
  require 'nokogumbo/html5/document'
2
2
  require 'nokogumbo/html5/document_fragment'
3
+ require 'nokogumbo/html5/node'
3
4
 
4
5
  module Nokogiri
5
6
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
@@ -18,7 +19,7 @@ module Nokogiri
18
19
 
19
20
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
20
21
  def self.parse(string, url = nil, encoding = nil, **options, &block)
21
- Document.parse(string, url, encoding, options, &block)
22
+ Document.parse(string, url, encoding, **options, &block)
22
23
  end
23
24
 
24
25
  # Parse a fragment from +string+. Convenience method for
@@ -91,19 +92,20 @@ module Nokogiri
91
92
  if encoding.nil?
92
93
  string = string.read
93
94
  else
94
- string = string.read(encoding: encoding)
95
+ string = string.read(encoding: encoding)
95
96
  end
96
97
  else
97
98
  # Otherwise the string has the given encoding.
98
- if encoding && string.respond_to?(:force_encoding)
99
+ string = string.to_s
100
+ if encoding
99
101
  string = string.dup
100
102
  string.force_encoding(encoding)
101
103
  end
102
104
  end
103
105
 
104
- # convert to UTF-8 (Ruby 1.9+)
105
- if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
106
- string = reencode(string.dup)
106
+ # convert to UTF-8
107
+ if string.encoding != Encoding::UTF_8
108
+ string = reencode(string)
107
109
  end
108
110
  string
109
111
  end
@@ -122,18 +124,17 @@ module Nokogiri
122
124
  # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
123
125
  #
124
126
  def self.reencode(body, content_type=nil)
125
- return body unless body.respond_to? :encoding
126
-
127
127
  if body.encoding == Encoding::ASCII_8BIT
128
128
  encoding = nil
129
129
 
130
130
  # look for a Byte Order Mark (BOM)
131
- if body[0..1] == "\xFE\xFF"
132
- encoding = 'utf-16be'
133
- elsif body[0..1] == "\xFF\xFE"
134
- encoding = 'utf-16le'
135
- elsif body[0..2] == "\xEF\xBB\xBF"
136
- encoding = 'utf-8'
131
+ initial_bytes = body[0..2].bytes
132
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
133
+ encoding = Encoding::UTF_8
134
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
135
+ encoding = Encoding::UTF_16BE
136
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
137
+ encoding = Encoding::UTF_16LE
137
138
  end
138
139
 
139
140
  # look for a charset in a content-encoding header
@@ -153,6 +154,7 @@ module Nokogiri
153
154
  encoding ||= Encoding::ISO_8859_1
154
155
 
155
156
  # change the encoding to match the detected or inferred encoding
157
+ body = body.dup
156
158
  begin
157
159
  body.force_encoding(encoding)
158
160
  rescue ArgumentError
@@ -167,7 +169,7 @@ module Nokogiri
167
169
  case current_node.type
168
170
  when XML::Node::ELEMENT_NODE
169
171
  ns = current_node.namespace
170
- ns_uri = ns.nil? ? nil : ns.uri
172
+ ns_uri = ns.nil? ? nil : ns.href
171
173
  # XXX(sfc): attach namespaces to all nodes, even html?
172
174
  if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
173
175
  tagname = current_node.name
@@ -3,6 +3,7 @@ module Nokogiri
3
3
  class Document < Nokogiri::HTML::Document
4
4
  def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
5
5
  yield options if block_given?
6
+ string_or_io = '' unless string_or_io
6
7
 
7
8
  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
8
9
  encoding ||= string_or_io.encoding.name
@@ -11,24 +12,39 @@ module Nokogiri
11
12
  if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
12
13
  url ||= string_or_io.path
13
14
  end
15
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
16
+ raise ArgumentError.new("not a string or IO object")
17
+ end
14
18
  do_parse(string_or_io, url, encoding, options)
15
19
  end
16
20
 
17
21
  def self.read_io(io, url = nil, encoding = nil, **options)
18
- raise ArgumentError.new("io object doesn't respond to :read") unless io.respon_to?(:read)
22
+ raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
19
23
  do_parse(io, url, encoding, options)
20
24
  end
21
25
 
22
26
  def self.read_memory(string, url = nil, encoding = nil, **options)
23
- do_parse(string.to_s, url, encoding, options)
27
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
28
+ do_parse(string, url, encoding, options)
29
+ end
30
+
31
+ def fragment(tags = nil)
32
+ DocumentFragment.new(self, tags, self.root)
33
+ end
34
+
35
+ def to_xml(options = {}, &block)
36
+ # Bypass XML::Document#to_xml which doesn't add
37
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
38
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
24
39
  end
25
40
 
26
41
  private
27
42
  def self.do_parse(string_or_io, url, encoding, options)
28
43
  string = HTML5.read_and_encode(string_or_io, encoding)
44
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
29
45
  max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
30
46
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
31
- doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
47
+ doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
32
48
  doc.encoding = 'UTF-8'
33
49
  doc
34
50
  end
@@ -3,29 +3,20 @@ require 'nokogiri'
3
3
  module Nokogiri
4
4
  module HTML5
5
5
  class DocumentFragment < Nokogiri::HTML::DocumentFragment
6
+ attr_accessor :document
7
+ attr_accessor :errors
8
+
6
9
  # Create a document fragment.
7
10
  def initialize(doc, tags = nil, ctx = nil, options = {})
11
+ self.document = doc
12
+ self.errors = []
8
13
  return self unless tags
9
- if ctx
10
- raise Argument.new("Fragment parsing with context not supported")
11
- else
12
- tags = Nokogiri::HTML5.read_and_encode(tags, nil)
13
-
14
- # Copied from Nokogiri's document_fragment.rb and labled "a horrible
15
- # hack."
16
- if tags.strip =~ /^<body/i
17
- path = "/html/body"
18
- else
19
- path = "/html/body/node()"
20
- end
21
- # Add 2 for <html> and <body>.
22
- max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
23
- options = options.dup
24
- options[:max_tree_depth] = max_depth
25
- temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
26
- temp_doc.xpath(path).each { |child| child.parent = self }
27
- self.errors = temp_doc.errors
28
- end
14
+
15
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
16
+ max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
17
+ max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
18
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
19
+ Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
29
20
  end
30
21
 
31
22
  def serialize(options = {}, &block)
@@ -41,6 +32,31 @@ module Nokogiri
41
32
  doc.encoding = 'UTF-8'
42
33
  new(doc, tags, nil, options)
43
34
  end
35
+
36
+ def extract_params params # :nodoc:
37
+ handler = params.find do |param|
38
+ ![Hash, String, Symbol].include?(param.class)
39
+ end
40
+ params -= [handler] if handler
41
+
42
+ hashes = []
43
+ while Hash === params.last || params.last.nil?
44
+ hashes << params.pop
45
+ break if params.empty?
46
+ end
47
+ ns, binds = hashes.reverse
48
+
49
+ ns ||=
50
+ begin
51
+ ns = Hash.new
52
+ children.each { |child| ns.merge!(child.namespaces) }
53
+ ns
54
+ end
55
+
56
+ [params, handler, ns, binds]
57
+ end
58
+
44
59
  end
45
60
  end
46
61
  end
62
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,57 +1,72 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Nokogiri
4
- # Monkey patch
5
- module XML
6
- class Node
4
+ module HTML5
5
+ module Node
7
6
  # HTML elements can have attributes that contain colons.
8
7
  # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
9
8
  # and tries to create an attribute in a namespace. This is especially
10
9
  # annoying with attribute names like xml:lang since libxml2 will
11
10
  # actually create the xml namespace if it doesn't exist already.
12
- define_method(:add_child_node_and_reparent_attrs) do |node|
11
+ def add_child_node_and_reparent_attrs(node)
12
+ return super(node) unless document.is_a?(HTML5::Document)
13
+ # I'm not sure what this method is supposed to do. Reparenting
14
+ # namespaces is handled by libxml2, including child namespaces which
15
+ # this method wouldn't handle.
16
+ # https://github.com/sparklemotion/nokogiri/issues/1790
13
17
  add_child_node(node)
14
- node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
15
- attr.remove
16
- node[attr.name] = attr.value
17
- end
18
+ #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
19
+ # attr.remove
20
+ # ns = attr.namespace
21
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
22
+ #end
18
23
  end
19
24
 
20
25
  def inner_html(options = {})
26
+ return super(options) unless document.is_a?(HTML5::Document)
21
27
  result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
22
28
  result << children.map { |child| child.to_html(options) }.join
23
29
  result
24
30
  end
25
31
 
26
32
  def write_to(io, *options)
33
+ return super(io, *options) unless document.is_a?(HTML5::Document)
27
34
  options = options.first.is_a?(Hash) ? options.shift : {}
28
35
  encoding = options[:encoding] || options[0]
29
36
  if Nokogiri.jruby?
30
37
  save_options = options[:save_with] || options[1]
31
38
  indent_times = options[:indent] || 0
32
39
  else
33
- save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
40
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
34
41
  indent_times = options[:indent] || 2
35
42
  end
36
43
  indent_string = (options[:indent_text] || ' ') * indent_times
37
44
 
38
- config = SaveOptions.new(save_options.to_i)
45
+ config = XML::Node::SaveOptions.new(save_options.to_i)
39
46
  yield config if block_given?
40
47
 
41
48
  config_options = config.options
42
- if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0) || !document.is_a?(HTML5::Document)
49
+ if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
43
50
  # Use Nokogiri's serializing code.
44
51
  native_write_to(io, encoding, indent_string, config_options)
45
52
  else
46
53
  # Serialize including the current node.
47
54
  encoding ||= document.encoding || Encoding::UTF_8
48
55
  internal_ops = {
49
- trailing_nl: config_options & SaveOptions::FORMAT != 0,
50
56
  preserve_newline: options[:preserve_newline] || false
51
57
  }
52
- HTML5.serialize_node_internal(self, io, encoding, options)
58
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
53
59
  end
54
60
  end
61
+
62
+ def fragment(tags)
63
+ return super(tags) unless document.is_a?(HTML5::Document)
64
+ DocumentFragment.new(document, tags, self)
65
+ end
55
66
  end
67
+ # Monkey patch
68
+ XML::Node.prepend(HTML5::Node)
56
69
  end
57
70
  end
71
+
72
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,3 +1,3 @@
1
1
  module Nokogumbo
2
- VERSION = "2.0.0-alpha"
2
+ VERSION = "2.0.4"
3
3
  end
metadata CHANGED
@@ -1,30 +1,36 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0.pre.alpha
4
+ version: 2.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Ruby
8
8
  - Stephen Checkoway
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-08-31 00:00:00.000000000 Z
12
+ date: 2020-11-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
18
21
  - - ">="
19
22
  - !ruby/object:Gem::Version
20
- version: '0'
23
+ version: 1.8.4
21
24
  type: :runtime
22
25
  prerelease: false
23
26
  version_requirements: !ruby/object:Gem::Requirement
24
27
  requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '1.8'
25
31
  - - ">="
26
32
  - !ruby/object:Gem::Version
27
- version: '0'
33
+ version: 1.8.4
28
34
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
29
35
  access the result as a Nokogiri parsed document.
30
36
  email:
@@ -35,7 +41,6 @@ extensions:
35
41
  - ext/nokogumbo/extconf.rb
36
42
  extra_rdoc_files: []
37
43
  files:
38
- - CHANGELOG.md
39
44
  - LICENSE.txt
40
45
  - README.md
41
46
  - ext/nokogumbo/extconf.rb
@@ -63,6 +68,8 @@ files:
63
68
  - gumbo-parser/src/tag.c
64
69
  - gumbo-parser/src/tag_lookup.c
65
70
  - gumbo-parser/src/tag_lookup.h
71
+ - gumbo-parser/src/token_buffer.c
72
+ - gumbo-parser/src/token_buffer.h
66
73
  - gumbo-parser/src/token_type.h
67
74
  - gumbo-parser/src/tokenizer.c
68
75
  - gumbo-parser/src/tokenizer.h
@@ -77,8 +84,8 @@ files:
77
84
  - lib/nokogumbo/html5.rb
78
85
  - lib/nokogumbo/html5/document.rb
79
86
  - lib/nokogumbo/html5/document_fragment.rb
87
+ - lib/nokogumbo/html5/node.rb
80
88
  - lib/nokogumbo/version.rb
81
- - lib/nokogumbo/xml/node.rb
82
89
  homepage: https://github.com/rubys/nokogumbo/#readme
83
90
  licenses:
84
91
  - Apache-2.0
@@ -87,7 +94,7 @@ metadata:
87
94
  changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
88
95
  homepage_uri: https://github.com/rubys/nokogumbo/#readme
89
96
  source_code_uri: https://github.com/rubys/nokogumbo
90
- post_install_message:
97
+ post_install_message:
91
98
  rdoc_options: []
92
99
  require_paths:
93
100
  - lib
@@ -95,16 +102,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
95
102
  requirements:
96
103
  - - ">="
97
104
  - !ruby/object:Gem::Version
98
- version: '0'
105
+ version: '2.1'
99
106
  required_rubygems_version: !ruby/object:Gem::Requirement
100
107
  requirements:
101
- - - ">"
108
+ - - ">="
102
109
  - !ruby/object:Gem::Version
103
- version: 1.3.1
110
+ version: '0'
104
111
  requirements: []
105
- rubyforge_project:
106
- rubygems_version: 2.7.6
107
- signing_key:
112
+ rubygems_version: 3.1.4
113
+ signing_key:
108
114
  specification_version: 4
109
115
  summary: Nokogiri interface to the Gumbo HTML5 parser
110
116
  test_files: []