nokogumbo 2.0.0.pre.alpha → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -30,7 +30,9 @@ struct GumboInternalError;
30
30
  struct GumboInternalParser;
31
31
 
32
32
  // Unicode replacement char.
33
- extern const int kUtf8ReplacementChar;
33
+ #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8BomChar 0xFEFF
35
+ #define kUtf8MaxChar 0x10FFFF
34
36
 
35
37
  typedef struct GumboInternalUtf8Iterator {
36
38
  // Points at the start of the code point most recently read into 'current'.
@@ -60,9 +62,23 @@ typedef struct GumboInternalUtf8Iterator {
60
62
  struct GumboInternalParser* _parser;
61
63
  } Utf8Iterator;
62
64
 
63
- // Returns true if this Unicode code point is in the list of characters
64
- // forbidden by the HTML5 spec, such as NUL bytes and undefined control chars.
65
- bool utf8_is_invalid_code_point(int c) CONST_FN;
65
+ // Returns true if this Unicode code point is a surrogate.
66
+ CONST_FN static inline bool utf8_is_surrogate(int c) {
67
+ return c >= 0xD800 && c <= 0xDFFF;
68
+ }
69
+
70
+ // Returns true if this Unicode code point is a noncharacter.
71
+ CONST_FN static inline bool utf8_is_noncharacter(int c) {
72
+ return
73
+ (c >= 0xFDD0 && c <= 0xFDEF)
74
+ || ((c & 0xFFFF) == 0xFFFE)
75
+ || ((c & 0xFFFF) == 0xFFFF);
76
+ }
77
+
78
+ // Returns true if this Unicode code point is a control.
79
+ CONST_FN static inline bool utf8_is_control(int c) {
80
+ return ((unsigned int)c < 0x1Fu) || (c >= 0x7F && c <= 0x9F);
81
+ }
66
82
 
67
83
  // Initializes a new Utf8Iterator from the given byte buffer. The source does
68
84
  // not have to be NUL-terminated, but the length must be passed in explicitly.
@@ -77,20 +93,47 @@ void utf8iterator_init (
77
93
  void utf8iterator_next(Utf8Iterator* iter);
78
94
 
79
95
  // Returns the current code point as an integer.
80
- int utf8iterator_current(const Utf8Iterator* iter);
96
+ static inline int utf8iterator_current(const Utf8Iterator* iter) {
97
+ return iter->_current;
98
+ }
81
99
 
82
100
  // Retrieves and fills the output parameter with the current source position.
83
- void utf8iterator_get_position(
84
- const Utf8Iterator* iter, GumboSourcePosition* output);
101
+ static inline void utf8iterator_get_position (
102
+ const Utf8Iterator* iter,
103
+ GumboSourcePosition* output
104
+ ) {
105
+ *output = iter->_pos;
106
+ }
107
+
108
+ // Retrieves the marked position.
109
+ static inline GumboSourcePosition utf8iterator_get_mark_position (
110
+ const Utf8Iterator* iter
111
+ ) {
112
+ return iter->_mark_pos;
113
+ }
85
114
 
86
115
  // Retrieves a character pointer to the start of the current character.
87
- const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter);
116
+ static inline const char* utf8iterator_get_char_pointer(const Utf8Iterator* iter) {
117
+ return iter->_start;
118
+ }
119
+
120
+ // Retrieves the width of the current character.
121
+ static inline size_t utf8iterator_get_width(const Utf8Iterator* iter) {
122
+ return iter->_width;
123
+ }
88
124
 
89
125
  // Retrieves a character pointer to 1 past the end of the buffer. This is
90
126
  // necessary for certain state machines and string comparisons that would like
91
127
  // to look directly for ASCII text in the buffer without going through the
92
128
  // decoder.
93
- const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter);
129
+ static inline const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
130
+ return iter->_end;
131
+ }
132
+
133
+ // Retrieves a character pointer to the marked position.
134
+ static inline const char* utf8iterator_get_mark_pointer(const Utf8Iterator* iter) {
135
+ return iter->_mark;
136
+ }
94
137
 
95
138
  // If the upcoming text in the buffer matches the specified prefix (which has
96
139
  // length 'length'), consume it and return true. Otherwise, return false with
@@ -114,13 +157,6 @@ void utf8iterator_mark(Utf8Iterator* iter);
114
157
  // Returns the current input stream position to the mark.
115
158
  void utf8iterator_reset(Utf8Iterator* iter);
116
159
 
117
- // Sets the position and original text fields of an error to the value at the
118
- // mark.
119
- void utf8iterator_fill_error_at_mark (
120
- Utf8Iterator* iter,
121
- struct GumboInternalError* error
122
- );
123
-
124
160
  #ifdef __cplusplus
125
161
  }
126
162
  #endif
@@ -1,11 +1,13 @@
1
1
  require 'nokogiri'
2
2
  require 'nokogumbo/version'
3
3
  require 'nokogumbo/html5'
4
- require 'nokogumbo/xml/node.rb'
5
4
 
6
5
  require 'nokogumbo/nokogumbo'
7
6
 
8
7
  module Nokogumbo
8
+ # The default maximum number of attributes per element.
9
+ DEFAULT_MAX_ATTRIBUTES = 400
10
+
9
11
  # The default maximum number of errors for parsing a document or a fragment.
10
12
  DEFAULT_MAX_ERRORS = 0
11
13
 
@@ -1,5 +1,6 @@
1
1
  require 'nokogumbo/html5/document'
2
2
  require 'nokogumbo/html5/document_fragment'
3
+ require 'nokogumbo/html5/node'
3
4
 
4
5
  module Nokogiri
5
6
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
@@ -18,7 +19,7 @@ module Nokogiri
18
19
 
19
20
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
20
21
  def self.parse(string, url = nil, encoding = nil, **options, &block)
21
- Document.parse(string, url, encoding, options, &block)
22
+ Document.parse(string, url, encoding, **options, &block)
22
23
  end
23
24
 
24
25
  # Parse a fragment from +string+. Convenience method for
@@ -91,19 +92,20 @@ module Nokogiri
91
92
  if encoding.nil?
92
93
  string = string.read
93
94
  else
94
- string = string.read(encoding: encoding)
95
+ string = string.read(encoding: encoding)
95
96
  end
96
97
  else
97
98
  # Otherwise the string has the given encoding.
98
- if encoding && string.respond_to?(:force_encoding)
99
+ string = string.to_s
100
+ if encoding
99
101
  string = string.dup
100
102
  string.force_encoding(encoding)
101
103
  end
102
104
  end
103
105
 
104
- # convert to UTF-8 (Ruby 1.9+)
105
- if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
106
- string = reencode(string.dup)
106
+ # convert to UTF-8
107
+ if string.encoding != Encoding::UTF_8
108
+ string = reencode(string)
107
109
  end
108
110
  string
109
111
  end
@@ -122,18 +124,17 @@ module Nokogiri
122
124
  # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
123
125
  #
124
126
  def self.reencode(body, content_type=nil)
125
- return body unless body.respond_to? :encoding
126
-
127
127
  if body.encoding == Encoding::ASCII_8BIT
128
128
  encoding = nil
129
129
 
130
130
  # look for a Byte Order Mark (BOM)
131
- if body[0..1] == "\xFE\xFF"
132
- encoding = 'utf-16be'
133
- elsif body[0..1] == "\xFF\xFE"
134
- encoding = 'utf-16le'
135
- elsif body[0..2] == "\xEF\xBB\xBF"
136
- encoding = 'utf-8'
131
+ initial_bytes = body[0..2].bytes
132
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
133
+ encoding = Encoding::UTF_8
134
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
135
+ encoding = Encoding::UTF_16BE
136
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
137
+ encoding = Encoding::UTF_16LE
137
138
  end
138
139
 
139
140
  # look for a charset in a content-encoding header
@@ -153,6 +154,7 @@ module Nokogiri
153
154
  encoding ||= Encoding::ISO_8859_1
154
155
 
155
156
  # change the encoding to match the detected or inferred encoding
157
+ body = body.dup
156
158
  begin
157
159
  body.force_encoding(encoding)
158
160
  rescue ArgumentError
@@ -167,7 +169,7 @@ module Nokogiri
167
169
  case current_node.type
168
170
  when XML::Node::ELEMENT_NODE
169
171
  ns = current_node.namespace
170
- ns_uri = ns.nil? ? nil : ns.uri
172
+ ns_uri = ns.nil? ? nil : ns.href
171
173
  # XXX(sfc): attach namespaces to all nodes, even html?
172
174
  if ns_uri.nil? || ns_uri == HTML_NAMESPACE || ns_uri == MATHML_NAMESPACE || ns_uri == SVG_NAMESPACE
173
175
  tagname = current_node.name
@@ -3,6 +3,7 @@ module Nokogiri
3
3
  class Document < Nokogiri::HTML::Document
4
4
  def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
5
5
  yield options if block_given?
6
+ string_or_io = '' unless string_or_io
6
7
 
7
8
  if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
8
9
  encoding ||= string_or_io.encoding.name
@@ -11,24 +12,39 @@ module Nokogiri
11
12
  if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
12
13
  url ||= string_or_io.path
13
14
  end
15
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
16
+ raise ArgumentError.new("not a string or IO object")
17
+ end
14
18
  do_parse(string_or_io, url, encoding, options)
15
19
  end
16
20
 
17
21
  def self.read_io(io, url = nil, encoding = nil, **options)
18
- raise ArgumentError.new("io object doesn't respond to :read") unless io.respon_to?(:read)
22
+ raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
19
23
  do_parse(io, url, encoding, options)
20
24
  end
21
25
 
22
26
  def self.read_memory(string, url = nil, encoding = nil, **options)
23
- do_parse(string.to_s, url, encoding, options)
27
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
28
+ do_parse(string, url, encoding, options)
29
+ end
30
+
31
+ def fragment(tags = nil)
32
+ DocumentFragment.new(self, tags, self.root)
33
+ end
34
+
35
+ def to_xml(options = {}, &block)
36
+ # Bypass XML::Document#to_xml which doesn't add
37
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
38
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
24
39
  end
25
40
 
26
41
  private
27
42
  def self.do_parse(string_or_io, url, encoding, options)
28
43
  string = HTML5.read_and_encode(string_or_io, encoding)
44
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
29
45
  max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
30
46
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
31
- doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
47
+ doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
32
48
  doc.encoding = 'UTF-8'
33
49
  doc
34
50
  end
@@ -3,29 +3,20 @@ require 'nokogiri'
3
3
  module Nokogiri
4
4
  module HTML5
5
5
  class DocumentFragment < Nokogiri::HTML::DocumentFragment
6
+ attr_accessor :document
7
+ attr_accessor :errors
8
+
6
9
  # Create a document fragment.
7
10
  def initialize(doc, tags = nil, ctx = nil, options = {})
11
+ self.document = doc
12
+ self.errors = []
8
13
  return self unless tags
9
- if ctx
10
- raise Argument.new("Fragment parsing with context not supported")
11
- else
12
- tags = Nokogiri::HTML5.read_and_encode(tags, nil)
13
-
14
- # Copied from Nokogiri's document_fragment.rb and labled "a horrible
15
- # hack."
16
- if tags.strip =~ /^<body/i
17
- path = "/html/body"
18
- else
19
- path = "/html/body/node()"
20
- end
21
- # Add 2 for <html> and <body>.
22
- max_depth = (options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH) + 2
23
- options = options.dup
24
- options[:max_tree_depth] = max_depth
25
- temp_doc = HTML5.parse("<!DOCTYPE html><html><body>#{tags}", options)
26
- temp_doc.xpath(path).each { |child| child.parent = self }
27
- self.errors = temp_doc.errors
28
- end
14
+
15
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
16
+ max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
17
+ max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
18
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
19
+ Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
29
20
  end
30
21
 
31
22
  def serialize(options = {}, &block)
@@ -41,6 +32,31 @@ module Nokogiri
41
32
  doc.encoding = 'UTF-8'
42
33
  new(doc, tags, nil, options)
43
34
  end
35
+
36
+ def extract_params params # :nodoc:
37
+ handler = params.find do |param|
38
+ ![Hash, String, Symbol].include?(param.class)
39
+ end
40
+ params -= [handler] if handler
41
+
42
+ hashes = []
43
+ while Hash === params.last || params.last.nil?
44
+ hashes << params.pop
45
+ break if params.empty?
46
+ end
47
+ ns, binds = hashes.reverse
48
+
49
+ ns ||=
50
+ begin
51
+ ns = Hash.new
52
+ children.each { |child| ns.merge!(child.namespaces) }
53
+ ns
54
+ end
55
+
56
+ [params, handler, ns, binds]
57
+ end
58
+
44
59
  end
45
60
  end
46
61
  end
62
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,57 +1,72 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  module Nokogiri
4
- # Monkey patch
5
- module XML
6
- class Node
4
+ module HTML5
5
+ module Node
7
6
  # HTML elements can have attributes that contain colons.
8
7
  # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
9
8
  # and tries to create an attribute in a namespace. This is especially
10
9
  # annoying with attribute names like xml:lang since libxml2 will
11
10
  # actually create the xml namespace if it doesn't exist already.
12
- define_method(:add_child_node_and_reparent_attrs) do |node|
11
+ def add_child_node_and_reparent_attrs(node)
12
+ return super(node) unless document.is_a?(HTML5::Document)
13
+ # I'm not sure what this method is supposed to do. Reparenting
14
+ # namespaces is handled by libxml2, including child namespaces which
15
+ # this method wouldn't handle.
16
+ # https://github.com/sparklemotion/nokogiri/issues/1790
13
17
  add_child_node(node)
14
- node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
15
- attr.remove
16
- node[attr.name] = attr.value
17
- end
18
+ #node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
19
+ # attr.remove
20
+ # ns = attr.namespace
21
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
22
+ #end
18
23
  end
19
24
 
20
25
  def inner_html(options = {})
26
+ return super(options) unless document.is_a?(HTML5::Document)
21
27
  result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? "\n" : ""
22
28
  result << children.map { |child| child.to_html(options) }.join
23
29
  result
24
30
  end
25
31
 
26
32
  def write_to(io, *options)
33
+ return super(io, *options) unless document.is_a?(HTML5::Document)
27
34
  options = options.first.is_a?(Hash) ? options.shift : {}
28
35
  encoding = options[:encoding] || options[0]
29
36
  if Nokogiri.jruby?
30
37
  save_options = options[:save_with] || options[1]
31
38
  indent_times = options[:indent] || 0
32
39
  else
33
- save_options = options[:save_with] || options[1] || SaveOptions::FORMAT
40
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
34
41
  indent_times = options[:indent] || 2
35
42
  end
36
43
  indent_string = (options[:indent_text] || ' ') * indent_times
37
44
 
38
- config = SaveOptions.new(save_options.to_i)
45
+ config = XML::Node::SaveOptions.new(save_options.to_i)
39
46
  yield config if block_given?
40
47
 
41
48
  config_options = config.options
42
- if (config_options & (SaveOptions::AS_XML | SaveOptions::AS_XHTML) != 0) || !document.is_a?(HTML5::Document)
49
+ if (config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0)
43
50
  # Use Nokogiri's serializing code.
44
51
  native_write_to(io, encoding, indent_string, config_options)
45
52
  else
46
53
  # Serialize including the current node.
47
54
  encoding ||= document.encoding || Encoding::UTF_8
48
55
  internal_ops = {
49
- trailing_nl: config_options & SaveOptions::FORMAT != 0,
50
56
  preserve_newline: options[:preserve_newline] || false
51
57
  }
52
- HTML5.serialize_node_internal(self, io, encoding, options)
58
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
53
59
  end
54
60
  end
61
+
62
+ def fragment(tags)
63
+ return super(tags) unless document.is_a?(HTML5::Document)
64
+ DocumentFragment.new(document, tags, self)
65
+ end
55
66
  end
67
+ # Monkey patch
68
+ XML::Node.prepend(HTML5::Node)
56
69
  end
57
70
  end
71
+
72
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -1,3 +1,3 @@
1
1
  module Nokogumbo
2
- VERSION = "2.0.0-alpha"
2
+ VERSION = "2.0.4"
3
3
  end
metadata CHANGED
@@ -1,30 +1,36 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0.pre.alpha
4
+ version: 2.0.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Ruby
8
8
  - Stephen Checkoway
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2018-08-31 00:00:00.000000000 Z
12
+ date: 2020-11-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
16
16
  requirement: !ruby/object:Gem::Requirement
17
17
  requirements:
18
+ - - "~>"
19
+ - !ruby/object:Gem::Version
20
+ version: '1.8'
18
21
  - - ">="
19
22
  - !ruby/object:Gem::Version
20
- version: '0'
23
+ version: 1.8.4
21
24
  type: :runtime
22
25
  prerelease: false
23
26
  version_requirements: !ruby/object:Gem::Requirement
24
27
  requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '1.8'
25
31
  - - ">="
26
32
  - !ruby/object:Gem::Version
27
- version: '0'
33
+ version: 1.8.4
28
34
  description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
29
35
  access the result as a Nokogiri parsed document.
30
36
  email:
@@ -35,7 +41,6 @@ extensions:
35
41
  - ext/nokogumbo/extconf.rb
36
42
  extra_rdoc_files: []
37
43
  files:
38
- - CHANGELOG.md
39
44
  - LICENSE.txt
40
45
  - README.md
41
46
  - ext/nokogumbo/extconf.rb
@@ -63,6 +68,8 @@ files:
63
68
  - gumbo-parser/src/tag.c
64
69
  - gumbo-parser/src/tag_lookup.c
65
70
  - gumbo-parser/src/tag_lookup.h
71
+ - gumbo-parser/src/token_buffer.c
72
+ - gumbo-parser/src/token_buffer.h
66
73
  - gumbo-parser/src/token_type.h
67
74
  - gumbo-parser/src/tokenizer.c
68
75
  - gumbo-parser/src/tokenizer.h
@@ -77,8 +84,8 @@ files:
77
84
  - lib/nokogumbo/html5.rb
78
85
  - lib/nokogumbo/html5/document.rb
79
86
  - lib/nokogumbo/html5/document_fragment.rb
87
+ - lib/nokogumbo/html5/node.rb
80
88
  - lib/nokogumbo/version.rb
81
- - lib/nokogumbo/xml/node.rb
82
89
  homepage: https://github.com/rubys/nokogumbo/#readme
83
90
  licenses:
84
91
  - Apache-2.0
@@ -87,7 +94,7 @@ metadata:
87
94
  changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
88
95
  homepage_uri: https://github.com/rubys/nokogumbo/#readme
89
96
  source_code_uri: https://github.com/rubys/nokogumbo
90
- post_install_message:
97
+ post_install_message:
91
98
  rdoc_options: []
92
99
  require_paths:
93
100
  - lib
@@ -95,16 +102,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
95
102
  requirements:
96
103
  - - ">="
97
104
  - !ruby/object:Gem::Version
98
- version: '0'
105
+ version: '2.1'
99
106
  required_rubygems_version: !ruby/object:Gem::Requirement
100
107
  requirements:
101
- - - ">"
108
+ - - ">="
102
109
  - !ruby/object:Gem::Version
103
- version: 1.3.1
110
+ version: '0'
104
111
  requirements: []
105
- rubyforge_project:
106
- rubygems_version: 2.7.6
107
- signing_key:
112
+ rubygems_version: 3.1.4
113
+ signing_key:
108
114
  specification_version: 4
109
115
  summary: Nokogiri interface to the Gumbo HTML5 parser
110
116
  test_files: []