nokogumbo 2.0.0 → 2.0.5

This diff shows the content of publicly released package versions as published to their respective registries. It is provided for informational purposes only and reflects the changes between those published versions.
@@ -20,10 +20,7 @@
  Coding conventions specific to this file:
 
  1. Functions that fill in a token should be named emit_*, and should be
- followed immediately by a return from the tokenizer (true if no error
- occurred, false if an error occurred). Sometimes the emit functions
- themselves return a boolean so that they can be combined with the return
- statement; in this case, they should match this convention.
+ followed immediately by a return from the tokenizer.
  2. Functions that shuffle data from temporaries to final API structures
  should be named finish_*, and be called just before the tokenizer exits the
  state that accumulates the temporary.
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
  // text tokens emitted will be GUMBO_TOKEN_CDATA.
  bool _is_in_cdata;
 
- // A flag indicating whether the tokenizer has seen a parse error since the
- // last token was emitted.
- bool _parse_error;
-
  // Certain states (notably character references) may emit two character tokens
  // at once, but the contract for lex() fills in only one token at a time. The
  // extra character is buffered here, and then this is checked on entry to
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
  GumboErrorType type
  ) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
- tokenizer->_parse_error = true;
  GumboError* error = gumbo_add_error(parser);
  if (!error) {
  return;
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
  int codepoint
  ) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
- tokenizer->_parse_error = true;
  GumboError* error = gumbo_add_error(parser);
  if (!error)
  return;
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
  GumboErrorType type
  ) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
- tokenizer->_parse_error = true;
  GumboError* error = gumbo_add_error(parser);
  if (!error)
  return;
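
The hunks above drop the tokenizer's per-token _parse_error flag; parse errors are still recorded through gumbo_add_error and, from Ruby, show up on the parsed document's error list when an error budget is requested. A minimal sketch, assuming the existing max_errors option:

    require 'nokogumbo'

    # Keep up to 10 parse errors instead of the default of 0.
    doc = Nokogiri::HTML5.parse("<p><b>unclosed", max_errors: 10)
    doc.errors.each { |err| puts err }  # collected parse errors, if any
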
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
  original_text->data = tag_state->_original_text;
  original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
  tag_state->_original_text;
- if (original_text->data[original_text->length - 1] == '\r') {
+ if (
+ original_text->length
+ && original_text->data[original_text->length - 1] == '\r'
+ ) {
  // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
  // appended to the end of original text even when it's really the first part
  // of the next character. If we detect this situation, shrink the length of
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
  // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
  static void add_duplicate_attr_error(GumboParser* parser) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
- tokenizer->_parse_error = true;
  GumboError* error = gumbo_add_error(parser);
  if (!error) {
  return;
@@ -788,17 +780,26 @@ static void add_duplicate_attr_error(GumboParser* parser) {
  // the attribute's name. The attribute's value starts out as the empty string
  // (following the "Boolean attributes" section of the spec) and is only
  // overwritten on finish_attribute_value(). If the attribute has already been
- // specified, the new attribute is dropped, a parse error is added, and the
- // function returns false. Otherwise, this returns true.
- static bool finish_attribute_name(GumboParser* parser) {
+ // specified, the new attribute is dropped and a parse error is added
+ static void finish_attribute_name(GumboParser* parser) {
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
  GumboTagState* tag_state = &tokenizer->_tag_state;
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
+
+ int max_attributes = parser->_options->max_attributes;
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
+ gumbo_debug("Attributes limit exceeded.\n");
+ reinitialize_tag_buffer(parser);
+ tag_state->_drop_next_attr_value = true;
+ return;
+ }
+
  // May've been set by a previous attribute without a value; reset it here.
  tag_state->_drop_next_attr_value = false;
  assert(tag_state->_attributes.data);
  assert(tag_state->_attributes.capacity);
 
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
  for (unsigned int i = 0; i < attributes->length; ++i) {
  GumboAttribute* attr = attributes->data[i];
  if (
@@ -813,7 +814,7 @@ static bool finish_attribute_name(GumboParser* parser) {
  add_duplicate_attr_error(parser);
  reinitialize_tag_buffer(parser);
  tag_state->_drop_next_attr_value = true;
- return false;
+ return;
  }
  }
 
@@ -835,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
  );
  gumbo_vector_add(attr, attributes);
  reinitialize_tag_buffer(parser);
- return true;
  }
 
  // Finishes an attribute value. This sets the value of the most recently added
@@ -881,7 +881,6 @@ void gumbo_tokenizer_state_init (
  tokenizer->_reconsume_current_input = false;
  tokenizer->_is_adjusted_current_node_foreign = false;
  tokenizer->_is_in_cdata = false;
- tokenizer->_parse_error = false;
  tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
  tokenizer->_tag_state._name = NULL;
 
@@ -891,9 +890,9 @@ void gumbo_tokenizer_state_init (
 
  mark_tag_state_as_empty(&tokenizer->_tag_state);
 
- tokenizer->_token_start = text;
  utf8iterator_init(parser, text, text_length, &tokenizer->_input);
  utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
+ tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
  doc_type_state_init(parser);
  }
 
@@ -3373,7 +3372,7 @@ static GumboLexerStateFunction dispatch_table[] = {
  [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
  };
 
- bool gumbo_lex(GumboParser* parser, GumboToken* output) {
+ void gumbo_lex(GumboParser* parser, GumboToken* output) {
  // Because of the spec requirements that...
  //
  // 1. Tokens be handled immediately by the parser upon emission.
@@ -3398,15 +3397,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  // isn't consumed twice.
  tokenizer->_reconsume_current_input = false;
  tokenizer->_buffered_emit_char = kGumboNoChar;
- return true;
+ return;
  }
 
  if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
- // Return no error.
- return true;
+ return;
  }
 
- tokenizer->_parse_error = false;
  while (1) {
  assert(!tokenizer->_resume_pos);
  assert(tokenizer->_buffered_emit_char == kGumboNoChar);
@@ -3420,7 +3417,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
  tokenizer->_reconsume_current_input = false;
 
  if (result == EMIT_TOKEN)
- return !tokenizer->_parse_error;
+ return;
 
  if (should_advance) {
  utf8iterator_next(&tokenizer->_input);
@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
  );
 
  // Lexes a single token from the specified buffer, filling the output with the
- // parsed GumboToken data structure. Returns true for a successful
- // tokenization, false if a parse error occurs.
- //
- // Example:
- // struct GumboInternalParser parser;
- // GumboToken output;
- // gumbo_tokenizer_state_init(&parser, text, strlen(text));
- // while (gumbo_lex(&parser, &output)) {
- // ...do stuff with output.
- // gumbo_token_destroy(&token);
- // }
- // gumbo_tokenizer_state_destroy(&parser);
- bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
+ // parsed GumboToken data structure.
+ void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
 
  // Frees the internally-allocated pointers within a GumboToken. Note that this
  // doesn't free the token itself, since oftentimes it will be allocated on the
@@ -193,6 +193,11 @@ void utf8iterator_init (
  iter->_pos.offset = 0;
  iter->_parser = parser;
  read_char(iter);
+ if (iter->_current == kUtf8BomChar) {
+ iter->_start += iter->_width;
+ iter->_pos.offset += iter->_width;
+ read_char(iter);
+ }
  }
 
  void utf8iterator_next(Utf8Iterator* iter) {
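
With kUtf8BomChar defined below and the iterator skipping it during initialization, a byte-order mark at the very start of the input should no longer reach the tokenizer. An illustrative check (not taken from the package's tests):

    require 'nokogumbo'

    # A leading U+FEFF is consumed by the UTF-8 iterator, so it should not
    # show up as text in the parsed document.
    doc = Nokogiri::HTML5.parse("\uFEFF<!DOCTYPE html><p>hi</p>")
    doc.at('p').text  # expected: "hi"
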
@@ -31,6 +31,7 @@ struct GumboInternalParser;
 
  // Unicode replacement char.
  #define kUtf8ReplacementChar 0xFFFD
+ #define kUtf8BomChar 0xFEFF
  #define kUtf8MaxChar 0x10FFFF
 
  typedef struct GumboInternalUtf8Iterator {
data/lib/nokogumbo.rb CHANGED
@@ -1,14 +1,27 @@
  require 'nokogiri'
- require 'nokogumbo/version'
- require 'nokogumbo/html5'
 
- require 'nokogumbo/nokogumbo'
+ if ((defined?(Nokogiri::HTML5) && Nokogiri::HTML5.respond_to?(:parse)) &&
+ (defined?(Nokogiri::Gumbo) && Nokogiri::Gumbo.respond_to?(:parse)) &&
+ !(ENV.key?("NOKOGUMBO_IGNORE_NOKOGIRI_HTML5") && ENV["NOKOGUMBO_IGNORE_NOKOGIRI_HTML5"] != "false"))
+
+ warn "NOTE: nokogumbo: Using Nokogiri::HTML5 provided by Nokogiri. See https://github.com/sparklemotion/nokogiri/issues/2205 for more information."
+
+ ::Nokogumbo = ::Nokogiri::Gumbo
+ else
+ require 'nokogumbo/html5'
+ require 'nokogumbo/nokogumbo'
 
- module Nokogumbo
- # The default maximum number of errors for parsing a document or a fragment.
- DEFAULT_MAX_ERRORS = 0
+ module Nokogumbo
+ # The default maximum number of attributes per element.
+ DEFAULT_MAX_ATTRIBUTES = 400
 
- # The default maximum depth of the DOM tree produced by parsing a document
- # or fragment.
- DEFAULT_MAX_TREE_DEPTH = 400
+ # The default maximum number of errors for parsing a document or a fragment.
+ DEFAULT_MAX_ERRORS = 0
+
+ # The default maximum depth of the DOM tree produced by parsing a document
+ # or fragment.
+ DEFAULT_MAX_TREE_DEPTH = 400
+ end
  end
+
+ require 'nokogumbo/version'
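
When a Nokogiri new enough to ship its own HTML5/Gumbo support is present, requiring nokogumbo now simply points the legacy constant at Nokogiri::Gumbo instead of loading the bundled extension. Roughly:

    require 'nokogumbo'

    # On such a Nokogiri, the old constant is an alias:
    Nokogumbo.equal?(Nokogiri::Gumbo)  # => true
    # Setting NOKOGUMBO_IGNORE_NOKOGIRI_HTML5 to anything but "false" opts out
    # and forces the bundled extension to load instead.
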
@@ -19,7 +19,7 @@ module Nokogiri
 
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
  def self.parse(string, url = nil, encoding = nil, **options, &block)
- Document.parse(string, url, encoding, options, &block)
+ Document.parse(string, url, encoding, **options, &block)
  end
 
  # Parse a fragment from +string+. Convenience method for
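
The double splat forwards the options as keyword arguments, which matters on Ruby 3.0 where a trailing hash is no longer implicitly converted to keywords. The call site is unchanged for users; a sketch:

    html = "<!DOCTYPE html><p>hi</p>"
    doc = Nokogiri::HTML5.parse(html, max_errors: 10, max_tree_depth: 100)
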
@@ -92,19 +92,20 @@ module Nokogiri
  if encoding.nil?
  string = string.read
  else
- string = string.read(encoding: encoding)
+ string = string.read(encoding: encoding)
  end
  else
  # Otherwise the string has the given encoding.
- if encoding && string.respond_to?(:force_encoding)
+ string = string.to_s
+ if encoding
  string = string.dup
  string.force_encoding(encoding)
  end
  end
 
- # convert to UTF-8 (Ruby 1.9+)
- if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
- string = reencode(string.dup)
+ # convert to UTF-8
+ if string.encoding != Encoding::UTF_8
+ string = reencode(string)
  end
  string
  end
@@ -123,18 +124,17 @@ module Nokogiri
  # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
  #
  def self.reencode(body, content_type=nil)
- return body unless body.respond_to? :encoding
-
  if body.encoding == Encoding::ASCII_8BIT
  encoding = nil
 
  # look for a Byte Order Mark (BOM)
- if body[0..1] == "\xFE\xFF"
- encoding = 'utf-16be'
- elsif body[0..1] == "\xFF\xFE"
- encoding = 'utf-16le'
- elsif body[0..2] == "\xEF\xBB\xBF"
- encoding = 'utf-8'
+ initial_bytes = body[0..2].bytes
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
+ encoding = Encoding::UTF_8
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
+ encoding = Encoding::UTF_16BE
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
+ encoding = Encoding::UTF_16LE
  end
 
  # look for a charset in a content-encoding header
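
Comparing byte values instead of string literals keeps the BOM sniff independent of the binary body's encoding, and the detected value is now an Encoding object rather than a name. A quick illustration of the UTF-8 case, with a made-up body:

    body = "\xEF\xBB\xBF<!DOCTYPE html><p>hi</p>".b  # ASCII-8BIT body with a UTF-8 BOM
    body[0..2].bytes == [0xEF, 0xBB, 0xBF]           # => true, so reencode picks Encoding::UTF_8
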
@@ -154,6 +154,7 @@ module Nokogiri
  encoding ||= Encoding::ISO_8859_1
 
  # change the encoding to match the detected or inferred encoding
+ body = body.dup
  begin
  body.force_encoding(encoding)
  rescue ArgumentError
@@ -12,6 +12,9 @@ module Nokogiri
  if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
  url ||= string_or_io.path
  end
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
+ raise ArgumentError.new("not a string or IO object")
+ end
  do_parse(string_or_io, url, encoding, options)
  end
 
@@ -21,7 +24,8 @@ module Nokogiri
  end
 
  def self.read_memory(string, url = nil, encoding = nil, **options)
- do_parse(string.to_s, url, encoding, options)
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
+ do_parse(string, url, encoding, options)
  end
 
  def fragment(tags = nil)
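
With the guards added in the two hunks above, inputs that are neither IO-like nor string-like are now rejected up front instead of being coerced with to_s. For example:

    Nokogiri::HTML5::Document.parse(42)
    # => ArgumentError: not a string or IO object
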
@@ -37,9 +41,10 @@ module Nokogiri
  private
  def self.do_parse(string_or_io, url, encoding, options)
  string = HTML5.read_and_encode(string_or_io, encoding)
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
  max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
- doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
+ doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
  doc.encoding = 'UTF-8'
  doc
  end
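
This wires the new max_attributes option (defaulting to Nokogumbo::DEFAULT_MAX_ATTRIBUTES, 400) through to the C parser; judging from the tokenizer hunk above, a negative value disables the check. A sketch of the option in use:

    html = "<!DOCTYPE html><div a='1' b='2'>x</div>"

    # Raise the per-element attribute limit...
    doc = Nokogiri::HTML5.parse(html, max_attributes: 1000)

    # ...or pass a negative value to turn the limit off entirely.
    doc = Nokogiri::HTML5.parse(html, max_attributes: -1)
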
@@ -12,10 +12,11 @@ module Nokogiri
  self.errors = []
  return self unless tags
 
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
  max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
  tags = Nokogiri::HTML5.read_and_encode(tags, nil)
- Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
+ Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
  end
 
  def serialize(options = {}, &block)
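
Fragment parsing takes the same limit; assuming the options pass through Nokogiri::HTML5.fragment as they do for documents:

    frag = Nokogiri::HTML5.fragment("<span a='1' b='2'>x</span>", max_attributes: 10)
    frag.errors  # parse errors, if any were requested via max_errors
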
@@ -1,3 +1,3 @@
  module Nokogumbo
- VERSION = "2.0.0"
+ VERSION = "2.0.5"
  end
metadata CHANGED
@@ -1,7 +1,7 @@
  --- !ruby/object:Gem::Specification
  name: nokogumbo
  version: !ruby/object:Gem::Version
- version: 2.0.0
+ version: 2.0.5
  platform: ruby
  authors:
  - Sam Ruby
@@ -9,7 +9,7 @@ authors:
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2018-10-04 00:00:00.000000000 Z
+ date: 2021-03-19 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
  name: nokogiri
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
  version: '0'
  requirements: []
- rubyforge_project:
- rubygems_version: 2.7.6
+ rubygems_version: 3.1.4
  signing_key:
  specification_version: 4
  summary: Nokogiri interface to the Gumbo HTML5 parser