nokogumbo 2.0.0 → 2.0.5

@@ -20,10 +20,7 @@
  Coding conventions specific to this file:
 
  1. Functions that fill in a token should be named emit_*, and should be
- followed immediately by a return from the tokenizer (true if no error
- occurred, false if an error occurred). Sometimes the emit functions
- themselves return a boolean so that they can be combined with the return
- statement; in this case, they should match this convention.
+ followed immediately by a return from the tokenizer.
  2. Functions that shuffle data from temporaries to final API structures
  should be named finish_*, and be called just before the tokenizer exits the
  state that accumulates the temporary.
@@ -141,10 +138,6 @@ typedef struct GumboInternalTokenizerState {
   // text tokens emitted will be GUMBO_TOKEN_CDATA.
   bool _is_in_cdata;
 
-  // A flag indicating whether the tokenizer has seen a parse error since the
-  // last token was emitted.
-  bool _parse_error;
-
   // Certain states (notably character references) may emit two character tokens
   // at once, but the contract for lex() fills in only one token at a time. The
   // extra character is buffered here, and then this is checked on entry to
@@ -207,7 +200,6 @@ static void tokenizer_add_parse_error (
   GumboErrorType type
 ) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error) {
     return;
@@ -228,7 +220,6 @@ static void tokenizer_add_char_ref_error (
   int codepoint
 ) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error)
     return;
@@ -248,7 +239,6 @@ static void tokenizer_add_token_parse_error (
   GumboErrorType type
 ) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error)
     return;
@@ -732,7 +722,10 @@ static void copy_over_original_tag_text (
   original_text->data = tag_state->_original_text;
   original_text->length = utf8iterator_get_char_pointer(&tokenizer->_input) -
     tag_state->_original_text;
-  if (original_text->data[original_text->length - 1] == '\r') {
+  if (
+    original_text->length
+    && original_text->data[original_text->length - 1] == '\r'
+  ) {
     // Since \r is skipped by the UTF-8 iterator, it can sometimes end up
     // appended to the end of original text even when it's really the first part
     // of the next character. If we detect this situation, shrink the length of
@@ -770,7 +763,6 @@ static void finish_tag_name(GumboParser* parser) {
 // Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
 static void add_duplicate_attr_error(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
-  tokenizer->_parse_error = true;
   GumboError* error = gumbo_add_error(parser);
   if (!error) {
     return;
@@ -788,17 +780,26 @@ static void add_duplicate_attr_error(GumboParser* parser) {
 // the attribute's name. The attribute's value starts out as the empty string
 // (following the "Boolean attributes" section of the spec) and is only
 // overwritten on finish_attribute_value(). If the attribute has already been
-// specified, the new attribute is dropped, a parse error is added, and the
-// function returns false. Otherwise, this returns true.
-static bool finish_attribute_name(GumboParser* parser) {
+// specified, the new attribute is dropped and a parse error is added
+static void finish_attribute_name(GumboParser* parser) {
   GumboTokenizerState* tokenizer = parser->_tokenizer_state;
   GumboTagState* tag_state = &tokenizer->_tag_state;
+  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
+
+  int max_attributes = parser->_options->max_attributes;
+  if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
+    parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
+    gumbo_debug("Attributes limit exceeded.\n");
+    reinitialize_tag_buffer(parser);
+    tag_state->_drop_next_attr_value = true;
+    return;
+  }
+
   // May've been set by a previous attribute without a value; reset it here.
   tag_state->_drop_next_attr_value = false;
   assert(tag_state->_attributes.data);
   assert(tag_state->_attributes.capacity);
 
-  GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
   for (unsigned int i = 0; i < attributes->length; ++i) {
     GumboAttribute* attr = attributes->data[i];
     if (
@@ -813,7 +814,7 @@ static bool finish_attribute_name(GumboParser* parser) {
       add_duplicate_attr_error(parser);
       reinitialize_tag_buffer(parser);
       tag_state->_drop_next_attr_value = true;
-      return false;
+      return;
     }
   }
 
@@ -835,7 +836,6 @@ static bool finish_attribute_name(GumboParser* parser) {
   );
   gumbo_vector_add(attr, attributes);
   reinitialize_tag_buffer(parser);
-  return true;
 }
 
 // Finishes an attribute value. This sets the value of the most recently added
@@ -881,7 +881,6 @@ void gumbo_tokenizer_state_init (
   tokenizer->_reconsume_current_input = false;
   tokenizer->_is_adjusted_current_node_foreign = false;
   tokenizer->_is_in_cdata = false;
-  tokenizer->_parse_error = false;
   tokenizer->_tag_state._last_start_tag = GUMBO_TAG_LAST;
   tokenizer->_tag_state._name = NULL;
 
@@ -891,9 +890,9 @@ void gumbo_tokenizer_state_init (
 
   mark_tag_state_as_empty(&tokenizer->_tag_state);
 
-  tokenizer->_token_start = text;
   utf8iterator_init(parser, text, text_length, &tokenizer->_input);
   utf8iterator_get_position(&tokenizer->_input, &tokenizer->_token_start_pos);
+  tokenizer->_token_start = utf8iterator_get_char_pointer(&tokenizer->_input);
   doc_type_state_init(parser);
 }
 
@@ -3373,7 +3372,7 @@ static GumboLexerStateFunction dispatch_table[] = {
   [GUMBO_LEX_NUMERIC_CHARACTER_REFERENCE_END] = handle_numeric_character_reference_end_state,
 };
 
-bool gumbo_lex(GumboParser* parser, GumboToken* output) {
+void gumbo_lex(GumboParser* parser, GumboToken* output) {
   // Because of the spec requirements that...
   //
   // 1. Tokens be handled immediately by the parser upon emission.
@@ -3398,15 +3397,13 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
     // isn't consumed twice.
     tokenizer->_reconsume_current_input = false;
     tokenizer->_buffered_emit_char = kGumboNoChar;
-    return true;
+    return;
   }
 
   if (maybe_emit_from_mark(parser, output) == EMIT_TOKEN) {
-    // Return no error.
-    return true;
+    return;
   }
 
-  tokenizer->_parse_error = false;
   while (1) {
     assert(!tokenizer->_resume_pos);
     assert(tokenizer->_buffered_emit_char == kGumboNoChar);
@@ -3420,7 +3417,7 @@ bool gumbo_lex(GumboParser* parser, GumboToken* output) {
     tokenizer->_reconsume_current_input = false;
 
     if (result == EMIT_TOKEN)
-      return !tokenizer->_parse_error;
+      return;
 
     if (should_advance) {
       utf8iterator_next(&tokenizer->_input);
@@ -93,19 +93,8 @@ void gumbo_tokenizer_set_is_adjusted_current_node_foreign (
 );
 
 // Lexes a single token from the specified buffer, filling the output with the
-// parsed GumboToken data structure. Returns true for a successful
-// tokenization, false if a parse error occurs.
-//
-// Example:
-//   struct GumboInternalParser parser;
-//   GumboToken output;
-//   gumbo_tokenizer_state_init(&parser, text, strlen(text));
-//   while (gumbo_lex(&parser, &output)) {
-//     ...do stuff with output.
-//     gumbo_token_destroy(&token);
-//   }
-//   gumbo_tokenizer_state_destroy(&parser);
-bool gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
+// parsed GumboToken data structure.
+void gumbo_lex(struct GumboInternalParser* parser, GumboToken* output);
 
 // Frees the internally-allocated pointers within a GumboToken. Note that this
 // doesn't free the token itself, since oftentimes it will be allocated on the
@@ -193,6 +193,11 @@ void utf8iterator_init (
   iter->_pos.offset = 0;
   iter->_parser = parser;
   read_char(iter);
+  if (iter->_current == kUtf8BomChar) {
+    iter->_start += iter->_width;
+    iter->_pos.offset += iter->_width;
+    read_char(iter);
+  }
 }
 
 void utf8iterator_next(Utf8Iterator* iter) {
@@ -31,6 +31,7 @@ struct GumboInternalParser;
 
 // Unicode replacement char.
 #define kUtf8ReplacementChar 0xFFFD
+#define kUtf8BomChar 0xFEFF
 #define kUtf8MaxChar 0x10FFFF
 
 typedef struct GumboInternalUtf8Iterator {
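
Taken together, the new kUtf8BomChar constant and the utf8iterator_init change make the UTF-8 iterator skip a byte-order mark at the very start of the input instead of handing it to the tokenizer. A minimal Ruby sketch of the assumed user-visible effect (the exact DOM outcome is an inference, not shown in this diff):

    require 'nokogumbo'

    # A leading U+FEFF in the (already UTF-8) input is now skipped by the
    # tokenizer rather than surviving as text in the parsed document.
    doc = Nokogiri::HTML5.parse("\uFEFF<p>hi</p>")
    doc.at('p').text  # expected: "hi", with no BOM character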
data/lib/nokogumbo.rb CHANGED
@@ -1,14 +1,27 @@
 require 'nokogiri'
-require 'nokogumbo/version'
-require 'nokogumbo/html5'
 
-require 'nokogumbo/nokogumbo'
+if ((defined?(Nokogiri::HTML5) && Nokogiri::HTML5.respond_to?(:parse)) &&
+    (defined?(Nokogiri::Gumbo) && Nokogiri::Gumbo.respond_to?(:parse)) &&
+    !(ENV.key?("NOKOGUMBO_IGNORE_NOKOGIRI_HTML5") && ENV["NOKOGUMBO_IGNORE_NOKOGIRI_HTML5"] != "false"))
+
+  warn "NOTE: nokogumbo: Using Nokogiri::HTML5 provided by Nokogiri. See https://github.com/sparklemotion/nokogiri/issues/2205 for more information."
+
+  ::Nokogumbo = ::Nokogiri::Gumbo
+else
+  require 'nokogumbo/html5'
+  require 'nokogumbo/nokogumbo'
 
-module Nokogumbo
-  # The default maximum number of errors for parsing a document or a fragment.
-  DEFAULT_MAX_ERRORS = 0
+  module Nokogumbo
+    # The default maximum number of attributes per element.
+    DEFAULT_MAX_ATTRIBUTES = 400
 
-  # The default maximum depth of the DOM tree produced by parsing a document
-  # or fragment.
-  DEFAULT_MAX_TREE_DEPTH = 400
+    # The default maximum number of errors for parsing a document or a fragment.
+    DEFAULT_MAX_ERRORS = 0
+
+    # The default maximum depth of the DOM tree produced by parsing a document
+    # or fragment.
+    DEFAULT_MAX_TREE_DEPTH = 400
+  end
 end
+
+require 'nokogumbo/version'
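
With the new load logic above, requiring nokogumbo defers to the Nokogiri::HTML5 and Nokogiri::Gumbo implementation bundled with Nokogiri when both respond to :parse, unless NOKOGUMBO_IGNORE_NOKOGIRI_HTML5 is set to something other than "false" in the environment before the require. A small sketch of how a caller might see which implementation was picked; the constant comparison is only an illustration, not a documented API:

    # ENV['NOKOGUMBO_IGNORE_NOKOGIRI_HTML5'] = 'true'   # force the gem's own extension
    require 'nokogumbo'

    if defined?(Nokogiri::Gumbo) && Nokogumbo.equal?(Nokogiri::Gumbo)
      puts 'Nokogumbo is an alias for Nokogiri::Gumbo (parser provided by Nokogiri)'
    else
      puts 'Using the C extension shipped with nokogumbo'
    end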
@@ -19,7 +19,7 @@ module Nokogiri
 
     # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
     def self.parse(string, url = nil, encoding = nil, **options, &block)
-      Document.parse(string, url, encoding, options, &block)
+      Document.parse(string, url, encoding, **options, &block)
     end
 
     # Parse a fragment from +string+. Convenience method for
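
This forwards the caller's keyword options to Document.parse with a double splat instead of passing the hash as an extra positional argument, which keeps per-call options working on Rubies with strict keyword-argument separation. For example, options used elsewhere in this diff can be passed straight through:

    require 'nokogumbo'

    doc = Nokogiri::HTML5.parse('<p title=x>hello</p>', max_errors: 10, max_tree_depth: 100)
    doc.errors.length  # at most 10 recorded parse errors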
@@ -92,19 +92,20 @@ module Nokogiri
         if encoding.nil?
           string = string.read
         else
-           string = string.read(encoding: encoding)
+          string = string.read(encoding: encoding)
         end
       else
         # Otherwise the string has the given encoding.
-        if encoding && string.respond_to?(:force_encoding)
+        string = string.to_s
+        if encoding
           string = string.dup
           string.force_encoding(encoding)
         end
       end
 
-      # convert to UTF-8 (Ruby 1.9+)
-      if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
-        string = reencode(string.dup)
+      # convert to UTF-8
+      if string.encoding != Encoding::UTF_8
+        string = reencode(string)
       end
       string
     end
@@ -123,18 +124,17 @@ module Nokogiri
     # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
     #
     def self.reencode(body, content_type=nil)
-      return body unless body.respond_to? :encoding
-
       if body.encoding == Encoding::ASCII_8BIT
         encoding = nil
 
         # look for a Byte Order Mark (BOM)
-        if body[0..1] == "\xFE\xFF"
-          encoding = 'utf-16be'
-        elsif body[0..1] == "\xFF\xFE"
-          encoding = 'utf-16le'
-        elsif body[0..2] == "\xEF\xBB\xBF"
-          encoding = 'utf-8'
+        initial_bytes = body[0..2].bytes
+        if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
+          encoding = Encoding::UTF_8
+        elsif initial_bytes[0..1] == [0xFE, 0xFF]
+          encoding = Encoding::UTF_16BE
+        elsif initial_bytes[0..1] == [0xFF, 0xFE]
+          encoding = Encoding::UTF_16LE
        end
 
         # look for a charset in a content-encoding header
@@ -154,6 +154,7 @@ module Nokogiri
       encoding ||= Encoding::ISO_8859_1
 
       # change the encoding to match the detected or inferred encoding
+      body = body.dup
       begin
         body.force_encoding(encoding)
       rescue ArgumentError
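
The rewritten BOM detection compares the leading bytes as integers and assigns Encoding constants instead of comparing binary data against string literals, and reencode now dups the body before force_encoding rather than mutating its argument. A standalone sketch of the same detection logic, using a hypothetical body value:

    body = "\xFF\xFEh\x00i\x00".b   # UTF-16LE BOM followed by "hi"

    initial_bytes = body[0..2].bytes
    encoding =
      if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
        Encoding::UTF_8
      elsif initial_bytes[0..1] == [0xFE, 0xFF]
        Encoding::UTF_16BE
      elsif initial_bytes[0..1] == [0xFF, 0xFE]
        Encoding::UTF_16LE
      end
    encoding  # => Encoding::UTF_16LE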
@@ -12,6 +12,9 @@ module Nokogiri
         if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
           url ||= string_or_io.path
         end
+        unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
+          raise ArgumentError.new("not a string or IO object")
+        end
         do_parse(string_or_io, url, encoding, options)
       end
 
@@ -21,7 +24,8 @@ module Nokogiri
       end
 
       def self.read_memory(string, url = nil, encoding = nil, **options)
-        do_parse(string.to_s, url, encoding, options)
+        raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
+        do_parse(string, url, encoding, options)
       end
 
       def fragment(tags = nil)
@@ -37,9 +41,10 @@ module Nokogiri
       private
       def self.do_parse(string_or_io, url, encoding, options)
         string = HTML5.read_and_encode(string_or_io, encoding)
+        max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
         max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
         max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
-        doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
+        doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
         doc.encoding = 'UTF-8'
         doc
       end
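
do_parse now threads a :max_attributes option (defaulting to Nokogumbo::DEFAULT_MAX_ATTRIBUTES, i.e. 400) through to the parser alongside :max_errors and :max_tree_depth. Judging by the C-side check (max_attributes >= 0), a negative value is assumed to disable the limit; how the gem surfaces GUMBO_STATUS_TOO_MANY_ATTRIBUTES is not shown in this diff, so the sketch below only raises the cap:

    require 'nokogumbo'

    # 500 attributes on one element exceeds the default cap of 400.
    huge = '<div ' + (1..500).map { |i| "data-a#{i}=1" }.join(' ') + '></div>'

    doc = Nokogiri::HTML5.parse(huge, max_attributes: 1000)
    doc.at('div').attributes.length  # => 500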
@@ -12,10 +12,11 @@ module Nokogiri
         self.errors = []
         return self unless tags
 
+        max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
         max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
         max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
         tags = Nokogiri::HTML5.read_and_encode(tags, nil)
-        Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
+        Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
       end
 
       def serialize(options = {}, &block)
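
Fragment parsing accepts the same :max_attributes option through its options hash. A short sketch, assuming the Nokogiri::HTML5.fragment convenience method forwards options to this initializer the same way Document.parse does:

    require 'nokogumbo'

    frag = Nokogiri::HTML5.fragment('<span id=a class=b data-c=d>x</span>',
                                    max_attributes: 10)
    frag.at('span').attributes.keys  # => ["id", "class", "data-c"]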
@@ -1,3 +1,3 @@
 module Nokogumbo
-  VERSION = "2.0.0"
+  VERSION = "2.0.5"
 end
metadata CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: nokogumbo
 version: !ruby/object:Gem::Version
-  version: 2.0.0
+  version: 2.0.5
 platform: ruby
 authors:
 - Sam Ruby
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2018-10-04 00:00:00.000000000 Z
+date: 2021-03-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
@@ -109,8 +109,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubyforge_project:
-rubygems_version: 2.7.6
+rubygems_version: 3.1.4
 signing_key:
 specification_version: 4
 summary: Nokogiri interface to the Gumbo HTML5 parser