nokogumbo 2.0.2 → 2.0.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dbc23d3b7a9665b48d0516f523756407ca46733286e89edaf2b5b01b05820ffd
4
- data.tar.gz: 9d777f65fe5170fe66fde53dd9fa0d2904e2ca2d73d25d6fd240fc383cb20804
3
+ metadata.gz: e4694cf3eefbeee2a55cd4bb355b7ec6159c64eac4454dff02b1fbf7e5e8375a
4
+ data.tar.gz: 67832a7c26148f59755360758fcc0b0c1969949bf1e5a1b27f5cabe4b9e8b40b
5
5
  SHA512:
6
- metadata.gz: 602ecadc3afd998eb380a9d4fed204cdd7fd48e5ee8539e4ae5713289fa65a923d27c4b67835546c048522f1a04a165d87f257f1389c591e41e127be06e98109
7
- data.tar.gz: 88a714a552e5cc6f11b65da15bd9816d21bfc383dd5c68e1b6e8ab82ce842adf3159ad56579428b87b7c1dba002b9e38758494865153285f2563aeae9edeba54
6
+ metadata.gz: 3a415817caaf0c3c03037664bda8ed8aa17cc14419e75672dcaa2e2a7dd6d9a20e6ab59095a2295f90da5e45de2c3d72f9a25557533836d55dc67966fe8c7a14
7
+ data.tar.gz: 8dc8f9f2d55936a63097301dc5eb6fb54ed1e4c274b03cdcd6f45e2b4ac2cdc911a54e8e5838ce468820ebade731a62f7cfc167817528fd0adb415087ce924b6
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
2
 
3
- Nokogumbo provides the ability for a Ruby program to invoke the
4
- [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
3
+ Nokogumbo provides the ability for a Ruby program to invoke
4
+ [our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
5
5
  and to access the result as a
6
6
  [Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
7
7
 
@@ -128,6 +128,22 @@ doc = Nokogiri.HTML5(html)
128
128
  doc = Nokogiri.HTML5(html, max_tree_depth: -1)
129
129
  ```
130
130
 
131
+ ### Attribute limit per element
132
+ The maximum number of attributes per DOM element is configurable by the
133
+ `:max_attributes` option. If a given element would exceed this limit, then an
134
+ [ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
135
+
136
+ This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
137
+ be removed by giving the option `max_attributes: -1`.
138
+
139
+ ``` ruby
140
+ html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
141
+ # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
142
+ doc = Nokogiri.HTML5(html)
143
+ # raises ArgumentError: Attributes per element limit exceeded
144
+ doc = Nokogiri.HTML5(html, max_attributes: -1)
145
+ ```
146
+
131
147
  ## HTML Serialization
132
148
 
133
149
  After parsing HTML, it may be serialized using any of the Nokogiri
@@ -281,6 +281,7 @@ static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
281
281
  switch (output->status) {
282
282
  case GUMBO_STATUS_OK:
283
283
  break;
284
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
284
285
  case GUMBO_STATUS_TREE_TOO_DEEP:
285
286
  gumbo_destroy_output(output);
286
287
  rb_raise(rb_eArgError, "%s", status_string);
@@ -479,19 +480,43 @@ typedef struct {
479
480
  xmlDocPtr doc;
480
481
  } ParseArgs;
481
482
 
482
- static VALUE parse_cleanup(ParseArgs *args) {
483
+ static void parse_args_mark(void *parse_args) {
484
+ ParseArgs *args = parse_args;
485
+ rb_gc_mark_maybe(args->input);
486
+ rb_gc_mark_maybe(args->url_or_frag);
487
+ }
488
+
489
+ // Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
490
+ // wrapper.
491
+ static VALUE wrap_parse_args(ParseArgs *args) {
492
+ return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
493
+ }
494
+
495
+ // Returns the underlying ParseArgs wrapped by wrap_parse_args.
496
+ static ParseArgs *unwrap_parse_args(VALUE obj) {
497
+ ParseArgs *args;
498
+ Data_Get_Struct(obj, ParseArgs, args);
499
+ return args;
500
+ }
501
+
502
+ static VALUE parse_cleanup(VALUE parse_args) {
503
+ ParseArgs *args = unwrap_parse_args(parse_args);
483
504
  gumbo_destroy_output(args->output);
505
+ // Make sure garbage collection doesn't mark the objects as being live based
506
+ // on references from the ParseArgs. This may be unnecessary.
507
+ args->input = Qnil;
508
+ args->url_or_frag = Qnil;
484
509
  if (args->doc != NIL)
485
510
  xmlFreeDoc(args->doc);
486
511
  return Qnil;
487
512
  }
488
513
 
489
-
490
- static VALUE parse_continue(ParseArgs *args);
514
+ static VALUE parse_continue(VALUE parse_args);
491
515
 
492
516
  // Parse a string using gumbo_parse into a Nokogiri document
493
- static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
517
+ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
494
518
  GumboOptions options = kGumboDefaultOptions;
519
+ options.max_attributes = NUM2INT(max_attributes);
495
520
  options.max_errors = NUM2INT(max_errors);
496
521
  options.max_tree_depth = NUM2INT(max_depth);
497
522
 
@@ -502,10 +527,13 @@ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE m
502
527
  .url_or_frag = url,
503
528
  .doc = NIL,
504
529
  };
505
- return rb_ensure(parse_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
530
+ VALUE parse_args = wrap_parse_args(&args);
531
+
532
+ return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args);
506
533
  }
507
534
 
508
- static VALUE parse_continue(ParseArgs *args) {
535
+ static VALUE parse_continue(VALUE parse_args) {
536
+ ParseArgs *args = unwrap_parse_args(parse_args);
509
537
  GumboOutput *output = args->output;
510
538
  xmlDocPtr doc;
511
539
  if (output->document->v.document.has_doctype) {
@@ -563,13 +591,14 @@ static xmlNodePtr extract_xml_node(VALUE node) {
563
591
  #endif
564
592
  }
565
593
 
566
- static VALUE fragment_continue(ParseArgs *args);
594
+ static VALUE fragment_continue(VALUE parse_args);
567
595
 
568
596
  static VALUE fragment (
569
597
  VALUE self,
570
598
  VALUE doc_fragment,
571
599
  VALUE tags,
572
600
  VALUE ctx,
601
+ VALUE max_attributes,
573
602
  VALUE max_errors,
574
603
  VALUE max_depth
575
604
  ) {
@@ -676,6 +705,7 @@ static VALUE fragment (
676
705
  // Perform a fragment parse.
677
706
  int depth = NUM2INT(max_depth);
678
707
  GumboOptions options = kGumboDefaultOptions;
708
+ options.max_attributes = NUM2INT(max_attributes);
679
709
  options.max_errors = NUM2INT(max_errors);
680
710
  // Add one to account for the HTML element.
681
711
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
@@ -692,11 +722,13 @@ static VALUE fragment (
692
722
  .url_or_frag = doc_fragment,
693
723
  .doc = (xmlDocPtr)extract_xml_node(doc),
694
724
  };
695
- rb_ensure(fragment_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
725
+ VALUE parse_args = wrap_parse_args(&args);
726
+ rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
696
727
  return Qnil;
697
728
  }
698
729
 
699
- static VALUE fragment_continue(ParseArgs *args) {
730
+ static VALUE fragment_continue(VALUE parse_args) {
731
+ ParseArgs *args = unwrap_parse_args(parse_args);
700
732
  GumboOutput *output = args->output;
701
733
  VALUE doc_fragment = args->url_or_frag;
702
734
  xmlDocPtr xml_doc = args->doc;
@@ -720,10 +752,15 @@ void Init_nokogumbo() {
720
752
  VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
721
753
  VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
722
754
  cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
755
+ rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
723
756
  cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
757
+ rb_gc_register_mark_object(cNokogiriXmlElement);
724
758
  cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
759
+ rb_gc_register_mark_object(cNokogiriXmlText);
725
760
  cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
761
+ rb_gc_register_mark_object(cNokogiriXmlCData);
726
762
  cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
763
+ rb_gc_register_mark_object(cNokogiriXmlComment);
727
764
 
728
765
  // Interned symbols.
729
766
  new = rb_intern_const("new");
@@ -736,6 +773,7 @@ void Init_nokogumbo() {
736
773
  // Class constants.
737
774
  VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
738
775
  Document = rb_const_get(HTML5, rb_intern_const("Document"));
776
+ rb_gc_register_mark_object(Document);
739
777
 
740
778
  // Interned symbols.
741
779
  internal_subset = rb_intern_const("internal_subset");
@@ -743,8 +781,8 @@ void Init_nokogumbo() {
743
781
 
744
782
  // Define Nokogumbo module with parse and fragment methods.
745
783
  VALUE Gumbo = rb_define_module("Nokogumbo");
746
- rb_define_singleton_method(Gumbo, "parse", parse, 4);
747
- rb_define_singleton_method(Gumbo, "fragment", fragment, 5);
784
+ rb_define_singleton_method(Gumbo, "parse", parse, 5);
785
+ rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
748
786
 
749
787
  // Add private constant for testing.
750
788
  rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
@@ -706,6 +706,15 @@ typedef struct GumboInternalOptions {
706
706
  */
707
707
  bool stop_on_first_error;
708
708
 
709
+ /**
710
+ * Maximum allowed number of attributes per element. If this limit is
711
+ * exceeded, the parser will return early with a partial document and
712
+ * the returned `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
714
+ * Default: `400`.
715
+ */
716
+ int max_attributes;
717
+
709
718
  /**
710
719
  * Maximum allowed depth for the parse tree. If this limit is exceeded,
711
720
  * the parser will return early with a partial document and the returned
@@ -796,6 +805,16 @@ typedef enum {
796
805
  */
797
806
  GUMBO_STATUS_TREE_TOO_DEEP,
798
807
 
808
+ /**
809
+ * Indicates that the maximum number of attributes per element
810
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
811
+ * resulting tree will be a partial document, with no further nodes
812
+ * created after the point where the limit was reached. The partial
813
+ * document may be useful for constructing an error message but
814
+ * typically shouldn't be used for other purposes.
815
+ */
816
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
817
+
799
818
  // Currently unused
800
819
  GUMBO_STATUS_OUT_OF_MEMORY,
801
820
  } GumboOutputStatus;
@@ -48,6 +48,7 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
48
48
  const GumboOptions kGumboDefaultOptions = {
49
49
  .tab_stop = 8,
50
50
  .stop_on_first_error = false,
51
+ .max_attributes = 400,
51
52
  .max_tree_depth = 400,
52
53
  .max_errors = -1,
53
54
  .fragment_context = NULL,
@@ -4858,6 +4859,8 @@ const char* gumbo_status_to_string(GumboOutputStatus status) {
4858
4859
  return "OK";
4859
4860
  case GUMBO_STATUS_OUT_OF_MEMORY:
4860
4861
  return "System allocator returned NULL during parsing";
4862
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
4863
+ return "Attributes per element limit exceeded";
4861
4864
  case GUMBO_STATUS_TREE_TOO_DEEP:
4862
4865
  return "Document tree depth limit exceeded";
4863
4866
  default:
@@ -784,12 +784,22 @@ static void add_duplicate_attr_error(GumboParser* parser) {
784
784
  static void finish_attribute_name(GumboParser* parser) {
785
785
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
786
786
  GumboTagState* tag_state = &tokenizer->_tag_state;
787
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
788
+
789
+ int max_attributes = parser->_options->max_attributes;
790
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
791
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
792
+ gumbo_debug("Attributes limit exceeded.\n");
793
+ reinitialize_tag_buffer(parser);
794
+ tag_state->_drop_next_attr_value = true;
795
+ return;
796
+ }
797
+
787
798
  // May've been set by a previous attribute without a value; reset it here.
788
799
  tag_state->_drop_next_attr_value = false;
789
800
  assert(tag_state->_attributes.data);
790
801
  assert(tag_state->_attributes.capacity);
791
802
 
792
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
793
803
  for (unsigned int i = 0; i < attributes->length; ++i) {
794
804
  GumboAttribute* attr = attributes->data[i];
795
805
  if (
@@ -193,6 +193,11 @@ void utf8iterator_init (
193
193
  iter->_pos.offset = 0;
194
194
  iter->_parser = parser;
195
195
  read_char(iter);
196
+ if (iter->_current == kUtf8BomChar) {
197
+ iter->_start += iter->_width;
198
+ iter->_pos.offset += iter->_width;
199
+ read_char(iter);
200
+ }
196
201
  }
197
202
 
198
203
  void utf8iterator_next(Utf8Iterator* iter) {
@@ -31,6 +31,7 @@ struct GumboInternalParser;
31
31
 
32
32
  // Unicode replacement char.
33
33
  #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8BomChar 0xFEFF
34
35
  #define kUtf8MaxChar 0x10FFFF
35
36
 
36
37
  typedef struct GumboInternalUtf8Iterator {
@@ -5,6 +5,9 @@ require 'nokogumbo/html5'
5
5
  require 'nokogumbo/nokogumbo'
6
6
 
7
7
  module Nokogumbo
8
+ # The default maximum number of attributes per element.
9
+ DEFAULT_MAX_ATTRIBUTES = 400
10
+
8
11
  # The default maximum number of errors for parsing a document or a fragment.
9
12
  DEFAULT_MAX_ERRORS = 0
10
13
 
@@ -19,7 +19,7 @@ module Nokogiri
19
19
 
20
20
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
21
21
  def self.parse(string, url = nil, encoding = nil, **options, &block)
22
- Document.parse(string, url, encoding, options, &block)
22
+ Document.parse(string, url, encoding, **options, &block)
23
23
  end
24
24
 
25
25
  # Parse a fragment from +string+. Convenience method for
@@ -92,19 +92,20 @@ module Nokogiri
92
92
  if encoding.nil?
93
93
  string = string.read
94
94
  else
95
- string = string.read(encoding: encoding)
95
+ string = string.read(encoding: encoding)
96
96
  end
97
97
  else
98
98
  # Otherwise the string has the given encoding.
99
- if encoding && string.respond_to?(:force_encoding)
99
+ string = string.to_str
100
+ if encoding
100
101
  string = string.dup
101
102
  string.force_encoding(encoding)
102
103
  end
103
104
  end
104
105
 
105
- # convert to UTF-8 (Ruby 1.9+)
106
- if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
107
- string = reencode(string.dup)
106
+ # convert to UTF-8
107
+ if string.encoding != Encoding::UTF_8
108
+ string = reencode(string)
108
109
  end
109
110
  string
110
111
  end
@@ -123,18 +124,17 @@ module Nokogiri
123
124
  # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
124
125
  #
125
126
  def self.reencode(body, content_type=nil)
126
- return body unless body.respond_to? :encoding
127
-
128
127
  if body.encoding == Encoding::ASCII_8BIT
129
128
  encoding = nil
130
129
 
131
130
  # look for a Byte Order Mark (BOM)
132
- if body[0..1] == "\xFE\xFF"
133
- encoding = 'utf-16be'
134
- elsif body[0..1] == "\xFF\xFE"
135
- encoding = 'utf-16le'
136
- elsif body[0..2] == "\xEF\xBB\xBF"
137
- encoding = 'utf-8'
131
+ initial_bytes = body[0..2].bytes
132
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
133
+ encoding = Encoding::UTF_8
134
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
135
+ encoding = Encoding::UTF_16BE
136
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
137
+ encoding = Encoding::UTF_16LE
138
138
  end
139
139
 
140
140
  # look for a charset in a content-encoding header
@@ -154,6 +154,7 @@ module Nokogiri
154
154
  encoding ||= Encoding::ISO_8859_1
155
155
 
156
156
  # change the encoding to match the detected or inferred encoding
157
+ body = body.dup
157
158
  begin
158
159
  body.force_encoding(encoding)
159
160
  rescue ArgumentError
@@ -12,6 +12,9 @@ module Nokogiri
12
12
  if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
13
13
  url ||= string_or_io.path
14
14
  end
15
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
16
+ raise ArgumentError.new("not a string or IO object")
17
+ end
15
18
  do_parse(string_or_io, url, encoding, options)
16
19
  end
17
20
 
@@ -21,7 +24,8 @@ module Nokogiri
21
24
  end
22
25
 
23
26
  def self.read_memory(string, url = nil, encoding = nil, **options)
24
- do_parse(string.to_s, url, encoding, options)
27
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
28
+ do_parse(string, url, encoding, options)
25
29
  end
26
30
 
27
31
  def fragment(tags = nil)
@@ -37,9 +41,10 @@ module Nokogiri
37
41
  private
38
42
  def self.do_parse(string_or_io, url, encoding, options)
39
43
  string = HTML5.read_and_encode(string_or_io, encoding)
44
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
40
45
  max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
41
46
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
42
- doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
47
+ doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
43
48
  doc.encoding = 'UTF-8'
44
49
  doc
45
50
  end
@@ -12,10 +12,11 @@ module Nokogiri
12
12
  self.errors = []
13
13
  return self unless tags
14
14
 
15
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
15
16
  max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
16
17
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
17
18
  tags = Nokogiri::HTML5.read_and_encode(tags, nil)
18
- Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
19
+ Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
19
20
  end
20
21
 
21
22
  def serialize(options = {}, &block)
@@ -1,3 +1,3 @@
1
1
  module Nokogumbo
2
- VERSION = "2.0.2"
2
+ VERSION = "2.0.3"
3
3
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Ruby
8
8
  - Stephen Checkoway
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-11-19 00:00:00.000000000 Z
12
+ date: 2020-11-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -94,7 +94,7 @@ metadata:
94
94
  changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
95
95
  homepage_uri: https://github.com/rubys/nokogumbo/#readme
96
96
  source_code_uri: https://github.com/rubys/nokogumbo
97
- post_install_message:
97
+ post_install_message:
98
98
  rdoc_options: []
99
99
  require_paths:
100
100
  - lib
@@ -109,8 +109,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  requirements: []
112
- rubygems_version: 3.0.6
113
- signing_key:
112
+ rubygems_version: 3.1.2
113
+ signing_key:
114
114
  specification_version: 4
115
115
  summary: Nokogiri interface to the Gumbo HTML5 parser
116
116
  test_files: []