nokogumbo 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dbc23d3b7a9665b48d0516f523756407ca46733286e89edaf2b5b01b05820ffd
4
- data.tar.gz: 9d777f65fe5170fe66fde53dd9fa0d2904e2ca2d73d25d6fd240fc383cb20804
3
+ metadata.gz: e4694cf3eefbeee2a55cd4bb355b7ec6159c64eac4454dff02b1fbf7e5e8375a
4
+ data.tar.gz: 67832a7c26148f59755360758fcc0b0c1969949bf1e5a1b27f5cabe4b9e8b40b
5
5
  SHA512:
6
- metadata.gz: 602ecadc3afd998eb380a9d4fed204cdd7fd48e5ee8539e4ae5713289fa65a923d27c4b67835546c048522f1a04a165d87f257f1389c591e41e127be06e98109
7
- data.tar.gz: 88a714a552e5cc6f11b65da15bd9816d21bfc383dd5c68e1b6e8ab82ce842adf3159ad56579428b87b7c1dba002b9e38758494865153285f2563aeae9edeba54
6
+ metadata.gz: 3a415817caaf0c3c03037664bda8ed8aa17cc14419e75672dcaa2e2a7dd6d9a20e6ab59095a2295f90da5e45de2c3d72f9a25557533836d55dc67966fe8c7a14
7
+ data.tar.gz: 8dc8f9f2d55936a63097301dc5eb6fb54ed1e4c274b03cdcd6f45e2b4ac2cdc911a54e8e5838ce468820ebade731a62f7cfc167817528fd0adb415087ce924b6
data/README.md CHANGED
@@ -1,7 +1,7 @@
1
1
  # Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
2
 
3
- Nokogumbo provides the ability for a Ruby program to invoke the
4
- [Gumbo HTML5 parser](https://github.com/google/gumbo-parser#readme)
3
+ Nokogumbo provides the ability for a Ruby program to invoke
4
+ [our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
5
5
  and to access the result as a
6
6
  [Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
7
7
 
@@ -128,6 +128,22 @@ doc = Nokogiri.HTML5(html)
128
128
  doc = Nokogiri.HTML5(html, max_tree_depth: -1)
129
129
  ```
130
130
 
131
+ ### Attribute limit per element
132
+ The maximum number of attributes per DOM element is configurable by the
133
+ `:max_attributes` option. If a given element would exceed this limit, then an
134
+ [ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is thrown.
135
+
136
+ This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
137
+ be removed by giving the option `max_attributes: -1`.
138
+
139
+ ``` ruby
140
+ html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
141
+ # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
142
+ doc = Nokogiri.HTML5(html)
143
+ # raises ArgumentError: Attributes per element limit exceeded
144
+ doc = Nokogiri.HTML5(html, max_attributes: -1)
145
+ ```
146
+
131
147
  ## HTML Serialization
132
148
 
133
149
  After parsing HTML, it may be serialized using any of the Nokogiri
@@ -281,6 +281,7 @@ static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
281
281
  switch (output->status) {
282
282
  case GUMBO_STATUS_OK:
283
283
  break;
284
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
284
285
  case GUMBO_STATUS_TREE_TOO_DEEP:
285
286
  gumbo_destroy_output(output);
286
287
  rb_raise(rb_eArgError, "%s", status_string);
@@ -479,19 +480,43 @@ typedef struct {
479
480
  xmlDocPtr doc;
480
481
  } ParseArgs;
481
482
 
482
- static VALUE parse_cleanup(ParseArgs *args) {
483
+ static void parse_args_mark(void *parse_args) {
484
+ ParseArgs *args = parse_args;
485
+ rb_gc_mark_maybe(args->input);
486
+ rb_gc_mark_maybe(args->url_or_frag);
487
+ }
488
+
489
+ // Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
490
+ // wrapper.
491
+ static VALUE wrap_parse_args(ParseArgs *args) {
492
+ return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
493
+ }
494
+
495
+ // Returns the underlying ParseArgs wrapped by wrap_parse_args.
496
+ static ParseArgs *unwrap_parse_args(VALUE obj) {
497
+ ParseArgs *args;
498
+ Data_Get_Struct(obj, ParseArgs, args);
499
+ return args;
500
+ }
501
+
502
+ static VALUE parse_cleanup(VALUE parse_args) {
503
+ ParseArgs *args = unwrap_parse_args(parse_args);
483
504
  gumbo_destroy_output(args->output);
505
+ // Make sure garbage collection doesn't mark the objects as being live based
506
+ // on references from the ParseArgs. This may be unnecessary.
507
+ args->input = Qnil;
508
+ args->url_or_frag = Qnil;
484
509
  if (args->doc != NIL)
485
510
  xmlFreeDoc(args->doc);
486
511
  return Qnil;
487
512
  }
488
513
 
489
-
490
- static VALUE parse_continue(ParseArgs *args);
514
+ static VALUE parse_continue(VALUE parse_args);
491
515
 
492
516
  // Parse a string using gumbo_parse into a Nokogiri document
493
- static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
517
+ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
494
518
  GumboOptions options = kGumboDefaultOptions;
519
+ options.max_attributes = NUM2INT(max_attributes);
495
520
  options.max_errors = NUM2INT(max_errors);
496
521
  options.max_tree_depth = NUM2INT(max_depth);
497
522
 
@@ -502,10 +527,13 @@ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE m
502
527
  .url_or_frag = url,
503
528
  .doc = NIL,
504
529
  };
505
- return rb_ensure(parse_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
530
+ VALUE parse_args = wrap_parse_args(&args);
531
+
532
+ return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args);
506
533
  }
507
534
 
508
- static VALUE parse_continue(ParseArgs *args) {
535
+ static VALUE parse_continue(VALUE parse_args) {
536
+ ParseArgs *args = unwrap_parse_args(parse_args);
509
537
  GumboOutput *output = args->output;
510
538
  xmlDocPtr doc;
511
539
  if (output->document->v.document.has_doctype) {
@@ -563,13 +591,14 @@ static xmlNodePtr extract_xml_node(VALUE node) {
563
591
  #endif
564
592
  }
565
593
 
566
- static VALUE fragment_continue(ParseArgs *args);
594
+ static VALUE fragment_continue(VALUE parse_args);
567
595
 
568
596
  static VALUE fragment (
569
597
  VALUE self,
570
598
  VALUE doc_fragment,
571
599
  VALUE tags,
572
600
  VALUE ctx,
601
+ VALUE max_attributes,
573
602
  VALUE max_errors,
574
603
  VALUE max_depth
575
604
  ) {
@@ -676,6 +705,7 @@ static VALUE fragment (
676
705
  // Perform a fragment parse.
677
706
  int depth = NUM2INT(max_depth);
678
707
  GumboOptions options = kGumboDefaultOptions;
708
+ options.max_attributes = NUM2INT(max_attributes);
679
709
  options.max_errors = NUM2INT(max_errors);
680
710
  // Add one to account for the HTML element.
681
711
  options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
@@ -692,11 +722,13 @@ static VALUE fragment (
692
722
  .url_or_frag = doc_fragment,
693
723
  .doc = (xmlDocPtr)extract_xml_node(doc),
694
724
  };
695
- rb_ensure(fragment_continue, (VALUE)&args, parse_cleanup, (VALUE)&args);
725
+ VALUE parse_args = wrap_parse_args(&args);
726
+ rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
696
727
  return Qnil;
697
728
  }
698
729
 
699
- static VALUE fragment_continue(ParseArgs *args) {
730
+ static VALUE fragment_continue(VALUE parse_args) {
731
+ ParseArgs *args = unwrap_parse_args(parse_args);
700
732
  GumboOutput *output = args->output;
701
733
  VALUE doc_fragment = args->url_or_frag;
702
734
  xmlDocPtr xml_doc = args->doc;
@@ -720,10 +752,15 @@ void Init_nokogumbo() {
720
752
  VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
721
753
  VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
722
754
  cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
755
+ rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
723
756
  cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
757
+ rb_gc_register_mark_object(cNokogiriXmlElement);
724
758
  cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
759
+ rb_gc_register_mark_object(cNokogiriXmlText);
725
760
  cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
761
+ rb_gc_register_mark_object(cNokogiriXmlCData);
726
762
  cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
763
+ rb_gc_register_mark_object(cNokogiriXmlComment);
727
764
 
728
765
  // Interned symbols.
729
766
  new = rb_intern_const("new");
@@ -736,6 +773,7 @@ void Init_nokogumbo() {
736
773
  // Class constants.
737
774
  VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
738
775
  Document = rb_const_get(HTML5, rb_intern_const("Document"));
776
+ rb_gc_register_mark_object(Document);
739
777
 
740
778
  // Interned symbols.
741
779
  internal_subset = rb_intern_const("internal_subset");
@@ -743,8 +781,8 @@ void Init_nokogumbo() {
743
781
 
744
782
  // Define Nokogumbo module with parse and fragment methods.
745
783
  VALUE Gumbo = rb_define_module("Nokogumbo");
746
- rb_define_singleton_method(Gumbo, "parse", parse, 4);
747
- rb_define_singleton_method(Gumbo, "fragment", fragment, 5);
784
+ rb_define_singleton_method(Gumbo, "parse", parse, 5);
785
+ rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
748
786
 
749
787
  // Add private constant for testing.
750
788
  rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
@@ -706,6 +706,15 @@ typedef struct GumboInternalOptions {
706
706
  */
707
707
  bool stop_on_first_error;
708
708
 
709
+ /**
710
+ * Maximum allowed number of attributes per element. If this limit is
711
+ * exceeded, the parser will return early with a partial document and
712
+ * the returned `GumboOutput` will have its `status` field set to
713
+ * `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
714
+ * Default: `400`.
715
+ */
716
+ int max_attributes;
717
+
709
718
  /**
710
719
  * Maximum allowed depth for the parse tree. If this limit is exceeded,
711
720
  * the parser will return early with a partial document and the returned
@@ -796,6 +805,16 @@ typedef enum {
796
805
  */
797
806
  GUMBO_STATUS_TREE_TOO_DEEP,
798
807
 
808
+ /**
809
+ * Indicates that the maximum number of attributes per element
810
+ * (`GumboOptions::max_attributes`) was reached during parsing. The
811
+ * resulting tree will be a partial document, with no further nodes
812
+ * created after the point where the limit was reached. The partial
813
+ * document may be useful for constructing an error message but
814
+ * typically shouldn't be used for other purposes.
815
+ */
816
+ GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
817
+
799
818
  // Currently unused
800
819
  GUMBO_STATUS_OUT_OF_MEMORY,
801
820
  } GumboOutputStatus;
@@ -48,6 +48,7 @@ typedef uint8_t TagSet[GUMBO_TAG_LAST + 1];
48
48
  const GumboOptions kGumboDefaultOptions = {
49
49
  .tab_stop = 8,
50
50
  .stop_on_first_error = false,
51
+ .max_attributes = 400,
51
52
  .max_tree_depth = 400,
52
53
  .max_errors = -1,
53
54
  .fragment_context = NULL,
@@ -4858,6 +4859,8 @@ const char* gumbo_status_to_string(GumboOutputStatus status) {
4858
4859
  return "OK";
4859
4860
  case GUMBO_STATUS_OUT_OF_MEMORY:
4860
4861
  return "System allocator returned NULL during parsing";
4862
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
4863
+ return "Attributes per element limit exceeded";
4861
4864
  case GUMBO_STATUS_TREE_TOO_DEEP:
4862
4865
  return "Document tree depth limit exceeded";
4863
4866
  default:
@@ -784,12 +784,22 @@ static void add_duplicate_attr_error(GumboParser* parser) {
784
784
  static void finish_attribute_name(GumboParser* parser) {
785
785
  GumboTokenizerState* tokenizer = parser->_tokenizer_state;
786
786
  GumboTagState* tag_state = &tokenizer->_tag_state;
787
+ GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
788
+
789
+ int max_attributes = parser->_options->max_attributes;
790
+ if (unlikely(max_attributes >= 0 && attributes->length >= (unsigned int) max_attributes)) {
791
+ parser->_output->status = GUMBO_STATUS_TOO_MANY_ATTRIBUTES;
792
+ gumbo_debug("Attributes limit exceeded.\n");
793
+ reinitialize_tag_buffer(parser);
794
+ tag_state->_drop_next_attr_value = true;
795
+ return;
796
+ }
797
+
787
798
  // May've been set by a previous attribute without a value; reset it here.
788
799
  tag_state->_drop_next_attr_value = false;
789
800
  assert(tag_state->_attributes.data);
790
801
  assert(tag_state->_attributes.capacity);
791
802
 
792
- GumboVector* /* GumboAttribute* */ attributes = &tag_state->_attributes;
793
803
  for (unsigned int i = 0; i < attributes->length; ++i) {
794
804
  GumboAttribute* attr = attributes->data[i];
795
805
  if (
@@ -193,6 +193,11 @@ void utf8iterator_init (
193
193
  iter->_pos.offset = 0;
194
194
  iter->_parser = parser;
195
195
  read_char(iter);
196
+ if (iter->_current == kUtf8BomChar) {
197
+ iter->_start += iter->_width;
198
+ iter->_pos.offset += iter->_width;
199
+ read_char(iter);
200
+ }
196
201
  }
197
202
 
198
203
  void utf8iterator_next(Utf8Iterator* iter) {
@@ -31,6 +31,7 @@ struct GumboInternalParser;
31
31
 
32
32
  // Unicode replacement char.
33
33
  #define kUtf8ReplacementChar 0xFFFD
34
+ #define kUtf8BomChar 0xFEFF
34
35
  #define kUtf8MaxChar 0x10FFFF
35
36
 
36
37
  typedef struct GumboInternalUtf8Iterator {
@@ -5,6 +5,9 @@ require 'nokogumbo/html5'
5
5
  require 'nokogumbo/nokogumbo'
6
6
 
7
7
  module Nokogumbo
8
+ # The default maximum number of attributes per element.
9
+ DEFAULT_MAX_ATTRIBUTES = 400
10
+
8
11
  # The default maximum number of errors for parsing a document or a fragment.
9
12
  DEFAULT_MAX_ERRORS = 0
10
13
 
@@ -19,7 +19,7 @@ module Nokogiri
19
19
 
20
20
  # Parse an HTML 5 document. Convenience method for Nokogiri::HTML5::Document.parse
21
21
  def self.parse(string, url = nil, encoding = nil, **options, &block)
22
- Document.parse(string, url, encoding, options, &block)
22
+ Document.parse(string, url, encoding, **options, &block)
23
23
  end
24
24
 
25
25
  # Parse a fragment from +string+. Convenience method for
@@ -92,19 +92,20 @@ module Nokogiri
92
92
  if encoding.nil?
93
93
  string = string.read
94
94
  else
95
- string = string.read(encoding: encoding)
95
+ string = string.read(encoding: encoding)
96
96
  end
97
97
  else
98
98
  # Otherwise the string has the given encoding.
99
- if encoding && string.respond_to?(:force_encoding)
99
+ string = string.to_str
100
+ if encoding
100
101
  string = string.dup
101
102
  string.force_encoding(encoding)
102
103
  end
103
104
  end
104
105
 
105
- # convert to UTF-8 (Ruby 1.9+)
106
- if string.respond_to?(:encoding) && string.encoding != Encoding::UTF_8
107
- string = reencode(string.dup)
106
+ # convert to UTF-8
107
+ if string.encoding != Encoding::UTF_8
108
+ string = reencode(string)
108
109
  end
109
110
  string
110
111
  end
@@ -123,18 +124,17 @@ module Nokogiri
123
124
  # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
124
125
  #
125
126
  def self.reencode(body, content_type=nil)
126
- return body unless body.respond_to? :encoding
127
-
128
127
  if body.encoding == Encoding::ASCII_8BIT
129
128
  encoding = nil
130
129
 
131
130
  # look for a Byte Order Mark (BOM)
132
- if body[0..1] == "\xFE\xFF"
133
- encoding = 'utf-16be'
134
- elsif body[0..1] == "\xFF\xFE"
135
- encoding = 'utf-16le'
136
- elsif body[0..2] == "\xEF\xBB\xBF"
137
- encoding = 'utf-8'
131
+ initial_bytes = body[0..2].bytes
132
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
133
+ encoding = Encoding::UTF_8
134
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
135
+ encoding = Encoding::UTF_16BE
136
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
137
+ encoding = Encoding::UTF_16LE
138
138
  end
139
139
 
140
140
  # look for a charset in a content-encoding header
@@ -154,6 +154,7 @@ module Nokogiri
154
154
  encoding ||= Encoding::ISO_8859_1
155
155
 
156
156
  # change the encoding to match the detected or inferred encoding
157
+ body = body.dup
157
158
  begin
158
159
  body.force_encoding(encoding)
159
160
  rescue ArgumentError
@@ -12,6 +12,9 @@ module Nokogiri
12
12
  if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
13
13
  url ||= string_or_io.path
14
14
  end
15
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
16
+ raise ArgumentError.new("not a string or IO object")
17
+ end
15
18
  do_parse(string_or_io, url, encoding, options)
16
19
  end
17
20
 
@@ -21,7 +24,8 @@ module Nokogiri
21
24
  end
22
25
 
23
26
  def self.read_memory(string, url = nil, encoding = nil, **options)
24
- do_parse(string.to_s, url, encoding, options)
27
+ raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
28
+ do_parse(string, url, encoding, options)
25
29
  end
26
30
 
27
31
  def fragment(tags = nil)
@@ -37,9 +41,10 @@ module Nokogiri
37
41
  private
38
42
  def self.do_parse(string_or_io, url, encoding, options)
39
43
  string = HTML5.read_and_encode(string_or_io, encoding)
44
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
40
45
  max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
41
46
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
42
- doc = Nokogumbo.parse(string.to_s, url, max_errors, max_depth)
47
+ doc = Nokogumbo.parse(string, url, max_attributes, max_errors, max_depth)
43
48
  doc.encoding = 'UTF-8'
44
49
  doc
45
50
  end
@@ -12,10 +12,11 @@ module Nokogiri
12
12
  self.errors = []
13
13
  return self unless tags
14
14
 
15
+ max_attributes = options[:max_attributes] || Nokogumbo::DEFAULT_MAX_ATTRIBUTES
15
16
  max_errors = options[:max_errors] || Nokogumbo::DEFAULT_MAX_ERRORS
16
17
  max_depth = options[:max_tree_depth] || Nokogumbo::DEFAULT_MAX_TREE_DEPTH
17
18
  tags = Nokogiri::HTML5.read_and_encode(tags, nil)
18
- Nokogumbo.fragment(self, tags, ctx, max_errors, max_depth)
19
+ Nokogumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
19
20
  end
20
21
 
21
22
  def serialize(options = {}, &block)
@@ -1,3 +1,3 @@
1
1
  module Nokogumbo
2
- VERSION = "2.0.2"
2
+ VERSION = "2.0.3"
3
3
  end
metadata CHANGED
@@ -1,15 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.2
4
+ version: 2.0.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Sam Ruby
8
8
  - Stephen Checkoway
9
- autorequire:
9
+ autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2019-11-19 00:00:00.000000000 Z
12
+ date: 2020-11-22 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -94,7 +94,7 @@ metadata:
94
94
  changelog_uri: https://github.com/rubys/nokogumbo/blob/master/CHANGELOG.md
95
95
  homepage_uri: https://github.com/rubys/nokogumbo/#readme
96
96
  source_code_uri: https://github.com/rubys/nokogumbo
97
- post_install_message:
97
+ post_install_message:
98
98
  rdoc_options: []
99
99
  require_paths:
100
100
  - lib
@@ -109,8 +109,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
111
  requirements: []
112
- rubygems_version: 3.0.6
113
- signing_key:
112
+ rubygems_version: 3.1.2
113
+ signing_key:
114
114
  specification_version: 4
115
115
  summary: Nokogiri interface to the Gumbo HTML5 parser
116
116
  test_files: []