nokogumbo 2.0.0 → 2.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/ext/nokogumbo/extconf.rb +50 -27
- data/ext/nokogumbo/nokogumbo.c +63 -14
- data/gumbo-parser/src/error.c +17 -8
- data/gumbo-parser/src/gumbo.h +27 -0
- data/gumbo-parser/src/parser.c +476 -480
- data/gumbo-parser/src/tokenizer.c +24 -27
- data/gumbo-parser/src/tokenizer.h +2 -13
- data/gumbo-parser/src/utf8.c +5 -0
- data/gumbo-parser/src/utf8.h +1 -0
- data/lib/nokogumbo.rb +22 -9
- data/lib/nokogumbo/html5.rb +15 -14
- data/lib/nokogumbo/html5/document.rb +7 -2
- data/lib/nokogumbo/html5/document_fragment.rb +2 -1
- data/lib/nokogumbo/version.rb +1 -1
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a84b367d94046358f7844781b0f92cea51a75e052d54e35b53ab03602743f1b8
|
4
|
+
data.tar.gz: 8d96a5adfa701f658f7ba193ee96bb8a7e6901c1ff4d3fb2dad6f3e372ce66d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de2472c6ff89e3f0076a44ac13fa67688e82f909b265a2b70fe45225daf01aaf6059c6ca94f06e10ff94e10ac8a8f42b685e63f494849f04f3af56f337a73382
|
7
|
+
data.tar.gz: 3880defdaa15cb278236cf170d5727d1d73b14698f1ea41e7a7141da7a2fe8c3bafea19367196214c0dc0c1c27854602714d80abd30ecfd6be90f4277f3e33d7
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
2
|
|
3
|
-
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
-
[Gumbo HTML5 parser](https://github.com/
|
3
|
+
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
+
[our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
|
5
5
|
and to access the result as a
|
6
6
|
[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
|
7
7
|
|
@@ -39,8 +39,8 @@ The document and fragment parsing methods,
|
|
39
39
|
- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
|
40
40
|
support options that are different from Nokogiri's.
|
41
41
|
|
42
|
-
The
|
43
|
-
described below.
|
42
|
+
The three currently supported options are `:max_errors`, `:max_tree_depth` and
|
43
|
+
`:max_attributes`, described below.
|
44
44
|
|
45
45
|
### Error reporting
|
46
46
|
Nokogumbo contains an experimental parse error reporting facility. By default,
|
@@ -128,6 +128,22 @@ doc = Nokogiri.HTML5(html)
|
|
128
128
|
doc = Nokogiri.HTML5(html, max_tree_depth: -1)
|
129
129
|
```
|
130
130
|
|
131
|
+
### Attribute limit per element
|
132
|
+
The maximum number of attributes per DOM element is configurable by the
|
133
|
+
`:max_attributes` option. If a given element would exceed this limit, then an
|
134
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
|
135
|
+
|
136
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
|
137
|
+
be removed by giving the option `max_attributes: -1`.
|
138
|
+
|
139
|
+
``` ruby
|
140
|
+
html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
|
141
|
+
# "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
|
142
|
+
doc = Nokogiri.HTML5(html)
|
143
|
+
# raises ArgumentError: Attributes per element limit exceeded
|
144
|
+
doc = Nokogiri.HTML5(html, max_attributes: -1)
|
145
|
+
```
|
146
|
+
|
131
147
|
## HTML Serialization
|
132
148
|
|
133
149
|
After parsing HTML, it may be serialized using any of the Nokogiri
|
data/ext/nokogumbo/extconf.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'fileutils'
|
2
3
|
require 'mkmf'
|
3
4
|
require 'nokogiri'
|
4
5
|
|
5
6
|
$CFLAGS += " -std=c99"
|
6
7
|
$LDFLAGS.gsub!('-Wl,--no-undefined', '')
|
8
|
+
$DLDFLAGS.gsub!('-Wl,--no-undefined', '')
|
7
9
|
$warnflags = CONFIG['warnflags'] = '-Wall'
|
8
10
|
|
9
11
|
NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
|
@@ -23,7 +25,6 @@ def download_headers
|
|
23
25
|
return nil if dep_index.nil?
|
24
26
|
requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
|
25
27
|
|
26
|
-
require 'rubygems'
|
27
28
|
gem 'mini_portile2', requirement
|
28
29
|
require 'mini_portile2'
|
29
30
|
p = MiniPortile::new('libxml2', version).tap do |r|
|
@@ -60,41 +61,64 @@ end
|
|
60
61
|
have_libxml2 = false
|
61
62
|
have_ng = false
|
62
63
|
|
64
|
+
def windows?
|
65
|
+
::RUBY_PLATFORM =~ /mingw|mswin/
|
66
|
+
end
|
67
|
+
|
68
|
+
def modern_nokogiri?
|
69
|
+
nokogiri_version = Gem::Version.new(Nokogiri::VERSION)
|
70
|
+
requirement = windows? ? ">= 1.11.2" : ">= 1.11.0.rc4"
|
71
|
+
Gem::Requirement.new(requirement).satisfied_by?(nokogiri_version)
|
72
|
+
end
|
73
|
+
|
63
74
|
if !prohibited
|
64
|
-
if
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
75
|
+
if modern_nokogiri?
|
76
|
+
append_cflags(Nokogiri::VERSION_INFO["nokogiri"]["cppflags"])
|
77
|
+
append_ldflags(Nokogiri::VERSION_INFO["nokogiri"]["ldflags"]) # may be nil for nokogiri pre-1.11.2
|
78
|
+
have_libxml2 = if Nokogiri::VERSION_INFO["nokogiri"]["ldflags"].empty?
|
79
|
+
have_header('libxml/tree.h')
|
80
|
+
else
|
81
|
+
have_func("xmlNewDoc", "libxml/tree.h")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
if !have_libxml2
|
86
|
+
if Nokogiri::VERSION_INFO.include?('libxml') and
|
87
|
+
Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
|
88
|
+
# Nokogiri has libxml2 built in. Find the headers.
|
89
|
+
libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
|
90
|
+
'include/libxml2')
|
91
|
+
if find_header('libxml/tree.h', libxml2_path)
|
92
|
+
have_libxml2 = true
|
93
|
+
else
|
94
|
+
# Unfortunately, some versions of Nokogiri delete these files.
|
95
|
+
# https://github.com/sparklemotion/nokogiri/pull/1788
|
96
|
+
# Try to download them
|
97
|
+
libxml2_path = download_headers
|
98
|
+
unless libxml2_path.nil?
|
99
|
+
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
100
|
+
end
|
101
|
+
end
|
71
102
|
else
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
|
76
|
-
|
77
|
-
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
103
|
+
# Nokogiri is compiled with system headers.
|
104
|
+
# Hack to work around broken mkmf on macOS
|
105
|
+
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
106
|
+
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
107
|
+
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
78
108
|
end
|
79
|
-
end
|
80
|
-
else
|
81
|
-
# Nokogiri is compiled with system headers.
|
82
|
-
# Hack to work around broken mkmf on macOS
|
83
|
-
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
84
|
-
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
85
|
-
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
86
|
-
end
|
87
109
|
|
88
|
-
|
89
|
-
|
110
|
+
pkg_config('libxml-2.0')
|
111
|
+
have_libxml2 = have_library('xml2', 'xmlNewDoc')
|
112
|
+
end
|
90
113
|
end
|
114
|
+
|
91
115
|
if required and !have_libxml2
|
92
116
|
abort "libxml2 required but could not be located"
|
93
117
|
end
|
94
118
|
|
119
|
+
|
95
120
|
if have_libxml2
|
96
|
-
|
97
|
-
have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
121
|
+
have_ng = have_header('nokogiri.h') || find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
98
122
|
end
|
99
123
|
end
|
100
124
|
|
@@ -104,7 +128,6 @@ end
|
|
104
128
|
|
105
129
|
# Symlink gumbo-parser source files.
|
106
130
|
ext_dir = File.dirname(__FILE__)
|
107
|
-
gumbo_src = File.join(ext_dir, 'gumbo_src')
|
108
131
|
|
109
132
|
Dir.chdir(ext_dir) do
|
110
133
|
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
data/ext/nokogumbo/nokogumbo.c
CHANGED
@@ -33,6 +33,7 @@ static ID parent;
|
|
33
33
|
|
34
34
|
/* Backwards compatibility to Ruby 2.1.0 */
|
35
35
|
#if RUBY_API_VERSION_CODE < 20200
|
36
|
+
#define ONIG_ESCAPE_UCHAR_COLLISION 1
|
36
37
|
#include <ruby/encoding.h>
|
37
38
|
|
38
39
|
static VALUE rb_utf8_str_new(const char *str, long length) {
|
@@ -280,6 +281,7 @@ static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
|
|
280
281
|
switch (output->status) {
|
281
282
|
case GUMBO_STATUS_OK:
|
282
283
|
break;
|
284
|
+
case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
|
283
285
|
case GUMBO_STATUS_TREE_TOO_DEEP:
|
284
286
|
gumbo_destroy_output(output);
|
285
287
|
rb_raise(rb_eArgError, "%s", status_string);
|
@@ -385,7 +387,7 @@ static void build_tree (
|
|
385
387
|
case GUMBO_NODE_ELEMENT:
|
386
388
|
{
|
387
389
|
xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
|
388
|
-
set_line(xml_child, gumbo_child->v.
|
390
|
+
set_line(xml_child, gumbo_child->v.element.start_pos.line);
|
389
391
|
if (xml_root == NIL)
|
390
392
|
xml_root = xml_child;
|
391
393
|
xmlNsPtr ns = NIL;
|
@@ -478,19 +480,43 @@ typedef struct {
|
|
478
480
|
xmlDocPtr doc;
|
479
481
|
} ParseArgs;
|
480
482
|
|
481
|
-
static
|
483
|
+
static void parse_args_mark(void *parse_args) {
|
484
|
+
ParseArgs *args = parse_args;
|
485
|
+
rb_gc_mark_maybe(args->input);
|
486
|
+
rb_gc_mark_maybe(args->url_or_frag);
|
487
|
+
}
|
488
|
+
|
489
|
+
// Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
|
490
|
+
// wrapper.
|
491
|
+
static VALUE wrap_parse_args(ParseArgs *args) {
|
492
|
+
return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
|
493
|
+
}
|
494
|
+
|
495
|
+
// Returns the underlying ParseArgs wrapped by wrap_parse_args.
|
496
|
+
static ParseArgs *unwrap_parse_args(VALUE obj) {
|
497
|
+
ParseArgs *args;
|
498
|
+
Data_Get_Struct(obj, ParseArgs, args);
|
499
|
+
return args;
|
500
|
+
}
|
501
|
+
|
502
|
+
static VALUE parse_cleanup(VALUE parse_args) {
|
503
|
+
ParseArgs *args = unwrap_parse_args(parse_args);
|
482
504
|
gumbo_destroy_output(args->output);
|
505
|
+
// Make sure garbage collection doesn't mark the objects as being live based
|
506
|
+
// on references from the ParseArgs. This may be unnecessary.
|
507
|
+
args->input = Qnil;
|
508
|
+
args->url_or_frag = Qnil;
|
483
509
|
if (args->doc != NIL)
|
484
510
|
xmlFreeDoc(args->doc);
|
485
511
|
return Qnil;
|
486
512
|
}
|
487
513
|
|
488
|
-
|
489
|
-
static VALUE parse_continue(ParseArgs *args);
|
514
|
+
static VALUE parse_continue(VALUE parse_args);
|
490
515
|
|
491
516
|
// Parse a string using gumbo_parse into a Nokogiri document
|
492
|
-
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
|
517
|
+
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
|
493
518
|
GumboOptions options = kGumboDefaultOptions;
|
519
|
+
options.max_attributes = NUM2INT(max_attributes);
|
494
520
|
options.max_errors = NUM2INT(max_errors);
|
495
521
|
options.max_tree_depth = NUM2INT(max_depth);
|
496
522
|
|
@@ -501,10 +527,13 @@ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE m
|
|
501
527
|
.url_or_frag = url,
|
502
528
|
.doc = NIL,
|
503
529
|
};
|
504
|
-
|
530
|
+
VALUE parse_args = wrap_parse_args(&args);
|
531
|
+
|
532
|
+
return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args);
|
505
533
|
}
|
506
534
|
|
507
|
-
static VALUE parse_continue(
|
535
|
+
static VALUE parse_continue(VALUE parse_args) {
|
536
|
+
ParseArgs *args = unwrap_parse_args(parse_args);
|
508
537
|
GumboOutput *output = args->output;
|
509
538
|
xmlDocPtr doc;
|
510
539
|
if (output->document->v.document.has_doctype) {
|
@@ -562,13 +591,14 @@ static xmlNodePtr extract_xml_node(VALUE node) {
|
|
562
591
|
#endif
|
563
592
|
}
|
564
593
|
|
565
|
-
static VALUE fragment_continue(
|
594
|
+
static VALUE fragment_continue(VALUE parse_args);
|
566
595
|
|
567
596
|
static VALUE fragment (
|
568
597
|
VALUE self,
|
569
598
|
VALUE doc_fragment,
|
570
599
|
VALUE tags,
|
571
600
|
VALUE ctx,
|
601
|
+
VALUE max_attributes,
|
572
602
|
VALUE max_errors,
|
573
603
|
VALUE max_depth
|
574
604
|
) {
|
@@ -675,6 +705,7 @@ static VALUE fragment (
|
|
675
705
|
// Perform a fragment parse.
|
676
706
|
int depth = NUM2INT(max_depth);
|
677
707
|
GumboOptions options = kGumboDefaultOptions;
|
708
|
+
options.max_attributes = NUM2INT(max_attributes);
|
678
709
|
options.max_errors = NUM2INT(max_errors);
|
679
710
|
// Add one to account for the HTML element.
|
680
711
|
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
|
@@ -691,11 +722,13 @@ static VALUE fragment (
|
|
691
722
|
.url_or_frag = doc_fragment,
|
692
723
|
.doc = (xmlDocPtr)extract_xml_node(doc),
|
693
724
|
};
|
694
|
-
|
725
|
+
VALUE parse_args = wrap_parse_args(&args);
|
726
|
+
rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
|
695
727
|
return Qnil;
|
696
728
|
}
|
697
729
|
|
698
|
-
static VALUE fragment_continue(
|
730
|
+
static VALUE fragment_continue(VALUE parse_args) {
|
731
|
+
ParseArgs *args = unwrap_parse_args(parse_args);
|
699
732
|
GumboOutput *output = args->output;
|
700
733
|
VALUE doc_fragment = args->url_or_frag;
|
701
734
|
xmlDocPtr xml_doc = args->doc;
|
@@ -709,27 +742,38 @@ static VALUE fragment_continue(ParseArgs *args) {
|
|
709
742
|
|
710
743
|
// Initialize the Nokogumbo class and fetch constants we will use later.
|
711
744
|
void Init_nokogumbo() {
|
712
|
-
rb_funcall(rb_mKernel,
|
745
|
+
rb_funcall(rb_mKernel, rb_intern_const("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
|
713
746
|
rb_require("nokogiri");
|
714
747
|
|
715
|
-
|
748
|
+
VALUE line_supported = Qtrue;
|
749
|
+
|
750
|
+
#if !NGLIB
|
716
751
|
// Class constants.
|
717
752
|
VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
|
718
753
|
VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
|
719
754
|
cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
|
755
|
+
rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
|
720
756
|
cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
|
757
|
+
rb_gc_register_mark_object(cNokogiriXmlElement);
|
721
758
|
cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
|
759
|
+
rb_gc_register_mark_object(cNokogiriXmlText);
|
722
760
|
cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
|
761
|
+
rb_gc_register_mark_object(cNokogiriXmlCData);
|
723
762
|
cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
|
763
|
+
rb_gc_register_mark_object(cNokogiriXmlComment);
|
724
764
|
|
725
765
|
// Interned symbols.
|
726
766
|
new = rb_intern_const("new");
|
727
767
|
node_name_ = rb_intern_const("node_name=");
|
768
|
+
|
769
|
+
// #line is not supported (returns 0)
|
770
|
+
line_supported = Qfalse;
|
728
771
|
#endif
|
729
772
|
|
730
773
|
// Class constants.
|
731
774
|
VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
|
732
775
|
Document = rb_const_get(HTML5, rb_intern_const("Document"));
|
776
|
+
rb_gc_register_mark_object(Document);
|
733
777
|
|
734
778
|
// Interned symbols.
|
735
779
|
internal_subset = rb_intern_const("internal_subset");
|
@@ -737,8 +781,13 @@ void Init_nokogumbo() {
|
|
737
781
|
|
738
782
|
// Define Nokogumbo module with parse and fragment methods.
|
739
783
|
VALUE Gumbo = rb_define_module("Nokogumbo");
|
740
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
741
|
-
rb_define_singleton_method(Gumbo, "fragment", fragment,
|
784
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 5);
|
785
|
+
rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
|
786
|
+
|
787
|
+
// Add private constant for testing.
|
788
|
+
rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
|
789
|
+
rb_funcall(Gumbo, rb_intern_const("private_constant"), 1,
|
790
|
+
rb_utf8_str_new_cstr("LINE_SUPPORTED"));
|
742
791
|
}
|
743
792
|
|
744
793
|
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
data/gumbo-parser/src/error.c
CHANGED
@@ -365,11 +365,14 @@ static void handle_parser_error (
|
|
365
365
|
// pointer to the beginning of the string if this is the first line.
|
366
366
|
static const char* find_prev_newline (
|
367
367
|
const char* source_text,
|
368
|
+
size_t source_length,
|
368
369
|
const char* error_location
|
369
370
|
) {
|
371
|
+
const char* source_end = source_text + source_length;
|
370
372
|
assert(error_location >= source_text);
|
373
|
+
assert(error_location <= source_end);
|
371
374
|
const char* c = error_location;
|
372
|
-
if (*c == '\n'
|
375
|
+
if (c != source_text && (error_location == source_end || *c == '\n'))
|
373
376
|
--c;
|
374
377
|
while (c != source_text && *c != '\n')
|
375
378
|
--c;
|
@@ -377,20 +380,25 @@ static const char* find_prev_newline (
|
|
377
380
|
}
|
378
381
|
|
379
382
|
// Finds the next newline in the original source buffer from a given byte
|
380
|
-
// location. Returns a character pointer to that newline, or a pointer to
|
381
|
-
//
|
383
|
+
// location. Returns a character pointer to that newline, or a pointer to
|
384
|
+
// source_text + source_length if this is the last line.
|
382
385
|
static const char* find_next_newline(
|
383
|
-
const char*
|
386
|
+
const char* source_text,
|
387
|
+
size_t source_length,
|
384
388
|
const char* error_location
|
385
389
|
) {
|
386
|
-
|
390
|
+
const char* source_end = source_text + source_length;
|
391
|
+
assert(error_location >= source_text);
|
392
|
+
assert(error_location <= source_end);
|
387
393
|
const char* c = error_location;
|
388
|
-
while (c !=
|
394
|
+
while (c != source_end && *c != '\n')
|
389
395
|
++c;
|
390
396
|
return c;
|
391
397
|
}
|
392
398
|
|
393
399
|
GumboError* gumbo_add_error(GumboParser* parser) {
|
400
|
+
parser->_output->document_error = true;
|
401
|
+
|
394
402
|
int max_errors = parser->_options->max_errors;
|
395
403
|
if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
|
396
404
|
return NULL;
|
@@ -547,8 +555,9 @@ void caret_diagnostic_to_string (
|
|
547
555
|
) {
|
548
556
|
error_to_string(error, output);
|
549
557
|
|
550
|
-
const char*
|
551
|
-
const char*
|
558
|
+
const char* error_text = error->original_text.data;
|
559
|
+
const char* line_start = find_prev_newline(source_text, source_length, error_text);
|
560
|
+
const char* line_end = find_next_newline(source_text, source_length, error_text);
|
552
561
|
GumboStringPiece original_line;
|
553
562
|
original_line.data = line_start;
|
554
563
|
original_line.length = line_end - line_start;
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -706,6 +706,15 @@ typedef struct GumboInternalOptions {
|
|
706
706
|
*/
|
707
707
|
bool stop_on_first_error;
|
708
708
|
|
709
|
+
/**
|
710
|
+
* Maximum allowed number of attributes per element. If this limit is
|
711
|
+
* exceeded, the parser will return early with a partial document and
|
712
|
+
* the returned `GumboOutput` will have its `status` field set to
|
713
|
+
* `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
|
714
|
+
* Default: `400`.
|
715
|
+
*/
|
716
|
+
int max_attributes;
|
717
|
+
|
709
718
|
/**
|
710
719
|
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
711
720
|
* the parser will return early with a partial document and the returned
|
@@ -796,6 +805,16 @@ typedef enum {
|
|
796
805
|
*/
|
797
806
|
GUMBO_STATUS_TREE_TOO_DEEP,
|
798
807
|
|
808
|
+
/**
|
809
|
+
* Indicates that the maximum number of attributes per element
|
810
|
+
* (`GumboOptions::max_attributes`) was reached during parsing. The
|
811
|
+
* resulting tree will be a partial document, with no further nodes
|
812
|
+
* created after the point where the limit was reached. The partial
|
813
|
+
* document may be useful for constructing an error message but
|
814
|
+
* typically shouldn't be used for other purposes.
|
815
|
+
*/
|
816
|
+
GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
|
817
|
+
|
799
818
|
// Currently unused
|
800
819
|
GUMBO_STATUS_OUT_OF_MEMORY,
|
801
820
|
} GumboOutputStatus;
|
@@ -820,6 +839,14 @@ typedef struct GumboInternalOutput {
|
|
820
839
|
*/
|
821
840
|
GumboVector /* GumboError */ errors;
|
822
841
|
|
842
|
+
/**
|
843
|
+
* True if the parser encountered an error.
|
844
|
+
*
|
845
|
+
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
846
|
+
* option was set to 0.
|
847
|
+
*/
|
848
|
+
bool document_error;
|
849
|
+
|
823
850
|
/**
|
824
851
|
* A status code indicating whether parsing finished successfully or was
|
825
852
|
* stopped mid-document due to exceptional circumstances.
|