nokogumbo 2.0.0 → 2.0.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +20 -4
- data/ext/nokogumbo/extconf.rb +50 -27
- data/ext/nokogumbo/nokogumbo.c +63 -14
- data/gumbo-parser/src/error.c +17 -8
- data/gumbo-parser/src/gumbo.h +27 -0
- data/gumbo-parser/src/parser.c +476 -480
- data/gumbo-parser/src/tokenizer.c +24 -27
- data/gumbo-parser/src/tokenizer.h +2 -13
- data/gumbo-parser/src/utf8.c +5 -0
- data/gumbo-parser/src/utf8.h +1 -0
- data/lib/nokogumbo.rb +22 -9
- data/lib/nokogumbo/html5.rb +15 -14
- data/lib/nokogumbo/html5/document.rb +7 -2
- data/lib/nokogumbo/html5/document_fragment.rb +2 -1
- data/lib/nokogumbo/version.rb +1 -1
- metadata +3 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a84b367d94046358f7844781b0f92cea51a75e052d54e35b53ab03602743f1b8
|
4
|
+
data.tar.gz: 8d96a5adfa701f658f7ba193ee96bb8a7e6901c1ff4d3fb2dad6f3e372ce66d2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de2472c6ff89e3f0076a44ac13fa67688e82f909b265a2b70fe45225daf01aaf6059c6ca94f06e10ff94e10ac8a8f42b685e63f494849f04f3af56f337a73382
|
7
|
+
data.tar.gz: 3880defdaa15cb278236cf170d5727d1d73b14698f1ea41e7a7141da7a2fe8c3bafea19367196214c0dc0c1c27854602714d80abd30ecfd6be90f4277f3e33d7
|
data/README.md
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
2
|
|
3
|
-
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
-
[Gumbo HTML5 parser](https://github.com/
|
3
|
+
Nokogumbo provides the ability for a Ruby program to invoke
|
4
|
+
[our version of the Gumbo HTML5 parser](https://github.com/rubys/nokogumbo/tree/master/gumbo-parser/src)
|
5
5
|
and to access the result as a
|
6
6
|
[Nokogiri::HTML::Document](http://rdoc.info/github/sparklemotion/nokogiri/Nokogiri/HTML/Document).
|
7
7
|
|
@@ -39,8 +39,8 @@ The document and fragment parsing methods,
|
|
39
39
|
- `Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})`
|
40
40
|
support options that are different from Nokogiri's.
|
41
41
|
|
42
|
-
The
|
43
|
-
described below.
|
42
|
+
The three currently supported options are `:max_errors`, `:max_tree_depth` and
|
43
|
+
`:max_attributes`, described below.
|
44
44
|
|
45
45
|
### Error reporting
|
46
46
|
Nokogumbo contains an experimental parse error reporting facility. By default,
|
@@ -128,6 +128,22 @@ doc = Nokogiri.HTML5(html)
|
|
128
128
|
doc = Nokogiri.HTML5(html, max_tree_depth: -1)
|
129
129
|
```
|
130
130
|
|
131
|
+
### Attribute limit per element
|
132
|
+
The maximum number of attributes per DOM element is configurable by the
|
133
|
+
`:max_attributes` option. If a given element would exceed this limit, then an
|
134
|
+
[ArgumentError](https://ruby-doc.org/core-2.5.0/ArgumentError.html) is raised.
|
135
|
+
|
136
|
+
This limit (which defaults to `Nokogumbo::DEFAULT_MAX_ATTRIBUTES = 400`) can
|
137
|
+
be removed by giving the option `max_attributes: -1`.
|
138
|
+
|
139
|
+
``` ruby
|
140
|
+
html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
|
141
|
+
# "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
|
142
|
+
doc = Nokogiri.HTML5(html)
|
143
|
+
# raises ArgumentError: Attributes per element limit exceeded
|
144
|
+
doc = Nokogiri.HTML5(html, max_attributes: -1)
|
145
|
+
```
|
146
|
+
|
131
147
|
## HTML Serialization
|
132
148
|
|
133
149
|
After parsing HTML, it may be serialized using any of the Nokogiri
|
data/ext/nokogumbo/extconf.rb
CHANGED
@@ -1,9 +1,11 @@
|
|
1
|
+
require 'rubygems'
|
1
2
|
require 'fileutils'
|
2
3
|
require 'mkmf'
|
3
4
|
require 'nokogiri'
|
4
5
|
|
5
6
|
$CFLAGS += " -std=c99"
|
6
7
|
$LDFLAGS.gsub!('-Wl,--no-undefined', '')
|
8
|
+
$DLDFLAGS.gsub!('-Wl,--no-undefined', '')
|
7
9
|
$warnflags = CONFIG['warnflags'] = '-Wall'
|
8
10
|
|
9
11
|
NG_SPEC = Gem::Specification.find_by_name('nokogiri', "= #{Nokogiri::VERSION}")
|
@@ -23,7 +25,6 @@ def download_headers
|
|
23
25
|
return nil if dep_index.nil?
|
24
26
|
requirement = NG_SPEC.dependencies[dep_index].requirement.to_s
|
25
27
|
|
26
|
-
require 'rubygems'
|
27
28
|
gem 'mini_portile2', requirement
|
28
29
|
require 'mini_portile2'
|
29
30
|
p = MiniPortile::new('libxml2', version).tap do |r|
|
@@ -60,41 +61,64 @@ end
|
|
60
61
|
have_libxml2 = false
|
61
62
|
have_ng = false
|
62
63
|
|
64
|
+
def windows?
|
65
|
+
::RUBY_PLATFORM =~ /mingw|mswin/
|
66
|
+
end
|
67
|
+
|
68
|
+
def modern_nokogiri?
|
69
|
+
nokogiri_version = Gem::Version.new(Nokogiri::VERSION)
|
70
|
+
requirement = windows? ? ">= 1.11.2" : ">= 1.11.0.rc4"
|
71
|
+
Gem::Requirement.new(requirement).satisfied_by?(nokogiri_version)
|
72
|
+
end
|
73
|
+
|
63
74
|
if !prohibited
|
64
|
-
if
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
75
|
+
if modern_nokogiri?
|
76
|
+
append_cflags(Nokogiri::VERSION_INFO["nokogiri"]["cppflags"])
|
77
|
+
append_ldflags(Nokogiri::VERSION_INFO["nokogiri"]["ldflags"]) # may be nil for nokogiri pre-1.11.2
|
78
|
+
have_libxml2 = if Nokogiri::VERSION_INFO["nokogiri"]["ldflags"].empty?
|
79
|
+
have_header('libxml/tree.h')
|
80
|
+
else
|
81
|
+
have_func("xmlNewDoc", "libxml/tree.h")
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
if !have_libxml2
|
86
|
+
if Nokogiri::VERSION_INFO.include?('libxml') and
|
87
|
+
Nokogiri::VERSION_INFO['libxml']['source'] == 'packaged'
|
88
|
+
# Nokogiri has libxml2 built in. Find the headers.
|
89
|
+
libxml2_path = File.join(Nokogiri::VERSION_INFO['libxml']['libxml2_path'],
|
90
|
+
'include/libxml2')
|
91
|
+
if find_header('libxml/tree.h', libxml2_path)
|
92
|
+
have_libxml2 = true
|
93
|
+
else
|
94
|
+
# Unfortunately, some versions of Nokogiri delete these files.
|
95
|
+
# https://github.com/sparklemotion/nokogiri/pull/1788
|
96
|
+
# Try to download them
|
97
|
+
libxml2_path = download_headers
|
98
|
+
unless libxml2_path.nil?
|
99
|
+
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
100
|
+
end
|
101
|
+
end
|
71
102
|
else
|
72
|
-
#
|
73
|
-
#
|
74
|
-
#
|
75
|
-
|
76
|
-
|
77
|
-
have_libxml2 = find_header('libxml/tree.h', libxml2_path)
|
103
|
+
# Nokogiri is compiled with system headers.
|
104
|
+
# Hack to work around broken mkmf on macOS
|
105
|
+
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
106
|
+
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
107
|
+
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
78
108
|
end
|
79
|
-
end
|
80
|
-
else
|
81
|
-
# Nokogiri is compiled with system headers.
|
82
|
-
# Hack to work around broken mkmf on macOS
|
83
|
-
# (https://bugs.ruby-lang.org/issues/14992 fixed now)
|
84
|
-
if RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] == 'DYLD_LIBRARY_PATH'
|
85
|
-
RbConfig::MAKEFILE_CONFIG['LIBPATHENV'] = 'DYLD_FALLBACK_LIBRARY_PATH'
|
86
|
-
end
|
87
109
|
|
88
|
-
|
89
|
-
|
110
|
+
pkg_config('libxml-2.0')
|
111
|
+
have_libxml2 = have_library('xml2', 'xmlNewDoc')
|
112
|
+
end
|
90
113
|
end
|
114
|
+
|
91
115
|
if required and !have_libxml2
|
92
116
|
abort "libxml2 required but could not be located"
|
93
117
|
end
|
94
118
|
|
119
|
+
|
95
120
|
if have_libxml2
|
96
|
-
|
97
|
-
have_ng = find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
121
|
+
have_ng = have_header('nokogiri.h') || find_header('nokogiri.h', File.join(NG_SPEC.gem_dir, 'ext/nokogiri'))
|
98
122
|
end
|
99
123
|
end
|
100
124
|
|
@@ -104,7 +128,6 @@ end
|
|
104
128
|
|
105
129
|
# Symlink gumbo-parser source files.
|
106
130
|
ext_dir = File.dirname(__FILE__)
|
107
|
-
gumbo_src = File.join(ext_dir, 'gumbo_src')
|
108
131
|
|
109
132
|
Dir.chdir(ext_dir) do
|
110
133
|
$srcs = Dir['*.c', '../../gumbo-parser/src/*.c']
|
data/ext/nokogumbo/nokogumbo.c
CHANGED
@@ -33,6 +33,7 @@ static ID parent;
|
|
33
33
|
|
34
34
|
/* Backwards compatibility to Ruby 2.1.0 */
|
35
35
|
#if RUBY_API_VERSION_CODE < 20200
|
36
|
+
#define ONIG_ESCAPE_UCHAR_COLLISION 1
|
36
37
|
#include <ruby/encoding.h>
|
37
38
|
|
38
39
|
static VALUE rb_utf8_str_new(const char *str, long length) {
|
@@ -280,6 +281,7 @@ static GumboOutput *perform_parse(const GumboOptions *options, VALUE input) {
|
|
280
281
|
switch (output->status) {
|
281
282
|
case GUMBO_STATUS_OK:
|
282
283
|
break;
|
284
|
+
case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
|
283
285
|
case GUMBO_STATUS_TREE_TOO_DEEP:
|
284
286
|
gumbo_destroy_output(output);
|
285
287
|
rb_raise(rb_eArgError, "%s", status_string);
|
@@ -385,7 +387,7 @@ static void build_tree (
|
|
385
387
|
case GUMBO_NODE_ELEMENT:
|
386
388
|
{
|
387
389
|
xml_child = xmlNewDocNode(doc, NIL, BAD_CAST gumbo_child->v.element.name, NULL);
|
388
|
-
set_line(xml_child, gumbo_child->v.
|
390
|
+
set_line(xml_child, gumbo_child->v.element.start_pos.line);
|
389
391
|
if (xml_root == NIL)
|
390
392
|
xml_root = xml_child;
|
391
393
|
xmlNsPtr ns = NIL;
|
@@ -478,19 +480,43 @@ typedef struct {
|
|
478
480
|
xmlDocPtr doc;
|
479
481
|
} ParseArgs;
|
480
482
|
|
481
|
-
static
|
483
|
+
static void parse_args_mark(void *parse_args) {
|
484
|
+
ParseArgs *args = parse_args;
|
485
|
+
rb_gc_mark_maybe(args->input);
|
486
|
+
rb_gc_mark_maybe(args->url_or_frag);
|
487
|
+
}
|
488
|
+
|
489
|
+
// Wrap a ParseArgs pointer. The underlying ParseArgs must outlive the
|
490
|
+
// wrapper.
|
491
|
+
static VALUE wrap_parse_args(ParseArgs *args) {
|
492
|
+
return Data_Wrap_Struct(rb_cData, parse_args_mark, RUBY_NEVER_FREE, args);
|
493
|
+
}
|
494
|
+
|
495
|
+
// Returns the underlying ParseArgs wrapped by wrap_parse_args.
|
496
|
+
static ParseArgs *unwrap_parse_args(VALUE obj) {
|
497
|
+
ParseArgs *args;
|
498
|
+
Data_Get_Struct(obj, ParseArgs, args);
|
499
|
+
return args;
|
500
|
+
}
|
501
|
+
|
502
|
+
static VALUE parse_cleanup(VALUE parse_args) {
|
503
|
+
ParseArgs *args = unwrap_parse_args(parse_args);
|
482
504
|
gumbo_destroy_output(args->output);
|
505
|
+
// Make sure garbage collection doesn't mark the objects as being live based
|
506
|
+
// on references from the ParseArgs. This may be unnecessary.
|
507
|
+
args->input = Qnil;
|
508
|
+
args->url_or_frag = Qnil;
|
483
509
|
if (args->doc != NIL)
|
484
510
|
xmlFreeDoc(args->doc);
|
485
511
|
return Qnil;
|
486
512
|
}
|
487
513
|
|
488
|
-
|
489
|
-
static VALUE parse_continue(ParseArgs *args);
|
514
|
+
static VALUE parse_continue(VALUE parse_args);
|
490
515
|
|
491
516
|
// Parse a string using gumbo_parse into a Nokogiri document
|
492
|
-
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE max_depth) {
|
517
|
+
static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth) {
|
493
518
|
GumboOptions options = kGumboDefaultOptions;
|
519
|
+
options.max_attributes = NUM2INT(max_attributes);
|
494
520
|
options.max_errors = NUM2INT(max_errors);
|
495
521
|
options.max_tree_depth = NUM2INT(max_depth);
|
496
522
|
|
@@ -501,10 +527,13 @@ static VALUE parse(VALUE self, VALUE input, VALUE url, VALUE max_errors, VALUE m
|
|
501
527
|
.url_or_frag = url,
|
502
528
|
.doc = NIL,
|
503
529
|
};
|
504
|
-
|
530
|
+
VALUE parse_args = wrap_parse_args(&args);
|
531
|
+
|
532
|
+
return rb_ensure(parse_continue, parse_args, parse_cleanup, parse_args);
|
505
533
|
}
|
506
534
|
|
507
|
-
static VALUE parse_continue(
|
535
|
+
static VALUE parse_continue(VALUE parse_args) {
|
536
|
+
ParseArgs *args = unwrap_parse_args(parse_args);
|
508
537
|
GumboOutput *output = args->output;
|
509
538
|
xmlDocPtr doc;
|
510
539
|
if (output->document->v.document.has_doctype) {
|
@@ -562,13 +591,14 @@ static xmlNodePtr extract_xml_node(VALUE node) {
|
|
562
591
|
#endif
|
563
592
|
}
|
564
593
|
|
565
|
-
static VALUE fragment_continue(
|
594
|
+
static VALUE fragment_continue(VALUE parse_args);
|
566
595
|
|
567
596
|
static VALUE fragment (
|
568
597
|
VALUE self,
|
569
598
|
VALUE doc_fragment,
|
570
599
|
VALUE tags,
|
571
600
|
VALUE ctx,
|
601
|
+
VALUE max_attributes,
|
572
602
|
VALUE max_errors,
|
573
603
|
VALUE max_depth
|
574
604
|
) {
|
@@ -675,6 +705,7 @@ static VALUE fragment (
|
|
675
705
|
// Perform a fragment parse.
|
676
706
|
int depth = NUM2INT(max_depth);
|
677
707
|
GumboOptions options = kGumboDefaultOptions;
|
708
|
+
options.max_attributes = NUM2INT(max_attributes);
|
678
709
|
options.max_errors = NUM2INT(max_errors);
|
679
710
|
// Add one to account for the HTML element.
|
680
711
|
options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
|
@@ -691,11 +722,13 @@ static VALUE fragment (
|
|
691
722
|
.url_or_frag = doc_fragment,
|
692
723
|
.doc = (xmlDocPtr)extract_xml_node(doc),
|
693
724
|
};
|
694
|
-
|
725
|
+
VALUE parse_args = wrap_parse_args(&args);
|
726
|
+
rb_ensure(fragment_continue, parse_args, parse_cleanup, parse_args);
|
695
727
|
return Qnil;
|
696
728
|
}
|
697
729
|
|
698
|
-
static VALUE fragment_continue(
|
730
|
+
static VALUE fragment_continue(VALUE parse_args) {
|
731
|
+
ParseArgs *args = unwrap_parse_args(parse_args);
|
699
732
|
GumboOutput *output = args->output;
|
700
733
|
VALUE doc_fragment = args->url_or_frag;
|
701
734
|
xmlDocPtr xml_doc = args->doc;
|
@@ -709,27 +742,38 @@ static VALUE fragment_continue(ParseArgs *args) {
|
|
709
742
|
|
710
743
|
// Initialize the Nokogumbo class and fetch constants we will use later.
|
711
744
|
void Init_nokogumbo() {
|
712
|
-
rb_funcall(rb_mKernel,
|
745
|
+
rb_funcall(rb_mKernel, rb_intern_const("gem"), 1, rb_utf8_str_new_static("nokogiri", 8));
|
713
746
|
rb_require("nokogiri");
|
714
747
|
|
715
|
-
|
748
|
+
VALUE line_supported = Qtrue;
|
749
|
+
|
750
|
+
#if !NGLIB
|
716
751
|
// Class constants.
|
717
752
|
VALUE mNokogiri = rb_const_get(rb_cObject, rb_intern_const("Nokogiri"));
|
718
753
|
VALUE mNokogiriXml = rb_const_get(mNokogiri, rb_intern_const("XML"));
|
719
754
|
cNokogiriXmlSyntaxError = rb_const_get(mNokogiriXml, rb_intern_const("SyntaxError"));
|
755
|
+
rb_gc_register_mark_object(cNokogiriXmlSyntaxError);
|
720
756
|
cNokogiriXmlElement = rb_const_get(mNokogiriXml, rb_intern_const("Element"));
|
757
|
+
rb_gc_register_mark_object(cNokogiriXmlElement);
|
721
758
|
cNokogiriXmlText = rb_const_get(mNokogiriXml, rb_intern_const("Text"));
|
759
|
+
rb_gc_register_mark_object(cNokogiriXmlText);
|
722
760
|
cNokogiriXmlCData = rb_const_get(mNokogiriXml, rb_intern_const("CDATA"));
|
761
|
+
rb_gc_register_mark_object(cNokogiriXmlCData);
|
723
762
|
cNokogiriXmlComment = rb_const_get(mNokogiriXml, rb_intern_const("Comment"));
|
763
|
+
rb_gc_register_mark_object(cNokogiriXmlComment);
|
724
764
|
|
725
765
|
// Interned symbols.
|
726
766
|
new = rb_intern_const("new");
|
727
767
|
node_name_ = rb_intern_const("node_name=");
|
768
|
+
|
769
|
+
// #line is not supported (returns 0)
|
770
|
+
line_supported = Qfalse;
|
728
771
|
#endif
|
729
772
|
|
730
773
|
// Class constants.
|
731
774
|
VALUE HTML5 = rb_const_get(mNokogiri, rb_intern_const("HTML5"));
|
732
775
|
Document = rb_const_get(HTML5, rb_intern_const("Document"));
|
776
|
+
rb_gc_register_mark_object(Document);
|
733
777
|
|
734
778
|
// Interned symbols.
|
735
779
|
internal_subset = rb_intern_const("internal_subset");
|
@@ -737,8 +781,13 @@ void Init_nokogumbo() {
|
|
737
781
|
|
738
782
|
// Define Nokogumbo module with parse and fragment methods.
|
739
783
|
VALUE Gumbo = rb_define_module("Nokogumbo");
|
740
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
741
|
-
rb_define_singleton_method(Gumbo, "fragment", fragment,
|
784
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 5);
|
785
|
+
rb_define_singleton_method(Gumbo, "fragment", fragment, 6);
|
786
|
+
|
787
|
+
// Add private constant for testing.
|
788
|
+
rb_define_const(Gumbo, "LINE_SUPPORTED", line_supported);
|
789
|
+
rb_funcall(Gumbo, rb_intern_const("private_constant"), 1,
|
790
|
+
rb_utf8_str_new_cstr("LINE_SUPPORTED"));
|
742
791
|
}
|
743
792
|
|
744
793
|
// vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
data/gumbo-parser/src/error.c
CHANGED
@@ -365,11 +365,14 @@ static void handle_parser_error (
|
|
365
365
|
// pointer to the beginning of the string if this is the first line.
|
366
366
|
static const char* find_prev_newline (
|
367
367
|
const char* source_text,
|
368
|
+
size_t source_length,
|
368
369
|
const char* error_location
|
369
370
|
) {
|
371
|
+
const char* source_end = source_text + source_length;
|
370
372
|
assert(error_location >= source_text);
|
373
|
+
assert(error_location <= source_end);
|
371
374
|
const char* c = error_location;
|
372
|
-
if (*c == '\n'
|
375
|
+
if (c != source_text && (error_location == source_end || *c == '\n'))
|
373
376
|
--c;
|
374
377
|
while (c != source_text && *c != '\n')
|
375
378
|
--c;
|
@@ -377,20 +380,25 @@ static const char* find_prev_newline (
|
|
377
380
|
}
|
378
381
|
|
379
382
|
// Finds the next newline in the original source buffer from a given byte
|
380
|
-
// location. Returns a character pointer to that newline, or a pointer to
|
381
|
-
//
|
383
|
+
// location. Returns a character pointer to that newline, or a pointer to
|
384
|
+
// source_text + source_length if this is the last line.
|
382
385
|
static const char* find_next_newline(
|
383
|
-
const char*
|
386
|
+
const char* source_text,
|
387
|
+
size_t source_length,
|
384
388
|
const char* error_location
|
385
389
|
) {
|
386
|
-
|
390
|
+
const char* source_end = source_text + source_length;
|
391
|
+
assert(error_location >= source_text);
|
392
|
+
assert(error_location <= source_end);
|
387
393
|
const char* c = error_location;
|
388
|
-
while (c !=
|
394
|
+
while (c != source_end && *c != '\n')
|
389
395
|
++c;
|
390
396
|
return c;
|
391
397
|
}
|
392
398
|
|
393
399
|
GumboError* gumbo_add_error(GumboParser* parser) {
|
400
|
+
parser->_output->document_error = true;
|
401
|
+
|
394
402
|
int max_errors = parser->_options->max_errors;
|
395
403
|
if (max_errors >= 0 && parser->_output->errors.length >= (unsigned int) max_errors) {
|
396
404
|
return NULL;
|
@@ -547,8 +555,9 @@ void caret_diagnostic_to_string (
|
|
547
555
|
) {
|
548
556
|
error_to_string(error, output);
|
549
557
|
|
550
|
-
const char*
|
551
|
-
const char*
|
558
|
+
const char* error_text = error->original_text.data;
|
559
|
+
const char* line_start = find_prev_newline(source_text, source_length, error_text);
|
560
|
+
const char* line_end = find_next_newline(source_text, source_length, error_text);
|
552
561
|
GumboStringPiece original_line;
|
553
562
|
original_line.data = line_start;
|
554
563
|
original_line.length = line_end - line_start;
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -706,6 +706,15 @@ typedef struct GumboInternalOptions {
|
|
706
706
|
*/
|
707
707
|
bool stop_on_first_error;
|
708
708
|
|
709
|
+
/**
|
710
|
+
* Maximum allowed number of attributes per element. If this limit is
|
711
|
+
* exceeded, the parser will return early with a partial document and
|
712
|
+
* the returned `GumboOutput` will have its `status` field set to
|
713
|
+
* `GUMBO_STATUS_TOO_MANY_ATTRIBUTES`. Set to `-1` to disable the limit.
|
714
|
+
* Default: `400`.
|
715
|
+
*/
|
716
|
+
int max_attributes;
|
717
|
+
|
709
718
|
/**
|
710
719
|
* Maximum allowed depth for the parse tree. If this limit is exceeded,
|
711
720
|
* the parser will return early with a partial document and the returned
|
@@ -796,6 +805,16 @@ typedef enum {
|
|
796
805
|
*/
|
797
806
|
GUMBO_STATUS_TREE_TOO_DEEP,
|
798
807
|
|
808
|
+
/**
|
809
|
+
* Indicates that the maximum number of attributes per element
|
810
|
+
* (`GumboOptions::max_attributes`) was reached during parsing. The
|
811
|
+
* resulting tree will be a partial document, with no further nodes
|
812
|
+
* created after the point where the limit was reached. The partial
|
813
|
+
* document may be useful for constructing an error message but
|
814
|
+
* typically shouldn't be used for other purposes.
|
815
|
+
*/
|
816
|
+
GUMBO_STATUS_TOO_MANY_ATTRIBUTES,
|
817
|
+
|
799
818
|
// Currently unused
|
800
819
|
GUMBO_STATUS_OUT_OF_MEMORY,
|
801
820
|
} GumboOutputStatus;
|
@@ -820,6 +839,14 @@ typedef struct GumboInternalOutput {
|
|
820
839
|
*/
|
821
840
|
GumboVector /* GumboError */ errors;
|
822
841
|
|
842
|
+
/**
|
843
|
+
* True if the parser encountered an error.
|
844
|
+
*
|
845
|
+
* This can be true and `errors` an empty `GumboVector` if the `max_errors`
|
846
|
+
* option was set to 0.
|
847
|
+
*/
|
848
|
+
bool document_error;
|
849
|
+
|
823
850
|
/**
|
824
851
|
* A status code indicating whether parsing finished successfully or was
|
825
852
|
* stopped mid-document due to exceptional circumstances.
|