nokogumbo 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
|
4
|
+
data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
|
7
|
+
data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
|
data/README.md
CHANGED
@@ -36,7 +36,13 @@ Example
|
|
36
36
|
-----
|
37
37
|
```ruby
|
38
38
|
require 'nokogumbo'
|
39
|
-
puts Nokogiri::HTML5.get('http://nokogiri.org').
|
39
|
+
puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
|
40
|
+
```
|
41
|
+
|
42
|
+
Use `.to_html` instead of `.to_s` when parsing and serializing multiple times
|
43
|
+
```
|
44
|
+
require 'nokogumbo'
|
45
|
+
Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
|
40
46
|
```
|
41
47
|
|
42
48
|
Notes
|
@@ -83,5 +89,5 @@ Installation
|
|
83
89
|
Related efforts
|
84
90
|
============
|
85
91
|
|
86
|
-
* [ruby-gumbo](https://github.com/
|
92
|
+
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
|
87
93
|
for the Gumbo HTML5 parser.
|
data/ext/nokogumboc/extconf.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'mkmf'
|
2
2
|
$CFLAGS += " -std=c99"
|
3
3
|
|
4
|
-
if have_library('xml2', 'xmlNewDoc')
|
4
|
+
if have_library('xml2', 'xmlNewDoc')
|
5
5
|
# libxml2 libraries from http://www.xmlsoft.org/
|
6
6
|
pkg_config('libxml-2.0')
|
7
7
|
|
@@ -19,11 +19,6 @@ if have_library('xml2', 'xmlNewDoc')
|
|
19
19
|
|
20
20
|
# if found, enable direct calls to Nokogiri (and libxml2)
|
21
21
|
$CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
|
22
|
-
|
23
|
-
if File.exists?("/etc/gentoo-release")
|
24
|
-
# link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document'
|
25
|
-
$LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so"
|
26
|
-
end
|
27
22
|
end
|
28
23
|
end
|
29
24
|
|
@@ -45,4 +40,21 @@ unless have_library('gumbo', 'gumbo_parse')
|
|
45
40
|
end
|
46
41
|
end
|
47
42
|
|
43
|
+
# We use some Gumbo Internals, and not all distros ship the internal headers.
|
44
|
+
header_typedefs = {
|
45
|
+
'error.h' => 'GumboErrorType',
|
46
|
+
'insertion_mode.h' => 'GumboInsertionMode',
|
47
|
+
'parser.h' => 'GumboParser',
|
48
|
+
'string_buffer.h' => 'GumboStringBuffer',
|
49
|
+
'token_type.h' => 'GumboTokenType',
|
50
|
+
}
|
51
|
+
|
52
|
+
header_typedefs.each_pair do |header, type|
|
53
|
+
unless find_type(type, header)
|
54
|
+
require 'fileutils'
|
55
|
+
FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/#{header}"],
|
56
|
+
"#{rakehome}/ext/nokogumboc"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
48
60
|
create_makefile('nokogumboc')
|
data/ext/nokogumboc/nokogumbo.c
CHANGED
@@ -19,10 +19,13 @@
|
|
19
19
|
//
|
20
20
|
|
21
21
|
#include <ruby.h>
|
22
|
-
#include
|
22
|
+
#include "gumbo.h"
|
23
|
+
#include "error.h"
|
24
|
+
#include "parser.h"
|
23
25
|
|
24
26
|
// class constants
|
25
27
|
static VALUE Document;
|
28
|
+
static VALUE XMLSyntaxError;
|
26
29
|
|
27
30
|
#ifdef NGLIB
|
28
31
|
#include <nokogiri.h>
|
@@ -82,8 +85,11 @@ static VALUE xmlNewDoc(char* version) {
|
|
82
85
|
}
|
83
86
|
#endif
|
84
87
|
|
85
|
-
// Build a
|
86
|
-
static xmlNodePtr walk_tree(xmlDocPtr document,
|
88
|
+
// Build a xmlNodePtr for a given GumboNode (recursively)
|
89
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
90
|
+
|
91
|
+
// Build a xmlNodePtr for a given GumboElement (recursively)
|
92
|
+
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
87
93
|
// determine tag name for a given node
|
88
94
|
xmlNodePtr element;
|
89
95
|
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
@@ -151,55 +157,108 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
|
151
157
|
// add in the children
|
152
158
|
GumboVector* children = &node->children;
|
153
159
|
for (int i=0; i < children->length; i++) {
|
154
|
-
|
155
|
-
|
156
|
-
xmlNodePtr node = NIL;
|
157
|
-
|
158
|
-
switch (child->type) {
|
159
|
-
case GUMBO_NODE_ELEMENT:
|
160
|
-
node = walk_tree(document, &child->v.element);
|
161
|
-
break;
|
162
|
-
case GUMBO_NODE_WHITESPACE:
|
163
|
-
case GUMBO_NODE_TEXT:
|
164
|
-
node = xmlNewDocText(document, CONST_CAST child->v.text.text);
|
165
|
-
break;
|
166
|
-
case GUMBO_NODE_CDATA:
|
167
|
-
node = xmlNewCDataBlock(document,
|
168
|
-
CONST_CAST child->v.text.original_text.data,
|
169
|
-
(int) child->v.text.original_text.length);
|
170
|
-
break;
|
171
|
-
case GUMBO_NODE_COMMENT:
|
172
|
-
node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
|
173
|
-
break;
|
174
|
-
case GUMBO_NODE_DOCUMENT:
|
175
|
-
break; // should never happen -- ignore
|
176
|
-
}
|
177
|
-
|
160
|
+
xmlNodePtr node = walk_tree(document, children->data[i]);
|
178
161
|
if (node) xmlAddChild(element, node);
|
179
162
|
}
|
180
163
|
|
181
164
|
return element;
|
182
165
|
}
|
183
166
|
|
167
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
168
|
+
switch (node->type) {
|
169
|
+
case GUMBO_NODE_DOCUMENT:
|
170
|
+
return NIL;
|
171
|
+
case GUMBO_NODE_ELEMENT:
|
172
|
+
case GUMBO_NODE_TEMPLATE:
|
173
|
+
return walk_element(document, &node->v.element);
|
174
|
+
case GUMBO_NODE_TEXT:
|
175
|
+
case GUMBO_NODE_WHITESPACE:
|
176
|
+
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
177
|
+
case GUMBO_NODE_CDATA:
|
178
|
+
return xmlNewCDataBlock(document,
|
179
|
+
CONST_CAST node->v.text.original_text.data,
|
180
|
+
(int) node->v.text.original_text.length);
|
181
|
+
case GUMBO_NODE_COMMENT:
|
182
|
+
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|
184
186
|
// Parse a string using gumbo_parse into a Nokogiri document
|
185
|
-
static VALUE parse(VALUE self, VALUE string) {
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
187
|
+
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
188
|
+
GumboOptions options;
|
189
|
+
memcpy(&options, &kGumboDefaultOptions, sizeof options);
|
190
|
+
options.max_errors = NUM2INT(max_parse_errors);
|
191
|
+
|
192
|
+
const char *input = RSTRING_PTR(string);
|
193
|
+
size_t input_len = RSTRING_LEN(string);
|
194
|
+
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
|
190
195
|
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
191
|
-
|
192
|
-
|
196
|
+
#ifdef NGLIB
|
197
|
+
doc->type = XML_HTML_DOCUMENT_NODE;
|
198
|
+
#endif
|
193
199
|
if (output->document->v.document.has_doctype) {
|
200
|
+
const char *name = output->document->v.document.name;
|
194
201
|
const char *public = output->document->v.document.public_identifier;
|
195
202
|
const char *system = output->document->v.document.system_identifier;
|
196
|
-
xmlCreateIntSubset(doc, CONST_CAST
|
197
|
-
(
|
198
|
-
(
|
203
|
+
xmlCreateIntSubset(doc, CONST_CAST name,
|
204
|
+
(public[0] ? CONST_CAST public : NIL),
|
205
|
+
(system[0] ? CONST_CAST system : NIL));
|
199
206
|
}
|
200
|
-
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
201
207
|
|
202
|
-
|
208
|
+
GumboVector *children = &output->document->v.document.children;
|
209
|
+
for (int i=0; i < children->length; i++) {
|
210
|
+
GumboNode *child = children->data[i];
|
211
|
+
xmlNodePtr node = walk_tree(doc, child);
|
212
|
+
if (node) {
|
213
|
+
if (child == output->root)
|
214
|
+
xmlDocSetRootElement(doc, node);
|
215
|
+
else
|
216
|
+
xmlAddChild((xmlNodePtr)doc, node);
|
217
|
+
}
|
218
|
+
}
|
219
|
+
|
220
|
+
VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
|
221
|
+
|
222
|
+
// Add parse errors to rdoc.
|
223
|
+
if (output->errors.length) {
|
224
|
+
GumboVector *errors = &output->errors;
|
225
|
+
GumboParser parser = { ._options = &options };
|
226
|
+
GumboStringBuffer msg;
|
227
|
+
VALUE rerrors = rb_ary_new2(errors->length);
|
228
|
+
|
229
|
+
gumbo_string_buffer_init(&parser, &msg);
|
230
|
+
for (int i=0; i < errors->length; i++) {
|
231
|
+
GumboError *err = errors->data[i];
|
232
|
+
gumbo_string_buffer_clear(&parser, &msg);
|
233
|
+
// Work around bug in gumbo_caret_diagnostic_to_string.
|
234
|
+
// See https://github.com/google/gumbo-parser/pull/371
|
235
|
+
// The bug occurs when the error starts with a newline (unless it's the
|
236
|
+
// first character in the input--but that shouldn't cause an error in
|
237
|
+
// the first place.
|
238
|
+
if (*err->original_text == '\n' && err->original_text != input)
|
239
|
+
--err->original_text;
|
240
|
+
gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
|
241
|
+
VALUE err_str = rb_str_new(msg.data, msg.length);
|
242
|
+
VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
|
243
|
+
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
|
244
|
+
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
245
|
+
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
246
|
+
rb_iv_set(syntax_error, "@file", Qnil);
|
247
|
+
rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
|
248
|
+
rb_iv_set(syntax_error, "@str1", Qnil);
|
249
|
+
rb_iv_set(syntax_error, "@str2", Qnil);
|
250
|
+
rb_iv_set(syntax_error, "@str3", Qnil);
|
251
|
+
rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
|
252
|
+
rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
|
253
|
+
rb_ary_push(rerrors, syntax_error);
|
254
|
+
}
|
255
|
+
rb_iv_set(rdoc, "@errors", rerrors);
|
256
|
+
gumbo_string_buffer_destroy(&parser, &msg);
|
257
|
+
}
|
258
|
+
|
259
|
+
gumbo_destroy_output(&options, output);
|
260
|
+
|
261
|
+
return rdoc;
|
203
262
|
}
|
204
263
|
|
205
264
|
// Initialize the Nokogumbo class and fetch constants we will use later
|
@@ -211,10 +270,11 @@ void Init_nokogumboc() {
|
|
211
270
|
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
212
271
|
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
213
272
|
Document = rb_const_get(HTML, rb_intern("Document"));
|
273
|
+
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
274
|
+
XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
|
214
275
|
|
215
276
|
#ifndef NGLIB
|
216
277
|
// more class constants
|
217
|
-
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
218
278
|
Element = rb_const_get(XML, rb_intern("Element"));
|
219
279
|
Text = rb_const_get(XML, rb_intern("Text"));
|
220
280
|
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
@@ -223,7 +283,7 @@ void Init_nokogumboc() {
|
|
223
283
|
// interned symbols
|
224
284
|
new = rb_intern("new");
|
225
285
|
set_attribute = rb_intern("set_attribute");
|
226
|
-
add_child = rb_intern("
|
286
|
+
add_child = rb_intern("add_child_node_and_reparent_attrs");
|
227
287
|
internal_subset = rb_intern("internal_subset");
|
228
288
|
remove_ = rb_intern("remove");
|
229
289
|
create_internal_subset = rb_intern("create_internal_subset");
|
@@ -231,5 +291,5 @@ void Init_nokogumboc() {
|
|
231
291
|
|
232
292
|
// define Nokogumbo class with a singleton parse method
|
233
293
|
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
234
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
294
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 2);
|
235
295
|
}
|
@@ -27,7 +27,7 @@ struct GumboInternalParser;
|
|
27
27
|
|
28
28
|
GumboAttribute* gumbo_get_attribute(
|
29
29
|
const GumboVector* attributes, const char* name) {
|
30
|
-
for (int i = 0; i < attributes->length; ++i) {
|
30
|
+
for (unsigned int i = 0; i < attributes->length; ++i) {
|
31
31
|
GumboAttribute* attr = attributes->data[i];
|
32
32
|
if (!strcasecmp(attr->name, name)) {
|
33
33
|
return attr;
|
data/gumbo-parser/src/char_ref.c
CHANGED
@@ -30,7 +30,7 @@
|
|
30
30
|
#include <ctype.h>
|
31
31
|
#include <stddef.h>
|
32
32
|
#include <stdio.h>
|
33
|
-
#include <string.h>
|
33
|
+
#include <string.h> // Only for debug assertions at present.
|
34
34
|
|
35
35
|
#include "error.h"
|
36
36
|
#include "string_piece.h"
|
@@ -49,44 +49,18 @@ typedef struct {
|
|
49
49
|
int to_char;
|
50
50
|
} CharReplacement;
|
51
51
|
|
52
|
-
static const CharReplacement kCharReplacements[] = {
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
{ 0x89, 0x2030 },
|
65
|
-
{ 0x8A, 0x0160 },
|
66
|
-
{ 0x8B, 0x2039 },
|
67
|
-
{ 0x8C, 0x0152 },
|
68
|
-
{ 0x8D, 0x008D },
|
69
|
-
{ 0x8E, 0x017D },
|
70
|
-
{ 0x8F, 0x008F },
|
71
|
-
{ 0x90, 0x0090 },
|
72
|
-
{ 0x91, 0x2018 },
|
73
|
-
{ 0x92, 0x2019 },
|
74
|
-
{ 0x93, 0x201C },
|
75
|
-
{ 0x94, 0x201D },
|
76
|
-
{ 0x95, 0x2022 },
|
77
|
-
{ 0x96, 0x2013 },
|
78
|
-
{ 0x97, 0x2014 },
|
79
|
-
{ 0x98, 0x02DC },
|
80
|
-
{ 0x99, 0x2122 },
|
81
|
-
{ 0x9A, 0x0161 },
|
82
|
-
{ 0x9B, 0x203A },
|
83
|
-
{ 0x9C, 0x0153 },
|
84
|
-
{ 0x9D, 0x009D },
|
85
|
-
{ 0x9E, 0x017E },
|
86
|
-
{ 0x9F, 0x0178 },
|
87
|
-
// Terminator.
|
88
|
-
{ -1, -1 }
|
89
|
-
};
|
52
|
+
static const CharReplacement kCharReplacements[] = {{0x00, 0xfffd},
|
53
|
+
{0x0d, 0x000d}, {0x80, 0x20ac}, {0x81, 0x0081}, {0x82, 0x201A},
|
54
|
+
{0x83, 0x0192}, {0x84, 0x201E}, {0x85, 0x2026}, {0x86, 0x2020},
|
55
|
+
{0x87, 0x2021}, {0x88, 0x02C6}, {0x89, 0x2030}, {0x8A, 0x0160},
|
56
|
+
{0x8B, 0x2039}, {0x8C, 0x0152}, {0x8D, 0x008D}, {0x8E, 0x017D},
|
57
|
+
{0x8F, 0x008F}, {0x90, 0x0090}, {0x91, 0x2018}, {0x92, 0x2019},
|
58
|
+
{0x93, 0x201C}, {0x94, 0x201D}, {0x95, 0x2022}, {0x96, 0x2013},
|
59
|
+
{0x97, 0x2014}, {0x98, 0x02DC}, {0x99, 0x2122}, {0x9A, 0x0161},
|
60
|
+
{0x9B, 0x203A}, {0x9C, 0x0153}, {0x9D, 0x009D}, {0x9E, 0x017E},
|
61
|
+
{0x9F, 0x0178},
|
62
|
+
// Terminator.
|
63
|
+
{-1, -1}};
|
90
64
|
|
91
65
|
static int parse_digit(int c, bool allow_hex) {
|
92
66
|
if (c >= '0' && c <= '9') {
|
@@ -111,9 +85,8 @@ static void add_no_digit_error(
|
|
111
85
|
error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS;
|
112
86
|
}
|
113
87
|
|
114
|
-
static void add_codepoint_error(
|
115
|
-
|
116
|
-
GumboErrorType type, int codepoint) {
|
88
|
+
static void add_codepoint_error(struct GumboInternalParser* parser,
|
89
|
+
Utf8Iterator* input, GumboErrorType type, int codepoint) {
|
117
90
|
GumboError* error = gumbo_add_error(parser);
|
118
91
|
if (!error) {
|
119
92
|
return;
|
@@ -123,9 +96,8 @@ static void add_codepoint_error(
|
|
123
96
|
error->v.codepoint = codepoint;
|
124
97
|
}
|
125
98
|
|
126
|
-
static void add_named_reference_error(
|
127
|
-
|
128
|
-
GumboErrorType type, GumboStringPiece text) {
|
99
|
+
static void add_named_reference_error(struct GumboInternalParser* parser,
|
100
|
+
Utf8Iterator* input, GumboErrorType type, GumboStringPiece text) {
|
129
101
|
GumboError* error = gumbo_add_error(parser);
|
130
102
|
if (!error) {
|
131
103
|
return;
|
@@ -211,8 +183,7 @@ static bool maybe_add_invalid_named_reference(
|
|
211
183
|
// worry about consuming characters.
|
212
184
|
const char* start = utf8iterator_get_char_pointer(input);
|
213
185
|
int c = utf8iterator_current(input);
|
214
|
-
while ((c >= 'a' && c <= 'z') ||
|
215
|
-
(c >= 'A' && c <= 'Z') ||
|
186
|
+
while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
216
187
|
(c >= '0' && c <= '9')) {
|
217
188
|
utf8iterator_next(input);
|
218
189
|
c = utf8iterator_current(input);
|
@@ -228,12 +199,11 @@ static bool maybe_add_invalid_named_reference(
|
|
228
199
|
return true;
|
229
200
|
}
|
230
201
|
|
231
|
-
|
232
202
|
#line 2465 "char_ref.rl"
|
233
203
|
|
204
|
+
// clang-format off
|
234
205
|
|
235
|
-
|
236
|
-
#line 237 "char_ref.c"
|
206
|
+
#line 238 "char_ref.c"
|
237
207
|
static const short _char_ref_actions[] = {
|
238
208
|
0, 1, 0, 1, 1, 1, 2, 1,
|
239
209
|
3, 1, 4, 1, 5, 1, 6, 1,
|
@@ -13960,17 +13930,15 @@ static const short _char_ref_eof_trans[] = {
|
|
13960
13930
|
};
|
13961
13931
|
|
13962
13932
|
static const int char_ref_start = 7623;
|
13963
|
-
static const int char_ref_first_final = 7623;
|
13964
|
-
static const int char_ref_error = 0;
|
13965
13933
|
|
13966
13934
|
static const int char_ref_en_valid_named_ref = 7623;
|
13967
13935
|
|
13968
13936
|
|
13969
|
-
#line
|
13937
|
+
#line 2469 "char_ref.rl"
|
13938
|
+
// clang-format on
|
13970
13939
|
|
13971
|
-
static bool consume_named_ref(
|
13972
|
-
|
13973
|
-
OneOrTwoCodepoints* output) {
|
13940
|
+
static bool consume_named_ref(struct GumboInternalParser* parser,
|
13941
|
+
Utf8Iterator* input, bool is_in_attribute, OneOrTwoCodepoints* output) {
|
13974
13942
|
assert(output->first == kGumboNoChar);
|
13975
13943
|
const char* p = utf8iterator_get_char_pointer(input);
|
13976
13944
|
const char* pe = utf8iterator_get_end_pointer(input);
|
@@ -13979,8 +13947,9 @@ static bool consume_named_ref(
|
|
13979
13947
|
const char *ts, *start;
|
13980
13948
|
int cs, act;
|
13981
13949
|
|
13950
|
+
// clang-format off
|
13982
13951
|
|
13983
|
-
#line
|
13952
|
+
#line 13985 "char_ref.c"
|
13984
13953
|
{
|
13985
13954
|
cs = char_ref_start;
|
13986
13955
|
ts = 0;
|
@@ -13988,14 +13957,15 @@ static bool consume_named_ref(
|
|
13988
13957
|
act = 0;
|
13989
13958
|
}
|
13990
13959
|
|
13991
|
-
#line
|
13960
|
+
#line 2484 "char_ref.rl"
|
13992
13961
|
// Avoid unused variable warnings.
|
13993
13962
|
(void) act;
|
13994
13963
|
(void) ts;
|
13964
|
+
(void) char_ref_en_valid_named_ref;
|
13995
13965
|
|
13996
13966
|
start = p;
|
13997
13967
|
|
13998
|
-
#line
|
13968
|
+
#line 14001 "char_ref.c"
|
13999
13969
|
{
|
14000
13970
|
int _slen;
|
14001
13971
|
int _trans;
|
@@ -14017,7 +13987,7 @@ _resume:
|
|
14017
13987
|
#line 1 "NONE"
|
14018
13988
|
{ts = p;}
|
14019
13989
|
break;
|
14020
|
-
#line
|
13990
|
+
#line 14023 "char_ref.c"
|
14021
13991
|
}
|
14022
13992
|
}
|
14023
13993
|
|
@@ -23000,7 +22970,7 @@ _eof_trans:
|
|
23000
22970
|
#line 2273 "char_ref.rl"
|
23001
22971
|
{{p = ((te))-1;}{ output->first = 0xd7; {p++; goto _out; } }}
|
23002
22972
|
break;
|
23003
|
-
#line
|
22973
|
+
#line 23006 "char_ref.c"
|
23004
22974
|
}
|
23005
22975
|
}
|
23006
22976
|
|
@@ -23013,7 +22983,7 @@ _again:
|
|
23013
22983
|
#line 1 "NONE"
|
23014
22984
|
{ts = 0;}
|
23015
22985
|
break;
|
23016
|
-
#line
|
22986
|
+
#line 23019 "char_ref.c"
|
23017
22987
|
}
|
23018
22988
|
}
|
23019
22989
|
|
@@ -23033,7 +23003,8 @@ _again:
|
|
23033
23003
|
_out: {}
|
23034
23004
|
}
|
23035
23005
|
|
23036
|
-
#line
|
23006
|
+
#line 2491 "char_ref.rl"
|
23007
|
+
// clang-format on
|
23037
23008
|
|
23038
23009
|
if (cs >= 7623) {
|
23039
23010
|
assert(output->first != kGumboNoChar);
|
@@ -23067,10 +23038,9 @@ _again:
|
|
23067
23038
|
}
|
23068
23039
|
}
|
23069
23040
|
|
23070
|
-
bool consume_char_ref(
|
23071
|
-
struct
|
23072
|
-
|
23073
|
-
OneOrTwoCodepoints* output) {
|
23041
|
+
bool consume_char_ref(struct GumboInternalParser* parser,
|
23042
|
+
struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
|
23043
|
+
bool is_in_attribute, OneOrTwoCodepoints* output) {
|
23074
23044
|
utf8iterator_mark(input);
|
23075
23045
|
utf8iterator_next(input);
|
23076
23046
|
int c = utf8iterator_current(input);
|