nokogumbo 1.3.0 → 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 96fa61565f78d5491e0b6d5b505cf936524745eb848b8b6584fc15e20c7ae35b
|
4
|
+
data.tar.gz: e5416f71bbe90323f04b8aad4dc48b28947e43a9eb46f446f8ca1444f519a07b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 676bf3585d38cd4ad5c72b8b3afd4952e248c747683ae1072dd43f6ce1ccd279177e4d0c75a9821ed76d32806333128152231349d8d113ae5d81279580b13004
|
7
|
+
data.tar.gz: 3459078d96977399e75551c4a3ee5623091f48569984b771e540ec111125f5af91e39a8d78cbd3ce9280326b1b9395dc4a0b0d7f0a72294876682cb9fe35e3d9
|
data/README.md
CHANGED
@@ -36,7 +36,13 @@ Example
|
|
36
36
|
-----
|
37
37
|
```ruby
|
38
38
|
require 'nokogumbo'
|
39
|
-
puts Nokogiri::HTML5.get('http://nokogiri.org').
|
39
|
+
puts Nokogiri::HTML5.get('http://nokogiri.org').search('ol li')[2].text
|
40
|
+
```
|
41
|
+
|
42
|
+
Use `.to_html` instead of `.to_s` when parsing and serializing multiple times:
|
43
|
+
```
|
44
|
+
require 'nokogumbo'
|
45
|
+
Nokogiri::HTML5.parse(Nokogiri::HTML5.parse('<div></div> a').to_html).to_html
|
40
46
|
```
|
41
47
|
|
42
48
|
Notes
|
@@ -83,5 +89,5 @@ Installation
|
|
83
89
|
Related efforts
|
84
90
|
============
|
85
91
|
|
86
|
-
* [ruby-gumbo](https://github.com/
|
92
|
+
* [ruby-gumbo](https://github.com/nevir/ruby-gumbo#readme) - a ruby binding
|
87
93
|
for the Gumbo HTML5 parser.
|
data/ext/nokogumboc/extconf.rb
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
require 'mkmf'
|
2
2
|
$CFLAGS += " -std=c99"
|
3
3
|
|
4
|
-
if have_library('xml2', 'xmlNewDoc')
|
4
|
+
if have_library('xml2', 'xmlNewDoc')
|
5
5
|
# libxml2 libraries from http://www.xmlsoft.org/
|
6
6
|
pkg_config('libxml-2.0')
|
7
7
|
|
@@ -19,11 +19,6 @@ if have_library('xml2', 'xmlNewDoc')
|
|
19
19
|
|
20
20
|
# if found, enable direct calls to Nokogiri (and libxml2)
|
21
21
|
$CFLAGS += ' -DNGLIB' if find_header('nokogiri.h', nokogiri_ext)
|
22
|
-
|
23
|
-
if File.exists?("/etc/gentoo-release")
|
24
|
-
# link to the library to prevent: nokogumbo.c:(.text+0x26a): undefined reference to `Nokogiri_wrap_xml_document'
|
25
|
-
$LDFLAGS += " -L#{nokogiri_ext} -l:nokogiri.so"
|
26
|
-
end
|
27
22
|
end
|
28
23
|
end
|
29
24
|
|
@@ -45,4 +40,21 @@ unless have_library('gumbo', 'gumbo_parse')
|
|
45
40
|
end
|
46
41
|
end
|
47
42
|
|
43
|
+
# We use some Gumbo internals, and not all distros ship the internal headers.
|
44
|
+
header_typedefs = {
|
45
|
+
'error.h' => 'GumboErrorType',
|
46
|
+
'insertion_mode.h' => 'GumboInsertionMode',
|
47
|
+
'parser.h' => 'GumboParser',
|
48
|
+
'string_buffer.h' => 'GumboStringBuffer',
|
49
|
+
'token_type.h' => 'GumboTokenType',
|
50
|
+
}
|
51
|
+
|
52
|
+
header_typedefs.each_pair do |header, type|
|
53
|
+
unless find_type(type, header)
|
54
|
+
require 'fileutils'
|
55
|
+
FileUtils.cp Dir["#{rakehome}/gumbo-parser/src/#{header}"],
|
56
|
+
"#{rakehome}/ext/nokogumboc"
|
57
|
+
end
|
58
|
+
end
|
59
|
+
|
48
60
|
create_makefile('nokogumboc')
|
data/ext/nokogumboc/nokogumbo.c
CHANGED
@@ -19,10 +19,13 @@
|
|
19
19
|
//
|
20
20
|
|
21
21
|
#include <ruby.h>
|
22
|
-
#include
|
22
|
+
#include "gumbo.h"
|
23
|
+
#include "error.h"
|
24
|
+
#include "parser.h"
|
23
25
|
|
24
26
|
// class constants
|
25
27
|
static VALUE Document;
|
28
|
+
static VALUE XMLSyntaxError;
|
26
29
|
|
27
30
|
#ifdef NGLIB
|
28
31
|
#include <nokogiri.h>
|
@@ -82,8 +85,11 @@ static VALUE xmlNewDoc(char* version) {
|
|
82
85
|
}
|
83
86
|
#endif
|
84
87
|
|
85
|
-
// Build a
|
86
|
-
static xmlNodePtr walk_tree(xmlDocPtr document,
|
88
|
+
// Build an xmlNodePtr for a given GumboNode (recursively)
|
89
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node);
|
90
|
+
|
91
|
+
// Build an xmlNodePtr for a given GumboElement (recursively)
|
92
|
+
static xmlNodePtr walk_element(xmlDocPtr document, GumboElement *node) {
|
87
93
|
// determine tag name for a given node
|
88
94
|
xmlNodePtr element;
|
89
95
|
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
@@ -151,55 +157,108 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
|
151
157
|
// add in the children
|
152
158
|
GumboVector* children = &node->children;
|
153
159
|
for (int i=0; i < children->length; i++) {
|
154
|
-
|
155
|
-
|
156
|
-
xmlNodePtr node = NIL;
|
157
|
-
|
158
|
-
switch (child->type) {
|
159
|
-
case GUMBO_NODE_ELEMENT:
|
160
|
-
node = walk_tree(document, &child->v.element);
|
161
|
-
break;
|
162
|
-
case GUMBO_NODE_WHITESPACE:
|
163
|
-
case GUMBO_NODE_TEXT:
|
164
|
-
node = xmlNewDocText(document, CONST_CAST child->v.text.text);
|
165
|
-
break;
|
166
|
-
case GUMBO_NODE_CDATA:
|
167
|
-
node = xmlNewCDataBlock(document,
|
168
|
-
CONST_CAST child->v.text.original_text.data,
|
169
|
-
(int) child->v.text.original_text.length);
|
170
|
-
break;
|
171
|
-
case GUMBO_NODE_COMMENT:
|
172
|
-
node = xmlNewDocComment(document, CONST_CAST child->v.text.text);
|
173
|
-
break;
|
174
|
-
case GUMBO_NODE_DOCUMENT:
|
175
|
-
break; // should never happen -- ignore
|
176
|
-
}
|
177
|
-
|
160
|
+
xmlNodePtr node = walk_tree(document, children->data[i]);
|
178
161
|
if (node) xmlAddChild(element, node);
|
179
162
|
}
|
180
163
|
|
181
164
|
return element;
|
182
165
|
}
|
183
166
|
|
167
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
|
168
|
+
switch (node->type) {
|
169
|
+
case GUMBO_NODE_DOCUMENT:
|
170
|
+
return NIL;
|
171
|
+
case GUMBO_NODE_ELEMENT:
|
172
|
+
case GUMBO_NODE_TEMPLATE:
|
173
|
+
return walk_element(document, &node->v.element);
|
174
|
+
case GUMBO_NODE_TEXT:
|
175
|
+
case GUMBO_NODE_WHITESPACE:
|
176
|
+
return xmlNewDocText(document, CONST_CAST node->v.text.text);
|
177
|
+
case GUMBO_NODE_CDATA:
|
178
|
+
return xmlNewCDataBlock(document,
|
179
|
+
CONST_CAST node->v.text.original_text.data,
|
180
|
+
(int) node->v.text.original_text.length);
|
181
|
+
case GUMBO_NODE_COMMENT:
|
182
|
+
return xmlNewDocComment(document, CONST_CAST node->v.text.text);
|
183
|
+
}
|
184
|
+
}
|
185
|
+
|
184
186
|
// Parse a string using gumbo_parse into a Nokogiri document
|
185
|
-
static VALUE parse(VALUE self, VALUE string) {
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
187
|
+
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
|
188
|
+
GumboOptions options;
|
189
|
+
memcpy(&options, &kGumboDefaultOptions, sizeof options);
|
190
|
+
options.max_errors = NUM2INT(max_parse_errors);
|
191
|
+
|
192
|
+
const char *input = RSTRING_PTR(string);
|
193
|
+
size_t input_len = RSTRING_LEN(string);
|
194
|
+
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
|
190
195
|
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
191
|
-
|
192
|
-
|
196
|
+
#ifdef NGLIB
|
197
|
+
doc->type = XML_HTML_DOCUMENT_NODE;
|
198
|
+
#endif
|
193
199
|
if (output->document->v.document.has_doctype) {
|
200
|
+
const char *name = output->document->v.document.name;
|
194
201
|
const char *public = output->document->v.document.public_identifier;
|
195
202
|
const char *system = output->document->v.document.system_identifier;
|
196
|
-
xmlCreateIntSubset(doc, CONST_CAST
|
197
|
-
(
|
198
|
-
(
|
203
|
+
xmlCreateIntSubset(doc, CONST_CAST name,
|
204
|
+
(public[0] ? CONST_CAST public : NIL),
|
205
|
+
(system[0] ? CONST_CAST system : NIL));
|
199
206
|
}
|
200
|
-
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
201
207
|
|
202
|
-
|
208
|
+
GumboVector *children = &output->document->v.document.children;
|
209
|
+
for (int i=0; i < children->length; i++) {
|
210
|
+
GumboNode *child = children->data[i];
|
211
|
+
xmlNodePtr node = walk_tree(doc, child);
|
212
|
+
if (node) {
|
213
|
+
if (child == output->root)
|
214
|
+
xmlDocSetRootElement(doc, node);
|
215
|
+
else
|
216
|
+
xmlAddChild((xmlNodePtr)doc, node);
|
217
|
+
}
|
218
|
+
}
|
219
|
+
|
220
|
+
VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);
|
221
|
+
|
222
|
+
// Add parse errors to rdoc.
|
223
|
+
if (output->errors.length) {
|
224
|
+
GumboVector *errors = &output->errors;
|
225
|
+
GumboParser parser = { ._options = &options };
|
226
|
+
GumboStringBuffer msg;
|
227
|
+
VALUE rerrors = rb_ary_new2(errors->length);
|
228
|
+
|
229
|
+
gumbo_string_buffer_init(&parser, &msg);
|
230
|
+
for (int i=0; i < errors->length; i++) {
|
231
|
+
GumboError *err = errors->data[i];
|
232
|
+
gumbo_string_buffer_clear(&parser, &msg);
|
233
|
+
// Work around bug in gumbo_caret_diagnostic_to_string.
|
234
|
+
// See https://github.com/google/gumbo-parser/pull/371
|
235
|
+
// The bug occurs when the error starts with a newline (unless it's the
|
236
|
+
// first character in the input--but that shouldn't cause an error in
|
237
|
+
// the first place).
|
238
|
+
if (*err->original_text == '\n' && err->original_text != input)
|
239
|
+
--err->original_text;
|
240
|
+
gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
|
241
|
+
VALUE err_str = rb_str_new(msg.data, msg.length);
|
242
|
+
VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
|
243
|
+
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
|
244
|
+
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
|
245
|
+
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
|
246
|
+
rb_iv_set(syntax_error, "@file", Qnil);
|
247
|
+
rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
|
248
|
+
rb_iv_set(syntax_error, "@str1", Qnil);
|
249
|
+
rb_iv_set(syntax_error, "@str2", Qnil);
|
250
|
+
rb_iv_set(syntax_error, "@str3", Qnil);
|
251
|
+
rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
|
252
|
+
rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
|
253
|
+
rb_ary_push(rerrors, syntax_error);
|
254
|
+
}
|
255
|
+
rb_iv_set(rdoc, "@errors", rerrors);
|
256
|
+
gumbo_string_buffer_destroy(&parser, &msg);
|
257
|
+
}
|
258
|
+
|
259
|
+
gumbo_destroy_output(&options, output);
|
260
|
+
|
261
|
+
return rdoc;
|
203
262
|
}
|
204
263
|
|
205
264
|
// Initialize the Nokogumbo class and fetch constants we will use later
|
@@ -211,10 +270,11 @@ void Init_nokogumboc() {
|
|
211
270
|
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
212
271
|
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
213
272
|
Document = rb_const_get(HTML, rb_intern("Document"));
|
273
|
+
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
274
|
+
XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));
|
214
275
|
|
215
276
|
#ifndef NGLIB
|
216
277
|
// more class constants
|
217
|
-
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
218
278
|
Element = rb_const_get(XML, rb_intern("Element"));
|
219
279
|
Text = rb_const_get(XML, rb_intern("Text"));
|
220
280
|
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
@@ -223,7 +283,7 @@ void Init_nokogumboc() {
|
|
223
283
|
// interned symbols
|
224
284
|
new = rb_intern("new");
|
225
285
|
set_attribute = rb_intern("set_attribute");
|
226
|
-
add_child = rb_intern("
|
286
|
+
add_child = rb_intern("add_child_node_and_reparent_attrs");
|
227
287
|
internal_subset = rb_intern("internal_subset");
|
228
288
|
remove_ = rb_intern("remove");
|
229
289
|
create_internal_subset = rb_intern("create_internal_subset");
|
@@ -231,5 +291,5 @@ void Init_nokogumboc() {
|
|
231
291
|
|
232
292
|
// define Nokogumbo class with a singleton parse method
|
233
293
|
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
234
|
-
rb_define_singleton_method(Gumbo, "parse", parse,
|
294
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 2);
|
235
295
|
}
|
@@ -27,7 +27,7 @@ struct GumboInternalParser;
|
|
27
27
|
|
28
28
|
GumboAttribute* gumbo_get_attribute(
|
29
29
|
const GumboVector* attributes, const char* name) {
|
30
|
-
for (int i = 0; i < attributes->length; ++i) {
|
30
|
+
for (unsigned int i = 0; i < attributes->length; ++i) {
|
31
31
|
GumboAttribute* attr = attributes->data[i];
|
32
32
|
if (!strcasecmp(attr->name, name)) {
|
33
33
|
return attr;
|
data/gumbo-parser/src/char_ref.c
CHANGED
@@ -30,7 +30,7 @@
|
|
30
30
|
#include <ctype.h>
|
31
31
|
#include <stddef.h>
|
32
32
|
#include <stdio.h>
|
33
|
-
#include <string.h>
|
33
|
+
#include <string.h> // Only for debug assertions at present.
|
34
34
|
|
35
35
|
#include "error.h"
|
36
36
|
#include "string_piece.h"
|
@@ -49,44 +49,18 @@ typedef struct {
|
|
49
49
|
int to_char;
|
50
50
|
} CharReplacement;
|
51
51
|
|
52
|
-
static const CharReplacement kCharReplacements[] = {
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
{ 0x89, 0x2030 },
|
65
|
-
{ 0x8A, 0x0160 },
|
66
|
-
{ 0x8B, 0x2039 },
|
67
|
-
{ 0x8C, 0x0152 },
|
68
|
-
{ 0x8D, 0x008D },
|
69
|
-
{ 0x8E, 0x017D },
|
70
|
-
{ 0x8F, 0x008F },
|
71
|
-
{ 0x90, 0x0090 },
|
72
|
-
{ 0x91, 0x2018 },
|
73
|
-
{ 0x92, 0x2019 },
|
74
|
-
{ 0x93, 0x201C },
|
75
|
-
{ 0x94, 0x201D },
|
76
|
-
{ 0x95, 0x2022 },
|
77
|
-
{ 0x96, 0x2013 },
|
78
|
-
{ 0x97, 0x2014 },
|
79
|
-
{ 0x98, 0x02DC },
|
80
|
-
{ 0x99, 0x2122 },
|
81
|
-
{ 0x9A, 0x0161 },
|
82
|
-
{ 0x9B, 0x203A },
|
83
|
-
{ 0x9C, 0x0153 },
|
84
|
-
{ 0x9D, 0x009D },
|
85
|
-
{ 0x9E, 0x017E },
|
86
|
-
{ 0x9F, 0x0178 },
|
87
|
-
// Terminator.
|
88
|
-
{ -1, -1 }
|
89
|
-
};
|
52
|
+
static const CharReplacement kCharReplacements[] = {{0x00, 0xfffd},
|
53
|
+
{0x0d, 0x000d}, {0x80, 0x20ac}, {0x81, 0x0081}, {0x82, 0x201A},
|
54
|
+
{0x83, 0x0192}, {0x84, 0x201E}, {0x85, 0x2026}, {0x86, 0x2020},
|
55
|
+
{0x87, 0x2021}, {0x88, 0x02C6}, {0x89, 0x2030}, {0x8A, 0x0160},
|
56
|
+
{0x8B, 0x2039}, {0x8C, 0x0152}, {0x8D, 0x008D}, {0x8E, 0x017D},
|
57
|
+
{0x8F, 0x008F}, {0x90, 0x0090}, {0x91, 0x2018}, {0x92, 0x2019},
|
58
|
+
{0x93, 0x201C}, {0x94, 0x201D}, {0x95, 0x2022}, {0x96, 0x2013},
|
59
|
+
{0x97, 0x2014}, {0x98, 0x02DC}, {0x99, 0x2122}, {0x9A, 0x0161},
|
60
|
+
{0x9B, 0x203A}, {0x9C, 0x0153}, {0x9D, 0x009D}, {0x9E, 0x017E},
|
61
|
+
{0x9F, 0x0178},
|
62
|
+
// Terminator.
|
63
|
+
{-1, -1}};
|
90
64
|
|
91
65
|
static int parse_digit(int c, bool allow_hex) {
|
92
66
|
if (c >= '0' && c <= '9') {
|
@@ -111,9 +85,8 @@ static void add_no_digit_error(
|
|
111
85
|
error->type = GUMBO_ERR_NUMERIC_CHAR_REF_NO_DIGITS;
|
112
86
|
}
|
113
87
|
|
114
|
-
static void add_codepoint_error(
|
115
|
-
|
116
|
-
GumboErrorType type, int codepoint) {
|
88
|
+
static void add_codepoint_error(struct GumboInternalParser* parser,
|
89
|
+
Utf8Iterator* input, GumboErrorType type, int codepoint) {
|
117
90
|
GumboError* error = gumbo_add_error(parser);
|
118
91
|
if (!error) {
|
119
92
|
return;
|
@@ -123,9 +96,8 @@ static void add_codepoint_error(
|
|
123
96
|
error->v.codepoint = codepoint;
|
124
97
|
}
|
125
98
|
|
126
|
-
static void add_named_reference_error(
|
127
|
-
|
128
|
-
GumboErrorType type, GumboStringPiece text) {
|
99
|
+
static void add_named_reference_error(struct GumboInternalParser* parser,
|
100
|
+
Utf8Iterator* input, GumboErrorType type, GumboStringPiece text) {
|
129
101
|
GumboError* error = gumbo_add_error(parser);
|
130
102
|
if (!error) {
|
131
103
|
return;
|
@@ -211,8 +183,7 @@ static bool maybe_add_invalid_named_reference(
|
|
211
183
|
// worry about consuming characters.
|
212
184
|
const char* start = utf8iterator_get_char_pointer(input);
|
213
185
|
int c = utf8iterator_current(input);
|
214
|
-
while ((c >= 'a' && c <= 'z') ||
|
215
|
-
(c >= 'A' && c <= 'Z') ||
|
186
|
+
while ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') ||
|
216
187
|
(c >= '0' && c <= '9')) {
|
217
188
|
utf8iterator_next(input);
|
218
189
|
c = utf8iterator_current(input);
|
@@ -228,12 +199,11 @@ static bool maybe_add_invalid_named_reference(
|
|
228
199
|
return true;
|
229
200
|
}
|
230
201
|
|
231
|
-
|
232
202
|
#line 2465 "char_ref.rl"
|
233
203
|
|
204
|
+
// clang-format off
|
234
205
|
|
235
|
-
|
236
|
-
#line 237 "char_ref.c"
|
206
|
+
#line 238 "char_ref.c"
|
237
207
|
static const short _char_ref_actions[] = {
|
238
208
|
0, 1, 0, 1, 1, 1, 2, 1,
|
239
209
|
3, 1, 4, 1, 5, 1, 6, 1,
|
@@ -13960,17 +13930,15 @@ static const short _char_ref_eof_trans[] = {
|
|
13960
13930
|
};
|
13961
13931
|
|
13962
13932
|
static const int char_ref_start = 7623;
|
13963
|
-
static const int char_ref_first_final = 7623;
|
13964
|
-
static const int char_ref_error = 0;
|
13965
13933
|
|
13966
13934
|
static const int char_ref_en_valid_named_ref = 7623;
|
13967
13935
|
|
13968
13936
|
|
13969
|
-
#line
|
13937
|
+
#line 2469 "char_ref.rl"
|
13938
|
+
// clang-format on
|
13970
13939
|
|
13971
|
-
static bool consume_named_ref(
|
13972
|
-
|
13973
|
-
OneOrTwoCodepoints* output) {
|
13940
|
+
static bool consume_named_ref(struct GumboInternalParser* parser,
|
13941
|
+
Utf8Iterator* input, bool is_in_attribute, OneOrTwoCodepoints* output) {
|
13974
13942
|
assert(output->first == kGumboNoChar);
|
13975
13943
|
const char* p = utf8iterator_get_char_pointer(input);
|
13976
13944
|
const char* pe = utf8iterator_get_end_pointer(input);
|
@@ -13979,8 +13947,9 @@ static bool consume_named_ref(
|
|
13979
13947
|
const char *ts, *start;
|
13980
13948
|
int cs, act;
|
13981
13949
|
|
13950
|
+
// clang-format off
|
13982
13951
|
|
13983
|
-
#line
|
13952
|
+
#line 13985 "char_ref.c"
|
13984
13953
|
{
|
13985
13954
|
cs = char_ref_start;
|
13986
13955
|
ts = 0;
|
@@ -13988,14 +13957,15 @@ static bool consume_named_ref(
|
|
13988
13957
|
act = 0;
|
13989
13958
|
}
|
13990
13959
|
|
13991
|
-
#line
|
13960
|
+
#line 2484 "char_ref.rl"
|
13992
13961
|
// Avoid unused variable warnings.
|
13993
13962
|
(void) act;
|
13994
13963
|
(void) ts;
|
13964
|
+
(void) char_ref_en_valid_named_ref;
|
13995
13965
|
|
13996
13966
|
start = p;
|
13997
13967
|
|
13998
|
-
#line
|
13968
|
+
#line 14001 "char_ref.c"
|
13999
13969
|
{
|
14000
13970
|
int _slen;
|
14001
13971
|
int _trans;
|
@@ -14017,7 +13987,7 @@ _resume:
|
|
14017
13987
|
#line 1 "NONE"
|
14018
13988
|
{ts = p;}
|
14019
13989
|
break;
|
14020
|
-
#line
|
13990
|
+
#line 14023 "char_ref.c"
|
14021
13991
|
}
|
14022
13992
|
}
|
14023
13993
|
|
@@ -23000,7 +22970,7 @@ _eof_trans:
|
|
23000
22970
|
#line 2273 "char_ref.rl"
|
23001
22971
|
{{p = ((te))-1;}{ output->first = 0xd7; {p++; goto _out; } }}
|
23002
22972
|
break;
|
23003
|
-
#line
|
22973
|
+
#line 23006 "char_ref.c"
|
23004
22974
|
}
|
23005
22975
|
}
|
23006
22976
|
|
@@ -23013,7 +22983,7 @@ _again:
|
|
23013
22983
|
#line 1 "NONE"
|
23014
22984
|
{ts = 0;}
|
23015
22985
|
break;
|
23016
|
-
#line
|
22986
|
+
#line 23019 "char_ref.c"
|
23017
22987
|
}
|
23018
22988
|
}
|
23019
22989
|
|
@@ -23033,7 +23003,8 @@ _again:
|
|
23033
23003
|
_out: {}
|
23034
23004
|
}
|
23035
23005
|
|
23036
|
-
#line
|
23006
|
+
#line 2491 "char_ref.rl"
|
23007
|
+
// clang-format on
|
23037
23008
|
|
23038
23009
|
if (cs >= 7623) {
|
23039
23010
|
assert(output->first != kGumboNoChar);
|
@@ -23067,10 +23038,9 @@ _again:
|
|
23067
23038
|
}
|
23068
23039
|
}
|
23069
23040
|
|
23070
|
-
bool consume_char_ref(
|
23071
|
-
struct
|
23072
|
-
|
23073
|
-
OneOrTwoCodepoints* output) {
|
23041
|
+
bool consume_char_ref(struct GumboInternalParser* parser,
|
23042
|
+
struct GumboInternalUtf8Iterator* input, int additional_allowed_char,
|
23043
|
+
bool is_in_attribute, OneOrTwoCodepoints* output) {
|
23074
23044
|
utf8iterator_mark(input);
|
23075
23045
|
utf8iterator_next(input);
|
23076
23046
|
int c = utf8iterator_current(input);
|