gumbo-html 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -10
- package/binding.gyp +49 -0
- package/examples/example.js +87 -0
- package/examples/scrape.js +301 -0
- package/index.d.ts +58 -3
- package/index.js +7 -2
- package/lib/wrapper.js +385 -0
- package/package.json +36 -5
- package/src/addon.cc +19 -0
- package/src/gumbo-parser/COPYING +201 -0
- package/src/gumbo-parser/README.md +8 -0
- package/src/gumbo-parser/src/attribute.c +44 -0
- package/src/gumbo-parser/src/attribute.h +37 -0
- package/src/gumbo-parser/src/char_ref.c +23069 -0
- package/src/gumbo-parser/src/char_ref.h +60 -0
- package/src/gumbo-parser/src/error.c +279 -0
- package/src/gumbo-parser/src/error.h +225 -0
- package/src/gumbo-parser/src/gumbo.h +671 -0
- package/src/gumbo-parser/src/insertion_mode.h +57 -0
- package/src/gumbo-parser/src/parser.c +4192 -0
- package/src/gumbo-parser/src/parser.h +57 -0
- package/src/gumbo-parser/src/string_buffer.c +110 -0
- package/src/gumbo-parser/src/string_buffer.h +84 -0
- package/src/gumbo-parser/src/string_piece.c +48 -0
- package/src/gumbo-parser/src/string_piece.h +38 -0
- package/src/gumbo-parser/src/tag.c +95 -0
- package/src/gumbo-parser/src/tag_enum.h +153 -0
- package/src/gumbo-parser/src/tag_gperf.h +105 -0
- package/src/gumbo-parser/src/tag_sizes.h +4 -0
- package/src/gumbo-parser/src/tag_strings.h +153 -0
- package/src/gumbo-parser/src/token_type.h +41 -0
- package/src/gumbo-parser/src/tokenizer.c +2897 -0
- package/src/gumbo-parser/src/tokenizer.h +123 -0
- package/src/gumbo-parser/src/tokenizer_states.h +103 -0
- package/src/gumbo-parser/src/utf8.c +270 -0
- package/src/gumbo-parser/src/utf8.h +132 -0
- package/src/gumbo-parser/src/util.c +58 -0
- package/src/gumbo-parser/src/util.h +60 -0
- package/src/gumbo-parser/src/vector.c +123 -0
- package/src/gumbo-parser/src/vector.h +67 -0
- package/src/html_document.cc +411 -0
- package/src/html_document.h +56 -0
- package/src/html_element.cc +963 -0
- package/src/html_element.h +70 -0
- package/src/include/win/strings.h +11 -0
- package/src/jsa.c +182 -0
- package/src/jsa.h +44 -0
- package/src/xnode.c +372 -0
- package/src/xnode_query.c +330 -0
- package/src/xnode_query.h +186 -0
- package/src/xnode_query_parser.c +414 -0
- package/install.js +0 -15
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
#include "xnode_query.h"
|
|
2
|
+
|
|
3
|
+
#include <assert.h>
|
|
4
|
+
#include <ctype.h>
|
|
5
|
+
#include <string.h>
|
|
6
|
+
|
|
7
|
+
#ifdef _WIN32
|
|
8
|
+
#define strcasecmp(x,y) _stricmp(x,y)
|
|
9
|
+
#define strncasecmp(x,y,z) _strnicmp(x,y,z)
|
|
10
|
+
#else
|
|
11
|
+
#include <strings.h>
|
|
12
|
+
#endif
|
|
13
|
+
|
|
14
|
+
/**
 * Report whether `str` ends with `suffix`.
 *
 * Returns 1 when the last strlen(suffix) bytes of `str` are equal to
 * `suffix` (so an empty suffix always matches), and 0 otherwise or when
 * either argument is NULL.
 */
static int string_ends_with(const char *str, const char *suffix) {
  size_t str_len, suffix_len;

  if (str == NULL || suffix == NULL) {
    return 0;
  }

  str_len = strlen(str);
  suffix_len = strlen(suffix);
  if (str_len < suffix_len) {
    return 0;
  }

  /* Both ranges hold exactly suffix_len non-NUL bytes, so a raw byte
   * comparison of the tail is sufficient. */
  return memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0;
}
|
|
24
|
+
|
|
25
|
+
/**
 * Report whether `c` is ASCII whitespace as defined by HTML and CSS:
 * space, tab, line feed, form feed or carriage return.
 *
 * isspace() is deliberately not used here: it also accepts vertical tab
 * (0x0B), which does not separate tokens in an HTML attribute value, and
 * its answer depends on the current C locale.
 *
 * Returns 1 for whitespace, 0 otherwise.
 */
static int is_html_space(char c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\f':
    case '\r':
      return 1;
    default:
      return 0;
  }
}

/**
 * Test whether `tok` is one of the whitespace-separated words of `str`
 * (the rule behind `[attr~=value]` and class matching).
 *
 * Returns 1 when a word of `str` is exactly equal to `tok`. Returns 0 when
 * there is no such word, when either argument is NULL, when `tok` is empty,
 * or when `tok` itself contains whitespace (such a value can never be a
 * single word of the list).
 */
static int string_in_space_list(const char *str, const char *tok) {
  const char *p;
  size_t tok_len;

  if (!str || !tok || *tok == '\0') {
    return 0;
  }

  /* One pass over tok both rejects embedded whitespace and yields its
   * length. */
  for (p = tok; *p; p++) {
    if (is_html_space(*p)) {
      return 0;
    }
  }
  tok_len = (size_t) (p - tok);

  while (*str) {
    /* Move to the start of the next word. */
    while (*str && is_html_space(*str)) {
      str++;
    }

    /* The word matches only if it is followed by whitespace or the end of
     * the string; otherwise tok is merely a prefix of a longer word. */
    if (!strncmp(str, tok, tok_len) &&
        (str[tok_len] == '\0' || is_html_space(str[tok_len]))) {
      return 1;
    }

    /* Skip the remainder of the current word. */
    while (*str && !is_html_space(*str)) {
      str++;
    }
  }

  return 0;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Test the `[attr|=value]` rule: `str` must be exactly `tok`, or start with
 * `tok` immediately followed by a hyphen.
 *
 * Returns 1 on a match, 0 otherwise or when either argument is NULL or
 * `tok` is empty.
 */
static int string_dash_match(const char *str, const char *tok) {
  size_t prefix_len;

  if (str == NULL || tok == NULL || tok[0] == '\0') {
    return 0;
  }

  prefix_len = strlen(tok);
  if (strncmp(str, tok, prefix_len) != 0) {
    return 0;
  }

  /* The prefix matched, so str has at least prefix_len characters and the
   * next one decides: end of string (exact match) or '-'. */
  return str[prefix_len] == '\0' || str[prefix_len] == '-';
}
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Evaluate an attribute expression against an attribute.
|
|
68
|
+
*/
|
|
69
|
+
static int xnode_match_attribute(GumboAttribute *attribute,
|
|
70
|
+
XNodeAttributeSelector *selector) {
|
|
71
|
+
|
|
72
|
+
if (strcasecmp(attribute->name, selector->name) == 0) {
|
|
73
|
+
switch (selector->type) {
|
|
74
|
+
case XQSEL_ATTR_PRESENCE:
|
|
75
|
+
return 1;
|
|
76
|
+
case XQSEL_ATTR_EQUAL:
|
|
77
|
+
return !strcmp(attribute->value, selector->value);
|
|
78
|
+
case XQSEL_ATTR_SPACE_MATCH: // ~=
|
|
79
|
+
return string_in_space_list(attribute->value, selector->value);
|
|
80
|
+
case XQSEL_ATTR_DASH_MATCH: // |=
|
|
81
|
+
return string_dash_match(attribute->value, selector->value);
|
|
82
|
+
case XQSEL_ATTR_PREFIX_MATCH: // ^=
|
|
83
|
+
return !strncmp(selector->value, attribute->value, strlen(selector->value));
|
|
84
|
+
case XQSEL_ATTR_SUFFIX_MATCH: // $=
|
|
85
|
+
return string_ends_with(attribute->value, selector->value);
|
|
86
|
+
case XQSEL_ATTR_SUBSTRING_MATCH: // *=
|
|
87
|
+
return strstr(attribute->value, selector->value) != NULL;
|
|
88
|
+
default:
|
|
89
|
+
assert(0);
|
|
90
|
+
break;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
return 0;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
 * Return the tag name of `node` and store its length in `*length`.
 *
 * - A document node yields the literal "document".
 * - Any other node yields gumbo_normalized_tagname() of its element tag.
 * - When that name is empty, the name is taken from the element's
 *   original_tag text after gumbo_tag_from_original_text() has processed a
 *   copy of it; if no original text is recorded, NULL is returned and
 *   `*length` is set to 0.
 *
 * NOTE(review): in the original-text fallback the result is described by
 * the (pointer, *length) pair; it is presumably not NUL-terminated at that
 * length, so callers should rely on `*length` -- confirm against gumbo.
 */
const char *xnode_get_tag_name(GumboNode *node, size_t *length) {
  const char *name;
  GumboStringPiece original;

  if (node->type == GUMBO_NODE_DOCUMENT) {
    name = "document";
    *length = sizeof("document") - 1;
  } else {
    name = gumbo_normalized_tagname(node->v.element.tag);
    *length = strlen(name);
  }

  if (*length != 0) {
    return name;
  }

  /* No normalized name available: fall back to the source text. */
  original = node->v.element.original_tag;
  if (original.data == NULL) {
    *length = 0;
    return NULL;
  }

  gumbo_tag_from_original_text(&original);
  *length = original.length;
  return original.data;
}
|
|
124
|
+
|
|
125
|
+
/**
 * Test whether the tag name of `node` satisfies the type selector `type`.
 *
 * "*" matches any node that has a tag name; any other value must equal the
 * tag name, compared case-insensitively over its full length.
 *
 * Returns 1 on a match, 0 otherwise (including when the node has no name).
 */
static int xnode_match_node_type(GumboNode *node, const char *type) {
  size_t tag_len;
  const char *tag = xnode_get_tag_name(node, &tag_len);

  if (tag == NULL) {
    return 0;
  }
  if (strcmp(type, "*") == 0) {
    return 1;
  }
  /* The tag name is length-delimited, so lengths are compared first. */
  if (strlen(type) != tag_len) {
    return 0;
  }
  return strncasecmp(tag, type, tag_len) == 0;
}
|
|
142
|
+
|
|
143
|
+
/**
 * Test a node against one compound selector (type plus attribute tests).
 *
 * Only element and template nodes can match. The node's tag must satisfy
 * node_selector->type, and every attribute selector in
 * node_selector->attributes must be satisfied by at least one of the
 * node's attributes.
 *
 * Returns 1 when the node matches, 0 otherwise.
 */
int xnode_match_node(GumboNode *node, XNodeSelector *node_selector) {
  GumboVector *attrs;
  uint32_t sel_idx;

  if (node->type != GUMBO_NODE_ELEMENT && node->type != GUMBO_NODE_TEMPLATE) {
    return 0;
  }
  if (!xnode_match_node_type(node, node_selector->type)) {
    return 0;
  }

  attrs = &node->v.element.attributes;

  for (sel_idx = 0; sel_idx < jsa_size(&node_selector->attributes); sel_idx++) {
    XNodeAttributeSelector *wanted =
        (XNodeAttributeSelector*) jsa_get(&node_selector->attributes, sel_idx);
    int satisfied = 0;
    uint32_t attr_idx;

    /* Stop scanning attributes as soon as one satisfies this selector. */
    for (attr_idx = 0; attr_idx < attrs->length && !satisfied; attr_idx++) {
      GumboAttribute *candidate = (GumboAttribute*) attrs->data[attr_idx];
      satisfied = xnode_match_attribute(candidate, wanted);
    }

    if (!satisfied) {
      return 0;
    }
  }

  return 1;
}
|
|
172
|
+
|
|
173
|
+
static void xnode_match_node_descendant(GumboNode* node, jsa_t *next_nodes,
|
|
174
|
+
XNodeSelector *selector) {
|
|
175
|
+
if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
|
|
176
|
+
uint32_t i;
|
|
177
|
+
for (i = 0; i < node->v.element.children.length; ++i) {
|
|
178
|
+
GumboNode* child = (GumboNode*) node->v.element.children.data[i];
|
|
179
|
+
if (xnode_match_node(child, selector)) {
|
|
180
|
+
jsa_push(next_nodes, child);
|
|
181
|
+
}
|
|
182
|
+
xnode_match_node_descendant(child, next_nodes, selector);
|
|
183
|
+
}
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
static void xnode_match_node_child(GumboNode* node, jsa_t *next_nodes,
|
|
188
|
+
XNodeSelector *selector) {
|
|
189
|
+
if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
|
|
190
|
+
uint32_t i;
|
|
191
|
+
for (i = 0; i < node->v.element.children.length; ++i) {
|
|
192
|
+
GumboNode* child = (GumboNode*) node->v.element.children.data[i];
|
|
193
|
+
if (xnode_match_node(child, selector)) {
|
|
194
|
+
jsa_push(next_nodes, child);
|
|
195
|
+
}
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
static void xnode_match_node_adjacent_sibling(GumboNode* node,
|
|
201
|
+
jsa_t *next_nodes, XNodeSelector *selector) {
|
|
202
|
+
if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
|
|
203
|
+
GumboNode *next = xnode_next(node, NULL);
|
|
204
|
+
if (next && xnode_match_node(next, selector)) {
|
|
205
|
+
jsa_push(next_nodes, next);
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
static void xnode_match_node_general_sibling(GumboNode* node, jsa_t *next_nodes,
|
|
211
|
+
XNodeSelector *selector) {
|
|
212
|
+
|
|
213
|
+
if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
|
|
214
|
+
while ((node = xnode_next(node, NULL))) {
|
|
215
|
+
if (xnode_match_node(node, selector)) {
|
|
216
|
+
jsa_push(next_nodes, node);
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
}
|
|
221
|
+
|
|
222
|
+
/*
|
|
223
|
+
#include <stdio.h>
|
|
224
|
+
static void xnode_dump(XNode *node) {
|
|
225
|
+
size_t name_size, html_size;
|
|
226
|
+
const char *name = xnode_type(node, &name_size);
|
|
227
|
+
const char *html = xnode_html(node, &html_size);
|
|
228
|
+
const char *id = xnode_attr(node, "id");
|
|
229
|
+
const char *Class = xnode_attr(node, "class");
|
|
230
|
+
printf("%s\t%s\t%s\t%.*s\n", name, id ? id : "", Class ? Class : "",
|
|
231
|
+
(int) html_size, html);
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
static void dump_query_result(const XNodeArray *result) {
|
|
235
|
+
if (result == NULL) {
|
|
236
|
+
printf("<null>\n");
|
|
237
|
+
return;
|
|
238
|
+
}
|
|
239
|
+
else {
|
|
240
|
+
uint32_t i;
|
|
241
|
+
printf("%u element(s):\n", result->size);
|
|
242
|
+
for (i = 0; i < result->size; i++) {
|
|
243
|
+
xnode_dump((XNode*) jsa_get(result, i));
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
}
|
|
247
|
+
*/
|
|
248
|
+
#define SWAP(x, y) do { jsa_t *SWAP = x; x = y; y = SWAP; } while (0)
|
|
249
|
+
|
|
250
|
+
const XNodeArray *xnode_query_execute(XNodeQuery *query, const XNode *node) {
|
|
251
|
+
uint32_t i, j;
|
|
252
|
+
jsa_t *prev_nodes = &query->result[0];
|
|
253
|
+
jsa_t *next_nodes = &query->result[1];
|
|
254
|
+
|
|
255
|
+
prev_nodes->size = 0;
|
|
256
|
+
jsa_push(prev_nodes, node);
|
|
257
|
+
|
|
258
|
+
for (i = 0; i < query->selectors.size; i++) {
|
|
259
|
+
XNodeSelector *selector = xnode_query_get_selector(query, i);
|
|
260
|
+
|
|
261
|
+
next_nodes->size = 0;
|
|
262
|
+
|
|
263
|
+
for (j = 0; j < jsa_size(prev_nodes); j++) {
|
|
264
|
+
GumboNode *node = (GumboNode*) jsa_get(prev_nodes, j);
|
|
265
|
+
switch (selector->combinator) {
|
|
266
|
+
case XQSEL_DESCENDANT:
|
|
267
|
+
xnode_match_node_descendant(node, next_nodes, selector);
|
|
268
|
+
break;
|
|
269
|
+
case XQSEL_CHILD:
|
|
270
|
+
xnode_match_node_child(node, next_nodes, selector);
|
|
271
|
+
break;
|
|
272
|
+
case XQSEL_ADJACENT_SIBLING:
|
|
273
|
+
xnode_match_node_adjacent_sibling(node, next_nodes, selector);
|
|
274
|
+
break;
|
|
275
|
+
case XQSEL_GENERAL_SIBLING:
|
|
276
|
+
xnode_match_node_general_sibling(node, next_nodes, selector);
|
|
277
|
+
break;
|
|
278
|
+
default:
|
|
279
|
+
assert(0);
|
|
280
|
+
break;
|
|
281
|
+
}
|
|
282
|
+
}
|
|
283
|
+
|
|
284
|
+
jsa_dedup(next_nodes);
|
|
285
|
+
// dump_query_result(next_nodes);
|
|
286
|
+
SWAP(prev_nodes, next_nodes);
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
return jsa_size(prev_nodes) > 0 ? prev_nodes : NULL;
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
/**
 * Return the n-th selector of `query`, as stored in query->selectors.
 *
 * The index is handed to jsa_get() unchanged; no range check is performed
 * here.
 */
XNodeSelector *xnode_query_selector(XNodeQuery *query, uint32_t n) {
  jsa_t *selectors = &query->selectors;

  return (XNodeSelector*) jsa_get(selectors, n);
}
|
|
295
|
+
|
|
296
|
+
/**
 * Classify `c` as a word boundary for strtok_simple(): the separator `sep`
 * when one is given, otherwise any whitespace or punctuation character.
 */
static int strtok_simple_is_boundary(char c, char sep) {
  /* <ctype.h> functions require a value representable as unsigned char. */
  unsigned char uc = (unsigned char) c;

  if (sep) {
    return c == sep;
  }
  return isspace(uc) || ispunct(uc);
}

/**
 * Locate the first instance of \btok\b in str, where \b = sep.
 *
 * A match must begin at the start of `str` or right after a boundary
 * character, and must be followed by a boundary character or the end of
 * the string. When `sep` is 0, any whitespace or punctuation character is
 * a boundary.
 *
 * Returns a pointer to the match inside `str`; `str` itself when `tok` is
 * empty; NULL when there is no match or when either string is NULL.
 */
char *strtok_simple(const char *str, const char *tok, char sep) {
  size_t tok_len;
  int at_word_start = 1;

  if (str == NULL || tok == NULL) {
    return NULL;
  }
  if (*tok == '\0') {
    return (char *) str;
  }

  tok_len = strlen(tok);

  for (; *str != '\0'; str++) {
    if (at_word_start && strncmp(str, tok, tok_len) == 0) {
      /* strncmp matched tok_len non-NUL bytes, so str[tok_len] is valid. */
      char after = str[tok_len];
      if (after == '\0' || strtok_simple_is_boundary(after, sep)) {
        return (char *) str;
      }
    }
    /* Always refresh the flag, also after a failed attempt; otherwise the
     * next position would be tried although it lies inside a word. */
    at_word_start = strtok_simple_is_boundary(*str, sep);
  }

  return NULL;
}
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
#ifndef XNODE_QUERY_H_
|
|
2
|
+
#define XNODE_QUERY_H_
|
|
3
|
+
|
|
4
|
+
#ifdef __cplusplus
|
|
5
|
+
extern "C" {
|
|
6
|
+
#endif
|
|
7
|
+
|
|
8
|
+
// Kinds of simple selectors and combinators used by the query engine.
// The section numbers in the comments appear to follow the numbering of
// the W3C Selectors Level 3 specification.
typedef enum {
  // 6.1 Type selector: matches elements by tag name, e.g. `h1`.
  XQSEL_TYPE,

  // 6.2 Universal selector: `*`, matches any element type. The asterisk
  // may be omitted in some cases.
  XQSEL_UNIVERSAL,

  // 6.3.1 Attribute presence: E[foo] -- an E element that has a "foo"
  // attribute.
  XQSEL_ATTR_PRESENCE,

  // 6.3.1 Exact value: E[foo="bar"] -- the "foo" attribute value is
  // exactly "bar".
  XQSEL_ATTR_EQUAL,

  // 6.3.1 Word match: E[foo~="bar"] -- "foo" is a whitespace-separated
  // list of values, one of which is exactly "bar".
  XQSEL_ATTR_SPACE_MATCH,

  // 6.3.2 Prefix match: E[foo^="bar"] -- the "foo" value begins with "bar".
  XQSEL_ATTR_PREFIX_MATCH,

  // 6.3.2 Suffix match: E[foo$="bar"] -- the "foo" value ends with "bar".
  XQSEL_ATTR_SUFFIX_MATCH,

  // 6.3.2 Substring match: E[foo*="bar"] -- the "foo" value contains "bar".
  XQSEL_ATTR_SUBSTRING_MATCH,

  // 6.3.1 Dash match: E[foo|="en"] -- "foo" is a hyphen-separated list of
  // values beginning (from the left) with "en".
  XQSEL_ATTR_DASH_MATCH,

  // 6.4 Class selector: `*.pastoral` or just `.pastoral`, i.e. all
  // elements with class~=pastoral.
  XQSEL_CLASS,

  // 6.5 ID selector, e.g. `h1#chapter1` or `#chapter1`.
  XQSEL_ID,

  // 6.6 Pseudo-classes (NOT SUPPORTED): a colon followed by the
  // pseudo-class name and optionally a value in parentheses, e.g.
  // `a:link`, `tr:nth-child(2n+1)`, `:last-child`, `:first-of-type`.
  XQSEL_PSEUDO_CLASS,

  // 7 Pseudo-elements (NOT SUPPORTED), e.g. `::first-line`,
  // `::first-letter`, `::before`, `::after`.
  XQSEL_PSEUDO_ELEMENT,

  // 8.1 Descendant combinator, e.g. `h1 em`, `div * p`.
  XQSEL_DESCENDANT,

  // 8.2 Child combinator, e.g. `body > p`.
  XQSEL_CHILD,

  // 8.3.1 Adjacent sibling combinator, e.g. `math + p` (a p element
  // immediately following a math element).
  XQSEL_ADJACENT_SIBLING,

  // 8.3.2 General sibling combinator, e.g. `h1 ~ pre`: both elements share
  // the same parent and the first precedes the second, not necessarily
  // immediately.
  XQSEL_GENERAL_SIBLING,

  // Sentinel for anything that is none of the above.
  XQSEL_UNKNOWN_TYPE
} XNodeQueryType;
|
|
87
|
+
|
|
88
|
+
#include "gumbo.h"
|
|
89
|
+
#include "jsa.h"
|
|
90
|
+
|
|
91
|
+
typedef struct {
|
|
92
|
+
// One of the XQSEL_ATTR_* types
|
|
93
|
+
XNodeQueryType type;
|
|
94
|
+
|
|
95
|
+
// Attribute name
|
|
96
|
+
const char *name;
|
|
97
|
+
|
|
98
|
+
// Attribute value, if applicable
|
|
99
|
+
const char *value;
|
|
100
|
+
} XNodeAttributeSelector;
|
|
101
|
+
|
|
102
|
+
// 4. Selector syntax
|
|
103
|
+
//
|
|
104
|
+
// A selector is a chain of one or more sequences of simple selectors separated
|
|
105
|
+
// by combinators. One pseudo-element may be appended to the last sequence of
|
|
106
|
+
// simple selectors in a selector.
|
|
107
|
+
//
|
|
108
|
+
// A sequence of simple selectors is a chain of simple selectors that are not
|
|
109
|
+
// separated by a combinator. It always begins with a type selector or a
|
|
110
|
+
// universal selector. No other type selector or universal selector is allowed
|
|
111
|
+
// in the sequence.
|
|
112
|
+
//
|
|
113
|
+
// A simple selector is either a type selector, universal selector, attribute
|
|
114
|
+
// selector, class selector, ID selector, or pseudo-class.
|
|
115
|
+
//
|
|
116
|
+
typedef struct {
|
|
117
|
+
// Tag name for type selector, "*" for universal selector
|
|
118
|
+
const char *type;
|
|
119
|
+
|
|
120
|
+
// A list of attribute selectors; class selectors are translated as
|
|
121
|
+
// a list of {"class", "name", XQSEL_ATTR_SPACE_MATCH} selectors.
|
|
122
|
+
jsa_t attributes;
|
|
123
|
+
|
|
124
|
+
// Relation of this selector and the previous one, if any.
|
|
125
|
+
XNodeQueryType combinator;
|
|
126
|
+
} XNodeSelector;
|
|
127
|
+
|
|
128
|
+
typedef struct {
|
|
129
|
+
jsa_t selectors;
|
|
130
|
+
char *data;
|
|
131
|
+
jsa_t result[2];
|
|
132
|
+
} XNodeQuery;
|
|
133
|
+
|
|
134
|
+
typedef GumboNode XNode;
|
|
135
|
+
|
|
136
|
+
typedef jsa_t XNodeArray;
|
|
137
|
+
#define xnode_array_first(array) ((XNode*)jsa_get((array),0))
|
|
138
|
+
#define xnode_array_get(array,n) ((XNode*)jsa_get((array),(n)))
|
|
139
|
+
#define xnode_array_size(array) jsa_size(array)
|
|
140
|
+
|
|
141
|
+
typedef struct {
|
|
142
|
+
GumboOutput *output;
|
|
143
|
+
GumboOptions parse_options;
|
|
144
|
+
char *html;
|
|
145
|
+
size_t html_length;
|
|
146
|
+
} XNodeDocument;
|
|
147
|
+
|
|
148
|
+
XNodeQuery *xnode_query_create(const char *, const char **, int *);
|
|
149
|
+
const XNodeArray *xnode_query_execute(XNodeQuery *, const XNode*);
|
|
150
|
+
void xnode_query_free(XNodeQuery *);
|
|
151
|
+
XNodeSelector *xnode_query_selector(XNodeQuery*, uint32_t);
|
|
152
|
+
|
|
153
|
+
// Fetch the i-th selector of query `q` (an XNodeQuery*) as an
// XNodeSelector*. Both arguments are parenthesized so that expressions such
// as `queries + 1` or `cond ? a : b` expand correctly.
#define xnode_query_get_selector(q,i) ((XNodeSelector*)jsa_get(&(q)->selectors,(i)))
|
|
154
|
+
|
|
155
|
+
XNodeDocument *xnode_parse_html(const char *html, size_t length);
|
|
156
|
+
const XNode *xnode_document_root(const XNodeDocument*);
|
|
157
|
+
void xnode_document_free(XNodeDocument*);
|
|
158
|
+
|
|
159
|
+
const char *xnode_attr(const XNode*, const char *name);
|
|
160
|
+
|
|
161
|
+
XNode *xnode_first_child(XNode *node, XNodeSelector *selector);
|
|
162
|
+
XNode *xnode_parent(XNode *node);
|
|
163
|
+
XNode *xnode_prev(XNode *node, XNodeSelector *selector);
|
|
164
|
+
XNode *xnode_prev_check(XNode *node, XNodeSelector *selector);
|
|
165
|
+
XNode *xnode_next(XNode *node, XNodeSelector *selector);
|
|
166
|
+
XNode *xnode_next_check(XNode *node, XNodeSelector *selector);
|
|
167
|
+
const char *xnode_html(XNode *node, size_t *length);
|
|
168
|
+
const char *xnode_text(XNode *xnode);
|
|
169
|
+
const char *xnode_type(XNode *node, size_t *length);
|
|
170
|
+
int xnode_has_class(XNode *xnode, const char *class_name);
|
|
171
|
+
int xnode_has_attr(XNode *xnode, const char *name);
|
|
172
|
+
|
|
173
|
+
int xnode_match_node(GumboNode *node, XNodeSelector *selector);
|
|
174
|
+
GumboVector *xnode_children(XNode *node);
|
|
175
|
+
|
|
176
|
+
// utils
|
|
177
|
+
char *strtok_simple(const char *str, const char *tok, char sep);
|
|
178
|
+
// Allocation shorthands. They map directly onto malloc/free, so the
// including file needs <stdlib.h>; xnew(type) allocates room for one `type`
// and, like malloc, may yield NULL.
#define xmalloc malloc
#define xnew(type) ((type *) xmalloc(sizeof(type)))
#define xfree free
|
|
181
|
+
|
|
182
|
+
#ifdef __cplusplus
|
|
183
|
+
}
|
|
184
|
+
#endif
|
|
185
|
+
|
|
186
|
+
#endif // XNODE_QUERY_H_
|