gumbo-html 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -10
- package/binding.gyp +49 -0
- package/examples/example.js +87 -0
- package/examples/scrape.js +301 -0
- package/index.d.ts +58 -3
- package/index.js +7 -2
- package/lib/wrapper.js +385 -0
- package/package.json +36 -5
- package/src/addon.cc +19 -0
- package/src/gumbo-parser/COPYING +201 -0
- package/src/gumbo-parser/README.md +8 -0
- package/src/gumbo-parser/src/attribute.c +44 -0
- package/src/gumbo-parser/src/attribute.h +37 -0
- package/src/gumbo-parser/src/char_ref.c +23069 -0
- package/src/gumbo-parser/src/char_ref.h +60 -0
- package/src/gumbo-parser/src/error.c +279 -0
- package/src/gumbo-parser/src/error.h +225 -0
- package/src/gumbo-parser/src/gumbo.h +671 -0
- package/src/gumbo-parser/src/insertion_mode.h +57 -0
- package/src/gumbo-parser/src/parser.c +4192 -0
- package/src/gumbo-parser/src/parser.h +57 -0
- package/src/gumbo-parser/src/string_buffer.c +110 -0
- package/src/gumbo-parser/src/string_buffer.h +84 -0
- package/src/gumbo-parser/src/string_piece.c +48 -0
- package/src/gumbo-parser/src/string_piece.h +38 -0
- package/src/gumbo-parser/src/tag.c +95 -0
- package/src/gumbo-parser/src/tag_enum.h +153 -0
- package/src/gumbo-parser/src/tag_gperf.h +105 -0
- package/src/gumbo-parser/src/tag_sizes.h +4 -0
- package/src/gumbo-parser/src/tag_strings.h +153 -0
- package/src/gumbo-parser/src/token_type.h +41 -0
- package/src/gumbo-parser/src/tokenizer.c +2897 -0
- package/src/gumbo-parser/src/tokenizer.h +123 -0
- package/src/gumbo-parser/src/tokenizer_states.h +103 -0
- package/src/gumbo-parser/src/utf8.c +270 -0
- package/src/gumbo-parser/src/utf8.h +132 -0
- package/src/gumbo-parser/src/util.c +58 -0
- package/src/gumbo-parser/src/util.h +60 -0
- package/src/gumbo-parser/src/vector.c +123 -0
- package/src/gumbo-parser/src/vector.h +67 -0
- package/src/html_document.cc +411 -0
- package/src/html_document.h +56 -0
- package/src/html_element.cc +963 -0
- package/src/html_element.h +70 -0
- package/src/include/win/strings.h +11 -0
- package/src/jsa.c +182 -0
- package/src/jsa.h +44 -0
- package/src/xnode.c +372 -0
- package/src/xnode_query.c +330 -0
- package/src/xnode_query.h +186 -0
- package/src/xnode_query_parser.c +414 -0
- package/install.js +0 -15
|
@@ -0,0 +1,414 @@
|
|
|
1
|
+
#include "xnode_query.h"
|
|
2
|
+
|
|
3
|
+
#include <ctype.h>
|
|
4
|
+
#include <string.h>
|
|
5
|
+
#include <stdio.h>
|
|
6
|
+
#include <stdlib.h>
|
|
7
|
+
|
|
8
|
+
#define XNODE_BAD_NAME_CHARACTER "Illegal name character" // set by xnode_parse_ident/xnode_parse_name when the first character cannot start a name
|
|
9
|
+
#define XNODE_BAD_COMBINATOR "Illegal combinator" // set by xnode_parse_combinator when no combinator is found at the cursor
|
|
10
|
+
#define XNODE_EMPTY_SELECTOR "Empty selector" // set by xnode_query_create when parsing produced no selector
|
|
11
|
+
#define XNODE_UNEXPECTED_CHAR "Illegal character" // set for an unknown attribute operator or a repeated '*'
|
|
12
|
+
#define XNODE_QUOTATION_MARK_EXPECTED "Missing quotation mark" // set by xnode_parse_value when a quoted string is not closed
|
|
13
|
+
|
|
14
|
+
#define xnew_zero(type) ((type*)calloc(1, sizeof (type))) // allocates one zero-filled object of `type`; the result may be NULL
|
|
15
|
+
|
|
16
|
+
typedef struct { // Cursor state shared by the selector-parsing helpers in this file
|
|
17
|
+
const char *next; // Next input char: read cursor into the query text
|
|
18
|
+
char *data; // Data buffer: write cursor where NUL-terminated copies of parsed names/values are appended
|
|
19
|
+
const char *error; // Error message; NULL while no error has been recorded
|
|
20
|
+
XNodeSelector *selector; // NOTE(review): not read or written by the code visible in this file
|
|
21
|
+
} XNodeQueryParser;
|
|
22
|
+
|
|
23
|
+
// w    [ \t\r\n\f]
// Returns 1 when c is a space, tab, carriage return, line feed or form feed,
// and 0 for every other value.
static inline int is_space(int c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\f':
      return 1;
    default:
      return 0;
  }
}
|
|
27
|
+
|
|
28
|
+
// Moves parser->next past any run of whitespace and returns the character the
// cursor stops on (the NUL terminator when the input is exhausted).
static inline char skip_space(XNodeQueryParser *parser) {
  const char *cursor = parser->next;

  // is_space('\0') is false, so the scan cannot run past the terminator.
  while (is_space(*cursor)) {
    cursor++;
  }

  parser->next = cursor;
  return *cursor;
}
|
|
34
|
+
|
|
35
|
+
// combinator
|
|
36
|
+
// /* combinators can be surrounded by whitespace */
|
|
37
|
+
// : PLUS S* | GREATER S* | TILDE S* | S+
|
|
38
|
+
// ;
|
|
39
|
+
XNodeQueryType xnode_parse_combinator(XNodeQueryParser *parser) {
|
|
40
|
+
XNodeQueryType result = XQSEL_UNKNOWN_TYPE;
|
|
41
|
+
|
|
42
|
+
if (is_space(*parser->next)) {
|
|
43
|
+
// E F: an F element descendant of an E element
|
|
44
|
+
result = XQSEL_DESCENDANT;
|
|
45
|
+
skip_space(parser);
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
switch (*parser->next) {
|
|
49
|
+
case '>': // E > F: an F element child of an E element
|
|
50
|
+
result = XQSEL_CHILD;
|
|
51
|
+
break;
|
|
52
|
+
case '+': // E + F: an F element immediately preceded by an E element
|
|
53
|
+
result = XQSEL_ADJACENT_SIBLING;
|
|
54
|
+
break;
|
|
55
|
+
case '~': // E ~ F: an F element preceded by an E element
|
|
56
|
+
result = XQSEL_GENERAL_SIBLING;
|
|
57
|
+
break;
|
|
58
|
+
default:
|
|
59
|
+
break;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
if (result == XQSEL_UNKNOWN_TYPE) {
|
|
63
|
+
parser->error = XNODE_BAD_COMBINATOR;
|
|
64
|
+
return XQSEL_UNKNOWN_TYPE;
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
if (result != XQSEL_DESCENDANT) {
|
|
68
|
+
parser->next++;
|
|
69
|
+
skip_space(parser);
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
return result;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
// nmstart  [_a-z]|{nonascii}|{escape}
// Returns 1 for an ASCII letter (either case) or '_', otherwise 0. The
// {nonascii} and {escape} alternatives of the grammar are not recognised here.
static inline int is_name_start_char(int c) {
  if (c == '_') {
    return 1;
  }
  if (c >= 'a' && c <= 'z') {
    return 1;
  }
  return c >= 'A' && c <= 'Z';
}
|
|
79
|
+
|
|
80
|
+
// nmchar   [_a-z0-9-]|{nonascii}|{escape}
// Returns 1 for a name-start character, '-' or an ASCII digit, otherwise 0.
static inline int is_name_char(int c) {
  if (c == '-') {
    return 1;
  }
  if (c >= '0' && c <= '9') {
    return 1;
  }
  return is_name_start_char(c) != 0;
}
|
|
84
|
+
|
|
85
|
+
// ident [-]?{nmstart}{nmchar}*
|
|
86
|
+
static char *xnode_parse_ident(XNodeQueryParser *parser) {
|
|
87
|
+
char *name = parser->data;
|
|
88
|
+
|
|
89
|
+
if (*parser->next == '-') {
|
|
90
|
+
*parser->data++ = *parser->next++;
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
if (!is_name_start_char(*parser->next)) {
|
|
94
|
+
parser->error = XNODE_BAD_NAME_CHARACTER;
|
|
95
|
+
return NULL;
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
*parser->data++ = *parser->next++;
|
|
99
|
+
|
|
100
|
+
while (is_name_char(*parser->next)) {
|
|
101
|
+
*parser->data++ = *parser->next++;
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
*parser->data++ = '\0';
|
|
105
|
+
|
|
106
|
+
return name;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
// name {nmchar}+
|
|
110
|
+
static char *xnode_parse_name(XNodeQueryParser *parser) {
|
|
111
|
+
char *name = parser->data;
|
|
112
|
+
|
|
113
|
+
if (!is_name_char(*parser->next)) {
|
|
114
|
+
parser->error = XNODE_BAD_NAME_CHARACTER;
|
|
115
|
+
return NULL;
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
while (is_name_char(*parser->next)) {
|
|
119
|
+
*parser->data++ = *parser->next++;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
*parser->data++ = '\0';
|
|
123
|
+
|
|
124
|
+
return name;
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
// Parses an attribute value: [ IDENT | STRING ].
//
// An unquoted value is delegated to xnode_parse_ident. A value that starts
// with ' or " is copied into the data buffer up to the matching quote, with
// a backslash followed by that quote character unescaped to the bare quote.
// Returns the NUL-terminated copy, or NULL with parser->error set to
// XNODE_QUOTATION_MARK_EXPECTED when the closing quote is missing.
static char *xnode_parse_value(XNodeQueryParser *parser) {
  const char quote = *parser->next;
  char *start;

  if (quote != '\'' && quote != '"') {
    return xnode_parse_ident(parser);
  }

  start = parser->data;
  parser->next++;  // step over the opening quote

  for (;;) {
    const char c = *parser->next;

    if (c == '\0' || c == quote) {
      break;
    }

    if (c == '\\' && parser->next[1] == quote) {
      // Escaped quote: keep the quote, drop the backslash.
      *parser->data++ = quote;
      parser->next += 2;
      continue;
    }

    *parser->data++ = c;
    parser->next++;
  }

  // The loop stops either on the closing quote or on end of input.
  if (*parser->next != quote) {
    parser->error = XNODE_QUOTATION_MARK_EXPECTED;
    return NULL;
  }

  parser->next++;  // step over the closing quote
  *parser->data++ = '\0';
  return start;
}
|
|
161
|
+
|
|
162
|
+
static void xnode_parse_attribute_selector(XNodeQueryParser *parser,
|
|
163
|
+
XNodeSelector *selector) {
|
|
164
|
+
|
|
165
|
+
parser->next++; // skip '['
|
|
166
|
+
|
|
167
|
+
while (*parser->next != ']' && !parser->error) {
|
|
168
|
+
XNodeAttributeSelector *attr = xnew_zero(XNodeAttributeSelector);
|
|
169
|
+
|
|
170
|
+
jsa_push(&selector->attributes, attr);
|
|
171
|
+
|
|
172
|
+
if (!(attr->name = xnode_parse_ident(parser))) {
|
|
173
|
+
return;
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
skip_space(parser);
|
|
177
|
+
|
|
178
|
+
switch (*parser->next) {
|
|
179
|
+
case ']': // [... foo]
|
|
180
|
+
attr->type = XQSEL_ATTR_PRESENCE;
|
|
181
|
+
break;
|
|
182
|
+
case ',': // [foo, bar...]
|
|
183
|
+
attr->type = XQSEL_ATTR_PRESENCE;
|
|
184
|
+
parser->next++;
|
|
185
|
+
skip_space(parser);
|
|
186
|
+
continue;
|
|
187
|
+
case '=':
|
|
188
|
+
attr->type = XQSEL_ATTR_EQUAL;
|
|
189
|
+
break;
|
|
190
|
+
case '~':
|
|
191
|
+
attr->type = XQSEL_ATTR_SPACE_MATCH;
|
|
192
|
+
parser->next++;
|
|
193
|
+
break;
|
|
194
|
+
case '|':
|
|
195
|
+
attr->type = XQSEL_ATTR_DASH_MATCH;
|
|
196
|
+
parser->next++;
|
|
197
|
+
break;
|
|
198
|
+
case '^':
|
|
199
|
+
attr->type = XQSEL_ATTR_PREFIX_MATCH;
|
|
200
|
+
parser->next++;
|
|
201
|
+
break;
|
|
202
|
+
case '$':
|
|
203
|
+
attr->type = XQSEL_ATTR_SUFFIX_MATCH;
|
|
204
|
+
parser->next++;
|
|
205
|
+
break;
|
|
206
|
+
case '*':
|
|
207
|
+
attr->type = XQSEL_ATTR_SUBSTRING_MATCH;
|
|
208
|
+
parser->next++;
|
|
209
|
+
break;
|
|
210
|
+
default:
|
|
211
|
+
parser->error = XNODE_UNEXPECTED_CHAR;
|
|
212
|
+
return;
|
|
213
|
+
}
|
|
214
|
+
|
|
215
|
+
if (*parser->next != ']') {
|
|
216
|
+
if (*parser->next == '=') {
|
|
217
|
+
// skip the '=' char
|
|
218
|
+
parser->next++;
|
|
219
|
+
|
|
220
|
+
// space before value
|
|
221
|
+
skip_space(parser);
|
|
222
|
+
|
|
223
|
+
// Selector needs a value
|
|
224
|
+
if (!(attr->value = xnode_parse_value(parser))) {
|
|
225
|
+
return;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
// space after value
|
|
229
|
+
skip_space(parser);
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
switch (*parser->next) {
|
|
233
|
+
case ']': // [... foo]
|
|
234
|
+
break;
|
|
235
|
+
case ',': // [foo, bar...]
|
|
236
|
+
parser->next++;
|
|
237
|
+
skip_space(parser);
|
|
238
|
+
default:
|
|
239
|
+
continue;
|
|
240
|
+
}
|
|
241
|
+
}
|
|
242
|
+
}
|
|
243
|
+
|
|
244
|
+
parser->next++; // skip ']'
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
static void xnode_parse_class_selector(XNodeQueryParser *parser,
|
|
248
|
+
XNodeSelector *selector) {
|
|
249
|
+
// class: '.' IDENT
|
|
250
|
+
const char *class_name = xnode_parse_ident(parser);
|
|
251
|
+
if (class_name) {
|
|
252
|
+
XNodeAttributeSelector *attr = xnew_zero(XNodeAttributeSelector);
|
|
253
|
+
attr->name = "class";
|
|
254
|
+
attr->value = class_name;
|
|
255
|
+
attr->type = XQSEL_ATTR_SPACE_MATCH;
|
|
256
|
+
jsa_push(&selector->attributes, attr);
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
static void xnode_parse_id_selector(XNodeQueryParser *parser,
|
|
261
|
+
XNodeSelector *selector) {
|
|
262
|
+
// "#"{name}
|
|
263
|
+
const char *id = xnode_parse_name(parser);
|
|
264
|
+
if (id) {
|
|
265
|
+
XNodeAttributeSelector *attr = xnew_zero(XNodeAttributeSelector);
|
|
266
|
+
attr->name = "id";
|
|
267
|
+
attr->value = id;
|
|
268
|
+
attr->type = XQSEL_ATTR_EQUAL;
|
|
269
|
+
jsa_push(&selector->attributes, attr);
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
static XNodeSelector *xnode_selector_alloc() {
|
|
274
|
+
XNodeSelector *result = (XNodeSelector *) malloc(sizeof(XNodeSelector));
|
|
275
|
+
|
|
276
|
+
result->type = NULL;
|
|
277
|
+
jsa_init(&result->attributes);
|
|
278
|
+
result->combinator = XQSEL_DESCENDANT;
|
|
279
|
+
|
|
280
|
+
return result;
|
|
281
|
+
}
|
|
282
|
+
|
|
283
|
+
// Releases every attribute selector stored in selector->attributes, then the
// attribute list itself, then the selector. The name/value strings the
// attributes point at are not freed here.
static void xnode_selector_free(XNodeSelector *selector) {
  uint32_t idx = 0;

  while (idx < selector->attributes.size) {
    XNodeAttributeSelector *entry =
        (XNodeAttributeSelector*) selector->attributes.item[idx];

    if (entry) {
      xfree(entry);
    }
    idx++;
  }

  jsa_clean(&selector->attributes);
  xfree(selector);
}
|
|
295
|
+
|
|
296
|
+
// Parses one sequence of simple selectors (type/universal selector followed
// by any number of attribute, class and id selectors) at parser->next.
//
// Returns NULL without an error when only whitespace remains in the input.
// Returns NULL with parser->error set when the selector cannot be allocated.
// Otherwise returns a newly allocated selector; parser->error may still have
// been set by one of the sub-parsers, in which case the caller is expected to
// free the returned selector. Parsing stops at the first character that
// cannot continue the sequence (for example a combinator).
static XNodeSelector *xnode_parse_selector(XNodeQueryParser *parser) {
  XNodeSelector *selector;

  if (!skip_space(parser)) {
    return NULL;
  }

  selector = xnode_selector_alloc();

  // Report allocation failure as an error so the caller does not mistake the
  // NULL result for "no more selectors".
  if (!selector) {
    parser->error = "Out of memory";
    return NULL;
  }

  // If a universal selector represented by * (i.e. without a namespace
  // prefix) is not the only component of a sequence of simple selectors or
  // is immediately followed by a pseudo-element, then the * may be omitted
  // and the universal selector's presence implied.
  while (*parser->next && !parser->error) {
    switch (*parser->next) {
      case '*':
        if (!selector->type) {
          selector->type = "*";
          parser->next++;
          continue;
        }
        else {
          // A '*' after the type has already been fixed is not allowed.
          parser->error = XNODE_UNEXPECTED_CHAR;
          break;
        }
      case '[':
        if (!selector->type) selector->type = "*";
        xnode_parse_attribute_selector(parser, selector);
        break;
      case '.':
        parser->next++;
        if (!selector->type) selector->type = "*";
        xnode_parse_class_selector(parser, selector);
        break;
      case '#':
        parser->next++;
        if (!selector->type) selector->type = "*";
        xnode_parse_id_selector(parser, selector);
        break;
      default:
        if (!selector->type) {
          // element_name: IDENT
          selector->type = xnode_parse_ident(parser);
          break;
        }
        // Anything else ends this sequence; the caller handles it.
        return selector;
    }
  }

  return selector;
}
|
|
347
|
+
|
|
348
|
+
static XNodeQuery* xnode_query_alloc(char *data) {
|
|
349
|
+
XNodeQuery *query = xnew(XNodeQuery);
|
|
350
|
+
query->data = data;
|
|
351
|
+
jsa_init(&query->selectors);
|
|
352
|
+
jsa_init(&query->result[0]);
|
|
353
|
+
jsa_init(&query->result[1]);
|
|
354
|
+
return query;
|
|
355
|
+
}
|
|
356
|
+
|
|
357
|
+
// Destroys a query: frees every selector in query->selectors, cleans the
// selector list and both result lists, then frees the string buffer
// (query->data) and the query itself. Passing NULL is a no-op.
void xnode_query_free(XNodeQuery *query) {
  uint32_t i;

  if (!query) {
    return;
  }

  for (i = 0; i < jsa_size(&query->selectors); i++) {
    xnode_selector_free((XNodeSelector *)jsa_get(&query->selectors, i));
  }
  jsa_clean(&query->selectors);
  jsa_clean(&query->result[0]);
  jsa_clean(&query->result[1]);
  xfree(query->data);
  xfree(query);
}
|
|
368
|
+
|
|
369
|
+
XNodeQuery *xnode_query_create(const char *query_text, const char **errptr,
|
|
370
|
+
int *erroffset) {
|
|
371
|
+
XNodeQueryParser parser;
|
|
372
|
+
XNodeQuery *query;
|
|
373
|
+
XNodeQueryType combinator;
|
|
374
|
+
|
|
375
|
+
parser.next = query_text;
|
|
376
|
+
parser.data = (char*) xmalloc(strlen(query_text) + 1);
|
|
377
|
+
parser.error = NULL;
|
|
378
|
+
|
|
379
|
+
// To support queries like node:find('>span.hidden')
|
|
380
|
+
if ((combinator = xnode_parse_combinator(&parser)) == XQSEL_UNKNOWN_TYPE) {
|
|
381
|
+
combinator = XQSEL_DESCENDANT;
|
|
382
|
+
parser.error = NULL;
|
|
383
|
+
}
|
|
384
|
+
|
|
385
|
+
query = xnode_query_alloc(parser.data);
|
|
386
|
+
|
|
387
|
+
while (*parser.next && !parser.error) {
|
|
388
|
+
XNodeSelector *selector = xnode_parse_selector(&parser);
|
|
389
|
+
if (parser.error) {
|
|
390
|
+
if (selector) {
|
|
391
|
+
xnode_selector_free(selector);
|
|
392
|
+
}
|
|
393
|
+
break;
|
|
394
|
+
}
|
|
395
|
+
if (!selector) break;
|
|
396
|
+
selector->combinator = combinator;
|
|
397
|
+
jsa_push(&query->selectors, selector);
|
|
398
|
+
if (!*parser.next) break;
|
|
399
|
+
combinator = xnode_parse_combinator(&parser);
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
if (!parser.error && query->selectors.size == 0) {
|
|
403
|
+
parser.error = XNODE_EMPTY_SELECTOR;
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
if (parser.error) {
|
|
407
|
+
xnode_query_free(query);
|
|
408
|
+
if (errptr) *errptr = parser.error;
|
|
409
|
+
if (erroffset) *erroffset = (int)(parser.next - query_text);
|
|
410
|
+
return NULL;
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
return query;
|
|
414
|
+
}
|
package/install.js
DELETED
|
@@ -1,15 +0,0 @@
|
|
|
1
|
-
// Install-time helper: downloads the prebuilt native addon that matches the
// current platform and CPU architecture and stores it next to this script as
// "html.node".
const https = require("https");
const fs = require("fs");
const os = require("os");
const path = require("path");

const libs = "https://raw.githubusercontent.com/fangwd/gumbo-html/main/libs";
const url = `${libs}/${process.platform}/${os.arch()}/html.node`;
const target = path.join(__dirname, "html.node");

// Reports a failed download and marks the process as failed, so that a
// missing or broken binary is not mistaken for a successful install.
function fail(message) {
  console.error(`Failed to download ${url}: ${message}`);
  process.exitCode = 1;
}

https
  .get(url, (response) => {
    // Only a 200 response carries the binary; writing any other body (for
    // example a 404 page) to html.node would leave an unloadable addon.
    if (response.statusCode !== 200) {
      response.resume(); // drain the body so the socket is released
      fail(`unexpected HTTP status ${response.statusCode}`);
      return;
    }

    const file = fs.createWriteStream(target);

    // Removes a partially written file before reporting the failure.
    const abort = (err) => {
      file.destroy();
      fs.unlink(target, () => fail(err.message));
    };

    file.on("error", abort);
    response.on("error", abort);
    file.on("finish", () => {
      file.close();
    });

    response.pipe(file);
  })
  .on("error", (err) => fail(err.message));
|