gumbo-html 0.2.3 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +27 -10
  2. package/binding.gyp +49 -0
  3. package/examples/example.js +87 -0
  4. package/examples/scrape.js +301 -0
  5. package/index.d.ts +58 -3
  6. package/index.js +7 -2
  7. package/lib/wrapper.js +385 -0
  8. package/package.json +36 -5
  9. package/src/addon.cc +19 -0
  10. package/src/gumbo-parser/COPYING +201 -0
  11. package/src/gumbo-parser/README.md +8 -0
  12. package/src/gumbo-parser/src/attribute.c +44 -0
  13. package/src/gumbo-parser/src/attribute.h +37 -0
  14. package/src/gumbo-parser/src/char_ref.c +23069 -0
  15. package/src/gumbo-parser/src/char_ref.h +60 -0
  16. package/src/gumbo-parser/src/error.c +279 -0
  17. package/src/gumbo-parser/src/error.h +225 -0
  18. package/src/gumbo-parser/src/gumbo.h +671 -0
  19. package/src/gumbo-parser/src/insertion_mode.h +57 -0
  20. package/src/gumbo-parser/src/parser.c +4192 -0
  21. package/src/gumbo-parser/src/parser.h +57 -0
  22. package/src/gumbo-parser/src/string_buffer.c +110 -0
  23. package/src/gumbo-parser/src/string_buffer.h +84 -0
  24. package/src/gumbo-parser/src/string_piece.c +48 -0
  25. package/src/gumbo-parser/src/string_piece.h +38 -0
  26. package/src/gumbo-parser/src/tag.c +95 -0
  27. package/src/gumbo-parser/src/tag_enum.h +153 -0
  28. package/src/gumbo-parser/src/tag_gperf.h +105 -0
  29. package/src/gumbo-parser/src/tag_sizes.h +4 -0
  30. package/src/gumbo-parser/src/tag_strings.h +153 -0
  31. package/src/gumbo-parser/src/token_type.h +41 -0
  32. package/src/gumbo-parser/src/tokenizer.c +2897 -0
  33. package/src/gumbo-parser/src/tokenizer.h +123 -0
  34. package/src/gumbo-parser/src/tokenizer_states.h +103 -0
  35. package/src/gumbo-parser/src/utf8.c +270 -0
  36. package/src/gumbo-parser/src/utf8.h +132 -0
  37. package/src/gumbo-parser/src/util.c +58 -0
  38. package/src/gumbo-parser/src/util.h +60 -0
  39. package/src/gumbo-parser/src/vector.c +123 -0
  40. package/src/gumbo-parser/src/vector.h +67 -0
  41. package/src/html_document.cc +411 -0
  42. package/src/html_document.h +56 -0
  43. package/src/html_element.cc +963 -0
  44. package/src/html_element.h +70 -0
  45. package/src/include/win/strings.h +11 -0
  46. package/src/jsa.c +182 -0
  47. package/src/jsa.h +44 -0
  48. package/src/xnode.c +372 -0
  49. package/src/xnode_query.c +330 -0
  50. package/src/xnode_query.h +186 -0
  51. package/src/xnode_query_parser.c +414 -0
  52. package/install.js +0 -15
@@ -0,0 +1,414 @@
1
+ #include "xnode_query.h"
2
+
3
+ #include <ctype.h>
4
+ #include <string.h>
5
+ #include <stdio.h>
6
+ #include <stdlib.h>
7
+
8
+ #define XNODE_BAD_NAME_CHARACTER "Illegal name character"
9
+ #define XNODE_BAD_COMBINATOR "Illegal combinator"
10
+ #define XNODE_EMPTY_SELECTOR "Empty selector"
11
+ #define XNODE_UNEXPECTED_CHAR "Illegal character"
12
+ #define XNODE_QUOTATION_MARK_EXPECTED "Missing quotation mark"
13
+
14
+ #define xnew_zero(type) ((type*)calloc(1, sizeof (type)))
15
+
16
+ typedef struct {
17
+ const char *next; // Next input char
18
+ char *data; // Data buffer
19
+ const char *error; // Error message
20
+ XNodeSelector *selector;
21
+ } XNodeQueryParser;
22
+
23
// CSS whitespace class: w [ \t\r\n\f]
// Returns 1 when c is a space, tab, carriage return, line feed or form feed,
// and 0 for every other value.
static inline int is_space(int c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
    case '\f':
      return 1;
    default:
      return 0;
  }
}
27
+
28
+ static inline char skip_space(XNodeQueryParser *parser) {
29
+ while (*parser->next && is_space(*parser->next)) {
30
+ parser->next++;
31
+ }
32
+ return *parser->next;
33
+ }
34
+
35
+ // combinator
36
+ // /* combinators can be surrounded by whitespace */
37
+ // : PLUS S* | GREATER S* | TILDE S* | S+
38
+ // ;
39
+ XNodeQueryType xnode_parse_combinator(XNodeQueryParser *parser) {
40
+ XNodeQueryType result = XQSEL_UNKNOWN_TYPE;
41
+
42
+ if (is_space(*parser->next)) {
43
+ // E F: an F element descendant of an E element
44
+ result = XQSEL_DESCENDANT;
45
+ skip_space(parser);
46
+ }
47
+
48
+ switch (*parser->next) {
49
+ case '>': // E > F: an F element child of an E element
50
+ result = XQSEL_CHILD;
51
+ break;
52
+ case '+': // E + F: an F element immediately preceded by an E element
53
+ result = XQSEL_ADJACENT_SIBLING;
54
+ break;
55
+ case '~': // E ~ F: an F element preceded by an E element
56
+ result = XQSEL_GENERAL_SIBLING;
57
+ break;
58
+ default:
59
+ break;
60
+ }
61
+
62
+ if (result == XQSEL_UNKNOWN_TYPE) {
63
+ parser->error = XNODE_BAD_COMBINATOR;
64
+ return XQSEL_UNKNOWN_TYPE;
65
+ }
66
+
67
+ if (result != XQSEL_DESCENDANT) {
68
+ parser->next++;
69
+ skip_space(parser);
70
+ }
71
+
72
+ return result;
73
+ }
74
+
75
// nmstart [_a-z]|{nonascii}|{escape}
// Returns 1 for an ASCII letter (either case) or underscore, 0 otherwise.
// Only the ASCII part of the grammar is implemented here: non-ASCII
// characters and escapes are not recognised.
static inline int is_name_start_char(int c) {
  if (c == '_') {
    return 1;
  }
  if (c >= 'a' && c <= 'z') {
    return 1;
  }
  return c >= 'A' && c <= 'Z';
}
79
+
80
// nmchar [_a-z0-9-]|{nonascii}|{escape}
// Returns 1 for a hyphen, an ASCII digit, or anything accepted by
// is_name_start_char(); 0 otherwise.
static inline int is_name_char(int c) {
  if (c == '-') {
    return 1;
  }
  if (c >= '0' && c <= '9') {
    return 1;
  }
  return is_name_start_char(c);
}
84
+
85
+ // ident [-]?{nmstart}{nmchar}*
86
+ static char *xnode_parse_ident(XNodeQueryParser *parser) {
87
+ char *name = parser->data;
88
+
89
+ if (*parser->next == '-') {
90
+ *parser->data++ = *parser->next++;
91
+ }
92
+
93
+ if (!is_name_start_char(*parser->next)) {
94
+ parser->error = XNODE_BAD_NAME_CHARACTER;
95
+ return NULL;
96
+ }
97
+
98
+ *parser->data++ = *parser->next++;
99
+
100
+ while (is_name_char(*parser->next)) {
101
+ *parser->data++ = *parser->next++;
102
+ }
103
+
104
+ *parser->data++ = '\0';
105
+
106
+ return name;
107
+ }
108
+
109
+ // name {nmchar}+
110
+ static char *xnode_parse_name(XNodeQueryParser *parser) {
111
+ char *name = parser->data;
112
+
113
+ if (!is_name_char(*parser->next)) {
114
+ parser->error = XNODE_BAD_NAME_CHARACTER;
115
+ return NULL;
116
+ }
117
+
118
+ while (is_name_char(*parser->next)) {
119
+ *parser->data++ = *parser->next++;
120
+ }
121
+
122
+ *parser->data++ = '\0';
123
+
124
+ return name;
125
+ }
126
+
127
+ static char *xnode_parse_value(XNodeQueryParser *parser) {
128
+ char *value = parser->data;
129
+ char quote = *parser->next;
130
+
131
+ // [ IDENT | STRING ]
132
+ if (quote != '\'' && quote != '"') {
133
+ return xnode_parse_ident(parser);
134
+ }
135
+
136
+ // Skip '\'' or '"'
137
+ parser->next++;
138
+
139
+ while (*parser->next && *parser->next != quote) {
140
+ if (*parser->next == '\\' && *(parser->next + 1) == quote) {
141
+ *parser->data++ = quote;
142
+ parser->next += 2;
143
+ }
144
+ else {
145
+ *parser->data++ = *parser->next++;
146
+ }
147
+ }
148
+
149
+ if (*parser->next != quote) {
150
+ parser->error = XNODE_QUOTATION_MARK_EXPECTED;
151
+ return NULL;
152
+ }
153
+
154
+ // Skip '\'' or '"'
155
+ parser->next++;
156
+
157
+ *parser->data++ = '\0';
158
+
159
+ return value;
160
+ }
161
+
162
+ static void xnode_parse_attribute_selector(XNodeQueryParser *parser,
163
+ XNodeSelector *selector) {
164
+
165
+ parser->next++; // skip '['
166
+
167
+ while (*parser->next != ']' && !parser->error) {
168
+ XNodeAttributeSelector *attr = xnew_zero(XNodeAttributeSelector);
169
+
170
+ jsa_push(&selector->attributes, attr);
171
+
172
+ if (!(attr->name = xnode_parse_ident(parser))) {
173
+ return;
174
+ }
175
+
176
+ skip_space(parser);
177
+
178
+ switch (*parser->next) {
179
+ case ']': // [... foo]
180
+ attr->type = XQSEL_ATTR_PRESENCE;
181
+ break;
182
+ case ',': // [foo, bar...]
183
+ attr->type = XQSEL_ATTR_PRESENCE;
184
+ parser->next++;
185
+ skip_space(parser);
186
+ continue;
187
+ case '=':
188
+ attr->type = XQSEL_ATTR_EQUAL;
189
+ break;
190
+ case '~':
191
+ attr->type = XQSEL_ATTR_SPACE_MATCH;
192
+ parser->next++;
193
+ break;
194
+ case '|':
195
+ attr->type = XQSEL_ATTR_DASH_MATCH;
196
+ parser->next++;
197
+ break;
198
+ case '^':
199
+ attr->type = XQSEL_ATTR_PREFIX_MATCH;
200
+ parser->next++;
201
+ break;
202
+ case '$':
203
+ attr->type = XQSEL_ATTR_SUFFIX_MATCH;
204
+ parser->next++;
205
+ break;
206
+ case '*':
207
+ attr->type = XQSEL_ATTR_SUBSTRING_MATCH;
208
+ parser->next++;
209
+ break;
210
+ default:
211
+ parser->error = XNODE_UNEXPECTED_CHAR;
212
+ return;
213
+ }
214
+
215
+ if (*parser->next != ']') {
216
+ if (*parser->next == '=') {
217
+ // skip the '=' char
218
+ parser->next++;
219
+
220
+ // space before value
221
+ skip_space(parser);
222
+
223
+ // Selector needs a value
224
+ if (!(attr->value = xnode_parse_value(parser))) {
225
+ return;
226
+ }
227
+
228
+ // space after value
229
+ skip_space(parser);
230
+ }
231
+
232
+ switch (*parser->next) {
233
+ case ']': // [... foo]
234
+ break;
235
+ case ',': // [foo, bar...]
236
+ parser->next++;
237
+ skip_space(parser);
238
+ default:
239
+ continue;
240
+ }
241
+ }
242
+ }
243
+
244
+ parser->next++; // skip ']'
245
+ }
246
+
247
+ static void xnode_parse_class_selector(XNodeQueryParser *parser,
248
+ XNodeSelector *selector) {
249
+ // class: '.' IDENT
250
+ const char *class_name = xnode_parse_ident(parser);
251
+ if (class_name) {
252
+ XNodeAttributeSelector *attr = xnew_zero(XNodeAttributeSelector);
253
+ attr->name = "class";
254
+ attr->value = class_name;
255
+ attr->type = XQSEL_ATTR_SPACE_MATCH;
256
+ jsa_push(&selector->attributes, attr);
257
+ }
258
+ }
259
+
260
+ static void xnode_parse_id_selector(XNodeQueryParser *parser,
261
+ XNodeSelector *selector) {
262
+ // "#"{name}
263
+ const char *id = xnode_parse_name(parser);
264
+ if (id) {
265
+ XNodeAttributeSelector *attr = xnew_zero(XNodeAttributeSelector);
266
+ attr->name = "id";
267
+ attr->value = id;
268
+ attr->type = XQSEL_ATTR_EQUAL;
269
+ jsa_push(&selector->attributes, attr);
270
+ }
271
+ }
272
+
273
+ static XNodeSelector *xnode_selector_alloc() {
274
+ XNodeSelector *result = (XNodeSelector *) malloc(sizeof(XNodeSelector));
275
+
276
+ result->type = NULL;
277
+ jsa_init(&result->attributes);
278
+ result->combinator = XQSEL_DESCENDANT;
279
+
280
+ return result;
281
+ }
282
+
283
+ static void xnode_selector_free(XNodeSelector *selector) {
284
+ uint32_t i;
285
+
286
+ for (i = 0; i < selector->attributes.size; i++) {
287
+ XNodeAttributeSelector *attr;
288
+ if ((attr = (XNodeAttributeSelector*) selector->attributes.item[i])) {
289
+ xfree(attr);
290
+ }
291
+ }
292
+ jsa_clean(&selector->attributes);
293
+ xfree(selector);
294
+ }
295
+
296
+ static XNodeSelector *xnode_parse_selector(XNodeQueryParser *parser) {
297
+ XNodeSelector *selector;
298
+
299
+ if (!skip_space(parser)) {
300
+ return NULL;
301
+ }
302
+
303
+ selector = xnode_selector_alloc();
304
+
305
+ // If a universal selector represented by * (i.e. without a namespace
306
+ // prefix) is not the only component of a sequence of simple selectors or
307
+ // is immediately followed by a pseudo-element, then the * may be omitted
308
+ // and the universal selector's presence implied.
309
+ while (*parser->next && !parser->error) {
310
+ switch (*parser->next) {
311
+ case '*':
312
+ if (!selector->type) {
313
+ selector->type = "*";
314
+ parser->next++;
315
+ continue;
316
+ }
317
+ else {
318
+ parser->error = XNODE_UNEXPECTED_CHAR;
319
+ break;
320
+ }
321
+ case '[':
322
+ if (!selector->type) selector->type = "*";
323
+ xnode_parse_attribute_selector(parser, selector);
324
+ break;
325
+ case '.':
326
+ parser->next++;
327
+ if (!selector->type) selector->type = "*";
328
+ xnode_parse_class_selector(parser, selector);
329
+ break;
330
+ case '#':
331
+ parser->next++;
332
+ if (!selector->type) selector->type = "*";
333
+ xnode_parse_id_selector(parser, selector);
334
+ break;
335
+ default:
336
+ if (!selector->type) {
337
+ // element_name: IDENT
338
+ selector->type = xnode_parse_ident(parser);
339
+ break;
340
+ }
341
+ return selector;
342
+ }
343
+ }
344
+
345
+ return selector;
346
+ }
347
+
348
+ static XNodeQuery* xnode_query_alloc(char *data) {
349
+ XNodeQuery *query = xnew(XNodeQuery);
350
+ query->data = data;
351
+ jsa_init(&query->selectors);
352
+ jsa_init(&query->result[0]);
353
+ jsa_init(&query->result[1]);
354
+ return query;
355
+ }
356
+
357
+ void xnode_query_free(XNodeQuery *query) {
358
+ uint32_t i;
359
+ for (i = 0; i < jsa_size(&query->selectors); i++) {
360
+ xnode_selector_free((XNodeSelector *)jsa_get(&query->selectors, i));
361
+ }
362
+ jsa_clean(&query->selectors);
363
+ jsa_clean(&query->result[0]);
364
+ jsa_clean(&query->result[1]);
365
+ xfree(query->data);
366
+ xfree(query);
367
+ }
368
+
369
+ XNodeQuery *xnode_query_create(const char *query_text, const char **errptr,
370
+ int *erroffset) {
371
+ XNodeQueryParser parser;
372
+ XNodeQuery *query;
373
+ XNodeQueryType combinator;
374
+
375
+ parser.next = query_text;
376
+ parser.data = (char*) xmalloc(strlen(query_text) + 1);
377
+ parser.error = NULL;
378
+
379
+ // To support queries like node:find('>span.hidden')
380
+ if ((combinator = xnode_parse_combinator(&parser)) == XQSEL_UNKNOWN_TYPE) {
381
+ combinator = XQSEL_DESCENDANT;
382
+ parser.error = NULL;
383
+ }
384
+
385
+ query = xnode_query_alloc(parser.data);
386
+
387
+ while (*parser.next && !parser.error) {
388
+ XNodeSelector *selector = xnode_parse_selector(&parser);
389
+ if (parser.error) {
390
+ if (selector) {
391
+ xnode_selector_free(selector);
392
+ }
393
+ break;
394
+ }
395
+ if (!selector) break;
396
+ selector->combinator = combinator;
397
+ jsa_push(&query->selectors, selector);
398
+ if (!*parser.next) break;
399
+ combinator = xnode_parse_combinator(&parser);
400
+ }
401
+
402
+ if (!parser.error && query->selectors.size == 0) {
403
+ parser.error = XNODE_EMPTY_SELECTOR;
404
+ }
405
+
406
+ if (parser.error) {
407
+ xnode_query_free(query);
408
+ if (errptr) *errptr = parser.error;
409
+ if (erroffset) *erroffset = (int)(parser.next - query_text);
410
+ return NULL;
411
+ }
412
+
413
+ return query;
414
+ }
package/install.js DELETED
@@ -1,15 +0,0 @@
1
// Install-time script: downloads the prebuilt native addon for the current
// platform/architecture and stores it next to this file as html.node.
const https = require("https");
const fs = require("fs");
const os = require("os");
const path = require("path");

const libs = "https://raw.githubusercontent.com/fangwd/gumbo-html/main/libs";
const url = `${libs}/${process.platform}/${os.arch()}/html.node`;
const target = path.join(__dirname, "html.node");

// Reports a failed download and makes the process exit with a non-zero code.
function fail(message) {
  console.error(`gumbo-html: ${message}`);
  process.exitCode = 1;
}

https
  .get(url, (response) => {
    // Anything other than 200 is not the binary (for example a 404 page for
    // an unsupported platform); do not write it to disk.
    if (response.statusCode !== 200) {
      response.resume(); // discard the body so the socket is released
      fail(`download of ${url} failed with HTTP ${response.statusCode}`);
      return;
    }

    // Create the output file only once the response is known to be good.
    const file = fs.createWriteStream(target);

    file.on("finish", () => {
      file.close();
    });

    file.on("error", (err) => {
      // Remove the partial file so a truncated addon is never loaded.
      fs.unlink(target, () => {});
      fail(`could not write ${target}: ${err.message}`);
    });

    response.on("error", (err) => {
      file.destroy();
      fs.unlink(target, () => {});
      fail(`download of ${url} was interrupted: ${err.message}`);
    });

    response.pipe(file);
  })
  .on("error", (err) => {
    fail(`request for ${url} failed: ${err.message}`);
  });