gumbo-html 0.2.4 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. package/README.md +27 -10
  2. package/binding.gyp +49 -0
  3. package/examples/example.js +87 -0
  4. package/examples/scrape.js +301 -0
  5. package/index.d.ts +58 -3
  6. package/index.js +7 -2
  7. package/lib/wrapper.js +385 -0
  8. package/package.json +36 -5
  9. package/src/addon.cc +19 -0
  10. package/src/gumbo-parser/COPYING +201 -0
  11. package/src/gumbo-parser/README.md +8 -0
  12. package/src/gumbo-parser/src/attribute.c +44 -0
  13. package/src/gumbo-parser/src/attribute.h +37 -0
  14. package/src/gumbo-parser/src/char_ref.c +23069 -0
  15. package/src/gumbo-parser/src/char_ref.h +60 -0
  16. package/src/gumbo-parser/src/error.c +279 -0
  17. package/src/gumbo-parser/src/error.h +225 -0
  18. package/src/gumbo-parser/src/gumbo.h +671 -0
  19. package/src/gumbo-parser/src/insertion_mode.h +57 -0
  20. package/src/gumbo-parser/src/parser.c +4192 -0
  21. package/src/gumbo-parser/src/parser.h +57 -0
  22. package/src/gumbo-parser/src/string_buffer.c +110 -0
  23. package/src/gumbo-parser/src/string_buffer.h +84 -0
  24. package/src/gumbo-parser/src/string_piece.c +48 -0
  25. package/src/gumbo-parser/src/string_piece.h +38 -0
  26. package/src/gumbo-parser/src/tag.c +95 -0
  27. package/src/gumbo-parser/src/tag_enum.h +153 -0
  28. package/src/gumbo-parser/src/tag_gperf.h +105 -0
  29. package/src/gumbo-parser/src/tag_sizes.h +4 -0
  30. package/src/gumbo-parser/src/tag_strings.h +153 -0
  31. package/src/gumbo-parser/src/token_type.h +41 -0
  32. package/src/gumbo-parser/src/tokenizer.c +2897 -0
  33. package/src/gumbo-parser/src/tokenizer.h +123 -0
  34. package/src/gumbo-parser/src/tokenizer_states.h +103 -0
  35. package/src/gumbo-parser/src/utf8.c +270 -0
  36. package/src/gumbo-parser/src/utf8.h +132 -0
  37. package/src/gumbo-parser/src/util.c +58 -0
  38. package/src/gumbo-parser/src/util.h +60 -0
  39. package/src/gumbo-parser/src/vector.c +123 -0
  40. package/src/gumbo-parser/src/vector.h +67 -0
  41. package/src/html_document.cc +411 -0
  42. package/src/html_document.h +56 -0
  43. package/src/html_element.cc +963 -0
  44. package/src/html_element.h +70 -0
  45. package/src/include/win/strings.h +11 -0
  46. package/src/jsa.c +182 -0
  47. package/src/jsa.h +44 -0
  48. package/src/xnode.c +372 -0
  49. package/src/xnode_query.c +330 -0
  50. package/src/xnode_query.h +186 -0
  51. package/src/xnode_query_parser.c +414 -0
  52. package/install.js +0 -15
@@ -0,0 +1,330 @@
1
+ #include "xnode_query.h"
2
+
3
+ #include <assert.h>
4
+ #include <ctype.h>
5
+ #include <string.h>
6
+
7
+ #ifdef _WIN32
8
+ #define strcasecmp(x,y) _stricmp(x,y)
9
+ #define strncasecmp(x,y,z) _strnicmp(x,y,z)
10
+ #else
11
+ #include <strings.h>
12
+ #endif
13
+
14
/**
 * Report whether `str` ends with `suffix` (case-sensitive).
 *
 * Returns 1 when the last strlen(suffix) bytes of `str` equal `suffix`; an
 * empty suffix therefore always matches. Returns 0 otherwise, or when either
 * argument is NULL.
 */
static int string_ends_with(const char *str, const char *suffix) {
  size_t str_len, suffix_len;

  if (str == NULL || suffix == NULL) {
    return 0;
  }

  str_len = strlen(str);
  suffix_len = strlen(suffix);
  if (str_len < suffix_len) {
    return 0;
  }

  /* Both ranges are exactly suffix_len bytes long and lie inside the strings. */
  return memcmp(str + (str_len - suffix_len), suffix, suffix_len) == 0;
}
24
+
25
/**
 * Report whether `c` is one of the whitespace characters recognised by HTML
 * and CSS selectors: space, TAB, LF, FF or CR.
 *
 * The set is spelled out instead of delegating to isspace() because isspace()
 * depends on the current locale and also accepts vertical tab ('\v'), which
 * is not a separator in space-separated HTML attribute values.
 *
 * Returns 1 for whitespace, 0 for anything else (including NUL).
 */
static int is_html_space(char c) {
  switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\f':
    case '\r':
      return 1;
    default:
      return 0;
  }
}
28
+
29
/**
 * Report whether `tok` occurs as a whole item in the whitespace-separated
 * list `str` (the `~=` attribute test).
 *
 * Returns 1 when some item of `str` starts with `tok` and is followed by
 * whitespace or the end of the string; returns 0 otherwise, and also when
 * either argument is NULL or `tok` is empty. Comparison is case-sensitive.
 */
static int string_in_space_list(const char *str, const char *tok) {
  const char *cursor;
  size_t want;

  if (str == NULL || tok == NULL || tok[0] == '\0') {
    return 0;
  }

  want = strlen(tok);
  cursor = str;

  for (;;) {
    /* Move to the start of the next item. */
    while (*cursor != '\0' && is_html_space(*cursor)) {
      cursor++;
    }
    if (*cursor == '\0') {
      return 0;
    }

    if (strncmp(cursor, tok, want) == 0) {
      char after = cursor[want];
      if (after == '\0' || is_html_space(after)) {
        return 1;
      }
    }

    /* Not this item: skip the rest of it. */
    while (*cursor != '\0' && !is_html_space(*cursor)) {
      cursor++;
    }
  }
}
54
+
55
/**
 * Implement the `|=` attribute test: `str` matches when it is exactly `tok`
 * or begins with `tok` immediately followed by '-'.
 *
 * Returns 1 on a match and 0 otherwise; NULL arguments and an empty `tok`
 * never match. Comparison is case-sensitive.
 */
static int string_dash_match(const char *str, const char *tok) {
  size_t prefix_len;

  if (str == NULL || tok == NULL || tok[0] == '\0') {
    return 0;
  }

  prefix_len = strlen(tok);
  if (strncmp(str, tok, prefix_len) != 0) {
    return 0;
  }

  /* The prefix matched, so str has at least prefix_len characters. */
  return str[prefix_len] == '\0' || str[prefix_len] == '-';
}
65
+
66
+ /**
67
+ * Evaluate an attribute expression against an attribute.
68
+ */
69
+ static int xnode_match_attribute(GumboAttribute *attribute,
70
+ XNodeAttributeSelector *selector) {
71
+
72
+ if (strcasecmp(attribute->name, selector->name) == 0) {
73
+ switch (selector->type) {
74
+ case XQSEL_ATTR_PRESENCE:
75
+ return 1;
76
+ case XQSEL_ATTR_EQUAL:
77
+ return !strcmp(attribute->value, selector->value);
78
+ case XQSEL_ATTR_SPACE_MATCH: // ~=
79
+ return string_in_space_list(attribute->value, selector->value);
80
+ case XQSEL_ATTR_DASH_MATCH: // |=
81
+ return string_dash_match(attribute->value, selector->value);
82
+ case XQSEL_ATTR_PREFIX_MATCH: // ^=
83
+ return !strncmp(selector->value, attribute->value, strlen(selector->value));
84
+ case XQSEL_ATTR_SUFFIX_MATCH: // $=
85
+ return string_ends_with(attribute->value, selector->value);
86
+ case XQSEL_ATTR_SUBSTRING_MATCH: // *=
87
+ return strstr(attribute->value, selector->value) != NULL;
88
+ default:
89
+ assert(0);
90
+ break;
91
+ }
92
+ }
93
+
94
+ return 0;
95
+ }
96
+
97
+ const char *xnode_get_tag_name(GumboNode *node, size_t *length) {
98
+ const char *data;
99
+
100
+ if (node->type == GUMBO_NODE_DOCUMENT) {
101
+ data = "document";
102
+ *length = sizeof("document") - 1;
103
+ }
104
+ else {
105
+ data = gumbo_normalized_tagname(node->v.element.tag);
106
+ *length = strlen(data);
107
+ }
108
+
109
+ if (*length == 0) {
110
+ GumboStringPiece tag = node->v.element.original_tag;
111
+ if (tag.data != NULL) {
112
+ gumbo_tag_from_original_text(&tag);
113
+ data = tag.data;
114
+ *length = tag.length;
115
+ }
116
+ else {
117
+ data = NULL;
118
+ *length = 0;
119
+ }
120
+ }
121
+
122
+ return data;
123
+ }
124
+
125
+ static int xnode_match_node_type(GumboNode *node, const char *type) {
126
+ const char *name;
127
+ size_t length;
128
+
129
+ if ((name = xnode_get_tag_name(node, &length))) {
130
+ if (!strcmp(type, "*")) {
131
+ return 1;
132
+ }
133
+ else {
134
+ if (strlen(type) == length) {
135
+ return !strncasecmp(name, type, length);
136
+ }
137
+ }
138
+ }
139
+
140
+ return 0;
141
+ }
142
+
143
+ int xnode_match_node(GumboNode *node, XNodeSelector *node_selector) {
144
+ GumboNodeType type = node->type;
145
+ GumboVector *attributes;
146
+ uint32_t i, j;
147
+
148
+ if (type != GUMBO_NODE_ELEMENT && type != GUMBO_NODE_TEMPLATE) {
149
+ return 0;
150
+ }
151
+
152
+ if (!xnode_match_node_type(node, node_selector->type)) {
153
+ return 0;
154
+ }
155
+
156
+ attributes = &node->v.element.attributes;
157
+
158
+ for (i = 0; i < jsa_size(&node_selector->attributes); i++) {
159
+ XNodeAttributeSelector *selector;
160
+ selector = (XNodeAttributeSelector*) jsa_get(&node_selector->attributes, i);
161
+ for (j = 0; j < attributes->length; ++j) {
162
+ GumboAttribute* attribute = (GumboAttribute*) attributes->data[j];
163
+ if (xnode_match_attribute(attribute, selector)) break;
164
+ }
165
+ if (j == attributes->length) {
166
+ return 0;
167
+ }
168
+ }
169
+
170
+ return 1;
171
+ }
172
+
173
+ static void xnode_match_node_descendant(GumboNode* node, jsa_t *next_nodes,
174
+ XNodeSelector *selector) {
175
+ if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
176
+ uint32_t i;
177
+ for (i = 0; i < node->v.element.children.length; ++i) {
178
+ GumboNode* child = (GumboNode*) node->v.element.children.data[i];
179
+ if (xnode_match_node(child, selector)) {
180
+ jsa_push(next_nodes, child);
181
+ }
182
+ xnode_match_node_descendant(child, next_nodes, selector);
183
+ }
184
+ }
185
+ }
186
+
187
+ static void xnode_match_node_child(GumboNode* node, jsa_t *next_nodes,
188
+ XNodeSelector *selector) {
189
+ if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
190
+ uint32_t i;
191
+ for (i = 0; i < node->v.element.children.length; ++i) {
192
+ GumboNode* child = (GumboNode*) node->v.element.children.data[i];
193
+ if (xnode_match_node(child, selector)) {
194
+ jsa_push(next_nodes, child);
195
+ }
196
+ }
197
+ }
198
+ }
199
+
200
+ static void xnode_match_node_adjacent_sibling(GumboNode* node,
201
+ jsa_t *next_nodes, XNodeSelector *selector) {
202
+ if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
203
+ GumboNode *next = xnode_next(node, NULL);
204
+ if (next && xnode_match_node(next, selector)) {
205
+ jsa_push(next_nodes, next);
206
+ }
207
+ }
208
+ }
209
+
210
+ static void xnode_match_node_general_sibling(GumboNode* node, jsa_t *next_nodes,
211
+ XNodeSelector *selector) {
212
+
213
+ if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
214
+ while ((node = xnode_next(node, NULL))) {
215
+ if (xnode_match_node(node, selector)) {
216
+ jsa_push(next_nodes, node);
217
+ }
218
+ }
219
+ }
220
+ }
221
+
222
+ /*
223
+ #include <stdio.h>
224
+ static void xnode_dump(XNode *node) {
225
+ size_t name_size, html_size;
226
+ const char *name = xnode_type(node, &name_size);
227
+ const char *html = xnode_html(node, &html_size);
228
+ const char *id = xnode_attr(node, "id");
229
+ const char *Class = xnode_attr(node, "class");
230
+ printf("%s\t%s\t%s\t%.*s\n", name, id ? id : "", Class ? Class : "",
231
+ (int) html_size, html);
232
+ }
233
+
234
+ static void dump_query_result(const XNodeArray *result) {
235
+ if (result == NULL) {
236
+ printf("<null>\n");
237
+ return;
238
+ }
239
+ else {
240
+ uint32_t i;
241
+ printf("%u element(s):\n", result->size);
242
+ for (i = 0; i < result->size; i++) {
243
+ xnode_dump((XNode*) jsa_get(result, i));
244
+ }
245
+ }
246
+ }
247
+ */
248
+ #define SWAP(x, y) do { jsa_t *SWAP = x; x = y; y = SWAP; } while (0)
249
+
250
+ const XNodeArray *xnode_query_execute(XNodeQuery *query, const XNode *node) {
251
+ uint32_t i, j;
252
+ jsa_t *prev_nodes = &query->result[0];
253
+ jsa_t *next_nodes = &query->result[1];
254
+
255
+ prev_nodes->size = 0;
256
+ jsa_push(prev_nodes, node);
257
+
258
+ for (i = 0; i < query->selectors.size; i++) {
259
+ XNodeSelector *selector = xnode_query_get_selector(query, i);
260
+
261
+ next_nodes->size = 0;
262
+
263
+ for (j = 0; j < jsa_size(prev_nodes); j++) {
264
+ GumboNode *node = (GumboNode*) jsa_get(prev_nodes, j);
265
+ switch (selector->combinator) {
266
+ case XQSEL_DESCENDANT:
267
+ xnode_match_node_descendant(node, next_nodes, selector);
268
+ break;
269
+ case XQSEL_CHILD:
270
+ xnode_match_node_child(node, next_nodes, selector);
271
+ break;
272
+ case XQSEL_ADJACENT_SIBLING:
273
+ xnode_match_node_adjacent_sibling(node, next_nodes, selector);
274
+ break;
275
+ case XQSEL_GENERAL_SIBLING:
276
+ xnode_match_node_general_sibling(node, next_nodes, selector);
277
+ break;
278
+ default:
279
+ assert(0);
280
+ break;
281
+ }
282
+ }
283
+
284
+ jsa_dedup(next_nodes);
285
+ // dump_query_result(next_nodes);
286
+ SWAP(prev_nodes, next_nodes);
287
+ }
288
+
289
+ return jsa_size(prev_nodes) > 0 ? prev_nodes : NULL;
290
+ }
291
+
292
+ XNodeSelector *xnode_query_selector(XNodeQuery *query, uint32_t n) {
293
+ return (XNodeSelector*) jsa_get(&query->selectors, n);
294
+ }
295
+
296
/* Word-boundary test used by strtok_simple(): with a non-zero `sep` only that
 * character is a boundary; with sep == 0 any whitespace or punctuation is. */
static int strtok_simple_is_boundary(char c, char sep) {
  /* <ctype.h> functions require a value representable as unsigned char. */
  unsigned char uc = (unsigned char) c;

  if (sep) {
    return c == sep;
  }
  return isspace(uc) || ispunct(uc);
}

/**
 * Locate the first occurrence of `tok` in `str` as a whole word.
 *
 * An occurrence counts only when it starts at the beginning of `str` or
 * right after a boundary character, and ends at the end of `str` or right
 * before a boundary character. If `sep` is non-zero, it is the only boundary
 * character; if `sep` is 0, any whitespace or punctuation character is a
 * boundary.
 *
 * The boundary state is recomputed after every character, including after a
 * candidate that failed to match, so a token is never reported in the middle
 * of a longer word (e.g. "ab" is not found in "aab").
 *
 * Returns a pointer to the start of the match inside `str`, `str` itself
 * when `tok` is empty, or NULL when there is no match or an argument is NULL.
 */
char *strtok_simple(const char *str, const char *tok, char sep) {
  size_t tok_len;
  int at_boundary = 1;  /* the start of the string is a boundary */

  if (str == NULL || tok == NULL) {
    return NULL;
  }
  if (*tok == '\0') {
    return (char*) str;
  }

  tok_len = strlen(tok);

  for (; *str != '\0'; str++) {
    /* strncmp stops at the NUL of a too-short str, so str[tok_len] is only
     * read after tok_len characters are known to exist. */
    if (at_boundary && strncmp(str, tok, tok_len) == 0) {
      char after = str[tok_len];
      if (after == '\0' || strtok_simple_is_boundary(after, sep)) {
        return (char*) str;
      }
    }
    at_boundary = strtok_simple_is_boundary(*str, sep);
  }

  return NULL;
}
@@ -0,0 +1,186 @@
1
+ #ifndef XNODE_QUERY_H_
2
+ #define XNODE_QUERY_H_
3
+
4
+ #ifdef __cplusplus
5
+ extern "C" {
6
+ #endif
7
+
8
/*
 * Kinds of simple selectors and combinators understood by the query engine.
 * Section numbers refer to the CSS Selectors specification. The enumerator
 * order is significant: values are assigned implicitly starting at 0.
 */
typedef enum {
  /* 6.1 Type selector, e.g. `h1`. */
  XQSEL_TYPE,

  /* 6.2 Universal selector `*`; matches any element type and may be omitted
   * in some cases. */
  XQSEL_UNIVERSAL,

  /* 6.3.1 Attribute presence and value selectors. */

  /* E[foo]: the attribute exists. */
  XQSEL_ATTR_PRESENCE,

  /* E[foo="bar"]: the value is exactly "bar". */
  XQSEL_ATTR_EQUAL,

  /* E[foo~="bar"]: the value is a whitespace-separated list and one item is
   * exactly "bar". */
  XQSEL_ATTR_SPACE_MATCH,

  /* 6.3.2 Substring matching attribute selectors. */

  /* E[foo^="bar"]: the value begins with "bar". */
  XQSEL_ATTR_PREFIX_MATCH,

  /* E[foo$="bar"]: the value ends with "bar". */
  XQSEL_ATTR_SUFFIX_MATCH,

  /* E[foo*="bar"]: the value contains "bar". */
  XQSEL_ATTR_SUBSTRING_MATCH,

  /* E[foo|="en"]: the value is a hyphen-separated list beginning (from the
   * left) with "en". */
  XQSEL_ATTR_DASH_MATCH,

  /* 6.4 Class selector: `*.pastoral` or just `.pastoral`, i.e.
   * class~=pastoral. */
  XQSEL_CLASS,

  /* 6.5 ID selector, e.g. `h1#chapter1`, `#chapter1`. */
  XQSEL_ID,

  /* 6.6 Pseudo-classes (NOT SUPPORTED): a colon followed by a name and an
   * optional parenthesised value, e.g. `a:link`, `tr:nth-child(2n+1)`,
   * `:last-child`, `:first-of-type`. */
  XQSEL_PSEUDO_CLASS,

  /* 7. Pseudo-elements (NOT SUPPORTED), e.g. `::first-line`,
   * `::first-letter`, `::before`, `::after`. */
  XQSEL_PSEUDO_ELEMENT,

  /* 8. Combinators. */

  /* 8.1 Descendant combinator, e.g. `h1 em`, `div * p`. */
  XQSEL_DESCENDANT,

  /* 8.2 Child combinator, e.g. `body > p`. */
  XQSEL_CHILD,

  /* 8.3.1 Adjacent sibling combinator, e.g. `math + p` (a p element
   * immediately following a math element). */
  XQSEL_ADJACENT_SIBLING,

  /* 8.3.2 General sibling combinator, e.g. `h1 ~ pre`: both elements share
   * the same parent and the first precedes the second, not necessarily
   * immediately. */
  XQSEL_GENERAL_SIBLING,

  XQSEL_UNKNOWN_TYPE
} XNodeQueryType;
87
+
88
+ #include "gumbo.h"
89
+ #include "jsa.h"
90
+
91
+ typedef struct {
92
+ // One of the XQSEL_ATTR_* types
93
+ XNodeQueryType type;
94
+
95
+ // Attribute name
96
+ const char *name;
97
+
98
+ // Attribute value, if applicable
99
+ const char *value;
100
+ } XNodeAttributeSelector;
101
+
102
+ // 4. Selector syntax
103
+ //
104
+ // A selector is a chain of one or more sequences of simple selectors separated
105
+ // by combinators. One pseudo-element may be appended to the last sequence of
106
+ // simple selectors in a selector.
107
+ //
108
+ // A sequence of simple selectors is a chain of simple selectors that are not
109
+ // separated by a combinator. It always begins with a type selector or a
110
+ // universal selector. No other type selector or universal selector is allowed
111
+ // in the sequence.
112
+ //
113
+ // A simple selector is either a type selector, universal selector, attribute
114
+ // selector, class selector, ID selector, or pseudo-class.
115
+ //
116
+ typedef struct {
117
+ // Tag name for type selector, "*" for universal selector
118
+ const char *type;
119
+
120
+ // A list of attribute selectors; class selectors are translated as
121
+ // a list of {"class", "name", XQSEL_ATTR_SPACE_MATCH} selectors.
122
+ jsa_t attributes;
123
+
124
+ // Relation of this selector and the previous one, if any.
125
+ XNodeQueryType combinator;
126
+ } XNodeSelector;
127
+
128
+ typedef struct {
129
+ jsa_t selectors;
130
+ char *data;
131
+ jsa_t result[2];
132
+ } XNodeQuery;
133
+
134
+ typedef GumboNode XNode;
135
+
136
+ typedef jsa_t XNodeArray;
137
+ #define xnode_array_first(array) ((XNode*)jsa_get((array),0))
138
+ #define xnode_array_get(array,n) ((XNode*)jsa_get((array),(n)))
139
+ #define xnode_array_size(array) jsa_size(array)
140
+
141
+ typedef struct {
142
+ GumboOutput *output;
143
+ GumboOptions parse_options;
144
+ char *html;
145
+ size_t html_length;
146
+ } XNodeDocument;
147
+
148
+ XNodeQuery *xnode_query_create(const char *, const char **, int *);
149
+ const XNodeArray *xnode_query_execute(XNodeQuery *, const XNode*);
150
+ void xnode_query_free(XNodeQuery *);
151
+ XNodeSelector *xnode_query_selector(XNodeQuery*, uint32_t);
152
+
153
// Fetch selector number `i` of query `q` (a pointer to XNodeQuery) as an
// XNodeSelector*. Both arguments are parenthesized so that expression
// arguments such as `queries + 1` or `i + 1` expand correctly.
#define xnode_query_get_selector(q,i) ((XNodeSelector*)jsa_get(&(q)->selectors,(i)))
154
+
155
+ XNodeDocument *xnode_parse_html(const char *html, size_t length);
156
+ const XNode *xnode_document_root(const XNodeDocument*);
157
+ void xnode_document_free(XNodeDocument*);
158
+
159
+ const char *xnode_attr(const XNode*, const char *name);
160
+
161
+ XNode *xnode_first_child(XNode *node, XNodeSelector *selector);
162
+ XNode *xnode_parent(XNode *node);
163
+ XNode *xnode_prev(XNode *node, XNodeSelector *selector);
164
+ XNode *xnode_prev_check(XNode *node, XNodeSelector *selector);
165
+ XNode *xnode_next(XNode *node, XNodeSelector *selector);
166
+ XNode *xnode_next_check(XNode *node, XNodeSelector *selector);
167
+ const char *xnode_html(XNode *node, size_t *length);
168
+ const char *xnode_text(XNode *xnode);
169
+ const char *xnode_type(XNode *node, size_t *length);
170
+ int xnode_has_class(XNode *xnode, const char *class_name);
171
+ int xnode_has_attr(XNode *xnode, const char *name);
172
+
173
+ int xnode_match_node(GumboNode *node, XNodeSelector *selector);
174
+ GumboVector *xnode_children(XNode *node);
175
+
176
+ // utils
177
+ char *strtok_simple(const char *str, const char *tok, char sep);
178
+ #define xmalloc malloc
179
+ #define xnew(type) ((type*)xmalloc(sizeof (type)))
180
+ #define xfree free
181
+
182
+ #ifdef __cplusplus
183
+ }
184
+ #endif
185
+
186
+ #endif // XNODE_QUERY_H_