gumbo-html 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -10
- package/binding.gyp +49 -0
- package/examples/example.js +87 -0
- package/examples/scrape.js +301 -0
- package/index.d.ts +58 -3
- package/index.js +7 -2
- package/lib/wrapper.js +385 -0
- package/package.json +36 -5
- package/src/addon.cc +19 -0
- package/src/gumbo-parser/COPYING +201 -0
- package/src/gumbo-parser/README.md +8 -0
- package/src/gumbo-parser/src/attribute.c +44 -0
- package/src/gumbo-parser/src/attribute.h +37 -0
- package/src/gumbo-parser/src/char_ref.c +23069 -0
- package/src/gumbo-parser/src/char_ref.h +60 -0
- package/src/gumbo-parser/src/error.c +279 -0
- package/src/gumbo-parser/src/error.h +225 -0
- package/src/gumbo-parser/src/gumbo.h +671 -0
- package/src/gumbo-parser/src/insertion_mode.h +57 -0
- package/src/gumbo-parser/src/parser.c +4192 -0
- package/src/gumbo-parser/src/parser.h +57 -0
- package/src/gumbo-parser/src/string_buffer.c +110 -0
- package/src/gumbo-parser/src/string_buffer.h +84 -0
- package/src/gumbo-parser/src/string_piece.c +48 -0
- package/src/gumbo-parser/src/string_piece.h +38 -0
- package/src/gumbo-parser/src/tag.c +95 -0
- package/src/gumbo-parser/src/tag_enum.h +153 -0
- package/src/gumbo-parser/src/tag_gperf.h +105 -0
- package/src/gumbo-parser/src/tag_sizes.h +4 -0
- package/src/gumbo-parser/src/tag_strings.h +153 -0
- package/src/gumbo-parser/src/token_type.h +41 -0
- package/src/gumbo-parser/src/tokenizer.c +2897 -0
- package/src/gumbo-parser/src/tokenizer.h +123 -0
- package/src/gumbo-parser/src/tokenizer_states.h +103 -0
- package/src/gumbo-parser/src/utf8.c +270 -0
- package/src/gumbo-parser/src/utf8.h +132 -0
- package/src/gumbo-parser/src/util.c +58 -0
- package/src/gumbo-parser/src/util.h +60 -0
- package/src/gumbo-parser/src/vector.c +123 -0
- package/src/gumbo-parser/src/vector.h +67 -0
- package/src/html_document.cc +411 -0
- package/src/html_document.h +56 -0
- package/src/html_element.cc +963 -0
- package/src/html_element.h +70 -0
- package/src/include/win/strings.h +11 -0
- package/src/jsa.c +182 -0
- package/src/jsa.h +44 -0
- package/src/xnode.c +372 -0
- package/src/xnode_query.c +330 -0
- package/src/xnode_query.h +186 -0
- package/src/xnode_query_parser.c +414 -0
- package/install.js +0 -15
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
#ifndef HTML_ELEMENT_H_
|
|
2
|
+
#define HTML_ELEMENT_H_
|
|
3
|
+
|
|
4
|
+
#include <napi.h>
|
|
5
|
+
|
|
6
|
+
#include <string>
|
|
7
|
+
|
|
8
|
+
#include "xnode_query.h"
|
|
9
|
+
|
|
10
|
+
namespace html {
|
|
11
|
+
|
|
12
|
+
class Document;
|
|
13
|
+
struct XDocWrapper;
|
|
14
|
+
|
|
15
|
+
// JavaScript-facing wrapper (Napi::ObjectWrap) that pairs one XNode pointer
// with the XDocWrapper it was created for. Every instance method has the
// usual node-addon-api callback shape: it receives the call's CallbackInfo
// and returns a Napi::Value. The definitions are not part of this header.
class Element : public Napi::ObjectWrap<Element> {
 public:
  // Class registration and the shared constructor reference.
  static void Init(Napi::Env env);
  static Napi::FunctionReference constructor;

  // Native-side factories (no CallbackInfo involved).
  static Napi::Value Query(Napi::Env env, XDocWrapper *wrapper,
                           const XNode *node, const char *selector);
  static Napi::Value Create(Napi::Env env, XDocWrapper *xdoc, XNode *xnode);
  Element(const Napi::CallbackInfo& info);
  ~Element();

  // Attribute access, selector lookups and sibling navigation.
  Napi::Value Attr(const Napi::CallbackInfo& info);
  Napi::Value AttrSafe(const Napi::CallbackInfo& info);
  Napi::Value Find(const Napi::CallbackInfo& info);
  Napi::Value First(const Napi::CallbackInfo& info);
  Napi::Value FirstSafe(const Napi::CallbackInfo& info);
  Napi::Value Only(const Napi::CallbackInfo& info);
  Napi::Value OnlySafe(const Napi::CallbackInfo& info);
  Napi::Value HasClass(const Napi::CallbackInfo& info);
  Napi::Value HasAttribute(const Napi::CallbackInfo& info);
  Napi::Value Next(const Napi::CallbackInfo& info);
  Napi::Value Prev(const Napi::CallbackInfo& info);

  // Getter-style members.
  Napi::Value GetTagName(const Napi::CallbackInfo& info);
  Napi::Value GetInnerText(const Napi::CallbackInfo& info);
  Napi::Value GetOuterHTML(const Napi::CallbackInfo& info);
  Napi::Value GetParent(const Napi::CallbackInfo& info);
  Napi::Value GetNodeType(const Napi::CallbackInfo& info);
  Napi::Value GetChildNodes(const Napi::CallbackInfo& info);

  // Further query and traversal helpers.
  Napi::Value FirstOrThrow(const Napi::CallbackInfo& info);
  Napi::Value OnlyOrThrow(const Napi::CallbackInfo& info);
  Napi::Value AttrOrThrow(const Napi::CallbackInfo& info);
  Napi::Value Text(const Napi::CallbackInfo& info);
  Napi::Value TextOrThrow(const Napi::CallbackInfo& info);
  Napi::Value Exists(const Napi::CallbackInfo& info);
  Napi::Value Count(const Napi::CallbackInfo& info);
  Napi::Value Closest(const Napi::CallbackInfo& info);
  Napi::Value Children(const Napi::CallbackInfo& info);
  Napi::Value Siblings(const Napi::CallbackInfo& info);
  Napi::Value Matches(const Napi::CallbackInfo& info);
  Napi::Value Is(const Napi::CallbackInfo& info);
  Napi::Value Rows(const Napi::CallbackInfo& info);
  Napi::Value GetTextContent(const Napi::CallbackInfo& info);

  // Wrapped node and the document wrapper it belongs to. Both default to
  // nullptr so an instance never carries indeterminate pointers before they
  // are assigned. NOTE(review): ownership/lifetime is decided in the .cc
  // file, which is not visible from this header.
  XNode *xnode_ = nullptr;
  XDocWrapper *xdoc_ = nullptr;
};
|
|
63
|
+
|
|
64
|
+
std::string get_inner_text(GumboNode *node);
|
|
65
|
+
std::string normalize_whitespace(const std::string &s);
|
|
66
|
+
std::string join_text_with_separator(XNode *node, const std::string &sep);
|
|
67
|
+
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
#endif // HTML_ELEMENT_H_
|
package/src/jsa.c
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/*
|
|
2
|
+
* Copyright (C) Weidong Fang
|
|
3
|
+
*
|
|
4
|
+
* Email: wdfang@gmail.com
|
|
5
|
+
*
|
|
6
|
+
* Last updated: 2014/06/28
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
#include "jsa.h"
|
|
10
|
+
|
|
11
|
+
#include <assert.h>
|
|
12
|
+
#include <stdlib.h>
|
|
13
|
+
#include <string.h>
|
|
14
|
+
|
|
15
|
+
#define xmalloc malloc
|
|
16
|
+
#define xrealloc realloc
|
|
17
|
+
#define xfree free
|
|
18
|
+
|
|
19
|
+
#define JSA_MAX_SIZE 1048576
|
|
20
|
+
#define JSA_OK 0
|
|
21
|
+
#define JSA_ERROR -1
|
|
22
|
+
|
|
23
|
+
jsa_t *jsa_create() {
|
|
24
|
+
jsa_t *a = (jsa_t*)xmalloc(sizeof(jsa_t));
|
|
25
|
+
jsa_init(a);
|
|
26
|
+
return a;
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
/*
 * Puts an array structure into the empty state: no storage, zero capacity,
 * zero elements. Any previous contents of the fields are overwritten
 * without being released.
 */
void jsa_init(jsa_t *a) {
    a->item = NULL;
    a->alloc_size = 0;
    a->size = 0;
}
|
|
34
|
+
|
|
35
|
+
/*
 * Ensures the array has capacity for at least `size` elements.
 *
 * Capacity grows in multiples of 32 elements and never exceeds
 * JSA_MAX_SIZE. The element count (a->size) is not changed.
 *
 * Returns JSA_OK when the capacity is sufficient (already or after
 * growing), JSA_ERROR when the request exceeds JSA_MAX_SIZE or the
 * reallocation fails; on error the array is left untouched.
 */
int jsa_alloc(jsa_t * a, uint32_t size) {
    uint32_t rounded;
    void *item;

    if (size <= a->alloc_size) {
        return JSA_OK;
    }

    /* Reject oversized requests before rounding: for sizes close to
     * UINT32_MAX, `size + 31` would wrap around to a tiny value and slip
     * past the limit check. */
    if (size > JSA_MAX_SIZE) {
        return JSA_ERROR;
    }

    rounded = ((size + 31) / 32) * 32;
    if (rounded > JSA_MAX_SIZE) {
        return JSA_ERROR;
    }

    item = xrealloc(a->item, rounded * sizeof(JSA_TYPE));
    if (item == NULL) {
        return JSA_ERROR;
    }

    a->item = (JSA_TYPE*) item;
    a->alloc_size = rounded;

    return JSA_OK;
}
|
|
55
|
+
|
|
56
|
+
/*
 * Releases the element storage and resets the array to the empty state.
 * The jsa_t structure itself is not freed. Nothing happens when the array
 * has no capacity.
 */
void jsa_clean(jsa_t *a) {
    if (a->alloc_size == 0) {
        return;
    }

    xfree(a->item);
    a->item = NULL;
    a->size = 0;
    a->alloc_size = 0;
}
|
|
64
|
+
|
|
65
|
+
/*
 * Releases the element storage and then the array structure itself.
 * Passing NULL is a no-op, mirroring free().
 */
void jsa_free(jsa_t * a) {
    if (a == NULL) {
        return;
    }

    jsa_clean(a);
    xfree(a);
}
|
|
69
|
+
|
|
70
|
+
/*
 * Like jsa_free(), but first hands every stored element to `free_item`.
 *
 * a         - array to destroy; NULL is a no-op.
 * free_item - per-element release callback; when NULL the elements are
 *             dropped without being visited.
 */
void jsa_free_ex(jsa_t * a, void (*free_item)(JSA_TYPE)) {
    if (a == NULL) {
        return;
    }

    if (free_item) {
        uint32_t i;
        for (i = 0; i < a->size; i++) {
            (*free_item)(a->item[i]);
        }
    }

    jsa_clean(a);
    xfree(a);
}
|
|
80
|
+
|
|
81
|
+
/*
 * Appends `m` to the end of the array, growing the storage when full.
 *
 * If the storage cannot be grown the element is NOT stored: debug builds
 * stop on the assertion, and builds with NDEBUG return without touching
 * the array instead of writing past the end of the buffer.
 */
void jsa_append(jsa_t *a, JSA_TYPE m) {
    if (a->size >= a->alloc_size) {
        if (jsa_alloc(a, a->size + 1) != JSA_OK) {
            assert(0 && "jsa_append: unable to grow array");
            return;
        }
    }

    a->item[a->size++] = m;
}
|
|
88
|
+
|
|
89
|
+
/*
 * Removes and returns the last element.
 *
 * Popping an empty array is a caller error: debug builds stop on the
 * assertion. With NDEBUG the call returns 0 (the same "empty" value the
 * jsa_first/jsa_last macros use) instead of underflowing the size and
 * reading outside the buffer.
 */
JSA_TYPE jsa_pop(jsa_t *a) {
    assert(a->size > 0);

    if (a->size == 0) {
        return (JSA_TYPE)0;
    }

    return a->item[--a->size];
}
|
|
93
|
+
|
|
94
|
+
/*
 * Sets the element count to `size`.
 *
 * Growing zero-fills every newly exposed slot, whether or not the storage
 * had to be reallocated (capacity is rounded up by jsa_alloc, so spare
 * slots may hold stale or uninitialised data). Shrinking zero-fills the
 * slots that are dropped; capacity is never reduced.
 *
 * Returns JSA_OK on success, JSA_ERROR if the storage could not be grown
 * (the array is left unchanged in that case).
 */
int jsa_resize(jsa_t *a, uint32_t size) {
    if (size > a->alloc_size && jsa_alloc(a, size) != JSA_OK) {
        return JSA_ERROR;
    }

    if (size > a->size) {
        memset(a->item + a->size, 0, (size - a->size) * sizeof(a->item[0]));
    }
    else if (size < a->size) {
        memset(a->item + size, 0, (a->size - size) * sizeof(a->item[0]));
    }

    a->size = size;

    return JSA_OK;
}
|
|
109
|
+
|
|
110
|
+
/*
 * Stores `m` at position `index`.
 *
 * - index >= size: the array is resized to index + 1 and `m` is written
 *   into the last slot.
 * - index <  size: elements from `index` onwards move up by one slot and
 *   `m` takes the freed position.
 *
 * If the required storage cannot be obtained the array is left unchanged
 * and `m` is not stored.
 */
void jsa_insert(jsa_t *a, uint32_t index, JSA_TYPE m) {
    if (index >= a->size) {
        /* index + 1 must stay within the size limit (and must not wrap). */
        if (index >= JSA_MAX_SIZE || jsa_resize(a, index + 1) != JSA_OK) {
            return;
        }
        a->item[index] = m;
        return;
    }

    /* Shifting needs one spare slot; a full array must grow first,
     * otherwise memmove would write past the end of the buffer. */
    if (a->size >= a->alloc_size && jsa_alloc(a, a->size + 1) != JSA_OK) {
        return;
    }

    memmove(&a->item[index + 1], &a->item[index],
            (a->size - index) * sizeof(a->item[0]));
    a->item[index] = m;
    a->size++;
}
|
|
121
|
+
|
|
122
|
+
/*
 * Removes duplicate elements in place, keeping the first occurrence of
 * each value and preserving relative order. Elements are compared with ==.
 *
 * Survivors are compacted towards the front with a write cursor, so no
 * slot beyond the current size is ever read. When anything was removed the
 * array is resized to the number of survivors.
 */
void jsa_dedup(jsa_t *a) {
    uint32_t read, scan, kept = 0;

    for (read = 0; read < a->size; read++) {
        JSA_TYPE value = a->item[read];

        /* Has this value already been kept? */
        for (scan = 0; scan < kept; scan++) {
            if (a->item[scan] == value) {
                break;
            }
        }

        if (scan == kept) {
            a->item[kept++] = value;
        }
    }

    if (kept != a->size) {
        jsa_resize(a, kept);
    }
}
|
|
142
|
+
|
|
143
|
+
/*
 * Empties the array by resizing it to zero elements and hands the same
 * pointer back to the caller. The result of the resize is not inspected.
 */
jsa_t *jsa_clear(jsa_t *a) {
    (void) jsa_resize(a, 0);

    return a;
}
|
|
147
|
+
|
|
148
|
+
#if TEST_DEDUP
|
|
149
|
+
|
|
150
|
+
#include <stdio.h>
|
|
151
|
+
|
|
152
|
+
int main() {
|
|
153
|
+
jsa_t *a = jsa_create();
|
|
154
|
+
int i, n;
|
|
155
|
+
|
|
156
|
+
printf("Enter array elements (-1 to stop):\n");
|
|
157
|
+
while (1) {
|
|
158
|
+
scanf("%d", &n);
|
|
159
|
+
if (n == -1) break;
|
|
160
|
+
jsa_push(a, (void*)(uintptr_t)n);
|
|
161
|
+
}
|
|
162
|
+
|
|
163
|
+
for (i = 0; i < (int)jsa_size(a); i++) {
|
|
164
|
+
printf("%lu ", (unsigned long)jsa_get(a, i));
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
printf("\n");
|
|
168
|
+
|
|
169
|
+
jsa_dedup(a);
|
|
170
|
+
|
|
171
|
+
for (i = 0; i < (int)jsa_size(a); i++) {
|
|
172
|
+
printf("%lu ", (unsigned long)jsa_get(a, i));
|
|
173
|
+
}
|
|
174
|
+
|
|
175
|
+
printf("\n");
|
|
176
|
+
|
|
177
|
+
jsa_free(a);
|
|
178
|
+
|
|
179
|
+
return 0;
|
|
180
|
+
}
|
|
181
|
+
#endif
|
|
182
|
+
|
package/src/jsa.h
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#ifndef _JSA_H_
|
|
2
|
+
#define _JSA_H_
|
|
3
|
+
|
|
4
|
+
#ifdef __cplusplus
|
|
5
|
+
extern "C" {
|
|
6
|
+
#endif
|
|
7
|
+
|
|
8
|
+
#include <stdint.h>
|
|
9
|
+
|
|
10
|
+
#ifndef JSA_TYPE
|
|
11
|
+
#define JSA_TYPE uintptr_t
|
|
12
|
+
#endif
|
|
13
|
+
|
|
14
|
+
typedef struct {
|
|
15
|
+
JSA_TYPE *item;
|
|
16
|
+
uint32_t size;
|
|
17
|
+
uint32_t alloc_size;
|
|
18
|
+
} jsa_t;
|
|
19
|
+
|
|
20
|
+
jsa_t *jsa_create();
|
|
21
|
+
void jsa_init(jsa_t *);
|
|
22
|
+
void jsa_clean(jsa_t *);
|
|
23
|
+
void jsa_free(jsa_t *);
|
|
24
|
+
void jsa_free_ex(jsa_t * a, void (*free_item)(JSA_TYPE));
|
|
25
|
+
void jsa_append(jsa_t *, JSA_TYPE m);
|
|
26
|
+
JSA_TYPE jsa_pop(jsa_t *) ;
|
|
27
|
+
void jsa_insert(jsa_t *, uint32_t, JSA_TYPE);
|
|
28
|
+
int jsa_alloc(jsa_t * a, uint32_t size);
|
|
29
|
+
int jsa_resize(jsa_t * a, uint32_t size);
|
|
30
|
+
void jsa_dedup(jsa_t * a);
|
|
31
|
+
|
|
32
|
+
#define jsa_get(a,i) ((a)->item[i])
|
|
33
|
+
#define jsa_set(a, i, v) ((a)->item[i]=v)
|
|
34
|
+
#define jsa_push(a,p) jsa_append(a,(JSA_TYPE)p)
|
|
35
|
+
#define jsa_first(a) ((a)->size ? (a)->item[0] : 0)
|
|
36
|
+
#define jsa_last(a) ((a)->size ? (a)->item[(a)->size - 1] : 0)
|
|
37
|
+
#define jsa_size(a) ((a)->size)
|
|
38
|
+
|
|
39
|
+
#ifdef __cplusplus
|
|
40
|
+
}
|
|
41
|
+
#endif
|
|
42
|
+
|
|
43
|
+
#endif /* _JSA_H_ */
|
|
44
|
+
|
package/src/xnode.c
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
#include "xnode_query.h"
|
|
2
|
+
|
|
3
|
+
#include <ctype.h>
|
|
4
|
+
#include <string.h>
|
|
5
|
+
#include <stdlib.h>
|
|
6
|
+
|
|
7
|
+
/*
 * Returns the child vector of a node, or NULL when `node` is NULL or is of
 * a type other than document, element or template.
 */
GumboVector *xnode_children(XNode *node) {
    if (node == NULL) {
        return NULL;
    }

    if (node->type == GUMBO_NODE_DOCUMENT) {
        return &node->v.document.children;
    }

    if (node->type == GUMBO_NODE_ELEMENT ||
        node->type == GUMBO_NODE_TEMPLATE) {
        return &node->v.element.children;
    }

    return NULL;
}
|
|
22
|
+
|
|
23
|
+
/*
 * Returns 1 when `c` is HTML "ASCII whitespace" (TAB, LF, FF, CR or
 * SPACE), 0 otherwise.
 *
 * The set is spelled out instead of using isspace(): isspace() depends on
 * the current locale and also accepts vertical tab, which HTML does not
 * treat as a separator in space-separated token lists.
 */
static int is_html_space(char c) {
    switch (c) {
    case ' ':
    case '\t':
    case '\n':
    case '\f':
    case '\r':
        return 1;
    default:
        return 0;
    }
}
|
|
26
|
+
|
|
27
|
+
/*
 * Tests whether `tok` occurs as a complete entry of the
 * whitespace-separated list `str`.
 *
 * Returns 1 on a match, 0 otherwise (including when either argument is
 * NULL or `tok` is empty).
 */
static int string_in_space_list(const char *str, const char *tok) {
    const char *cursor;
    size_t want;

    if (str == NULL || tok == NULL || tok[0] == '\0') {
        return 0;
    }

    want = strlen(tok);

    for (cursor = str; *cursor != '\0';) {
        /* Move to the start of the next entry. */
        while (*cursor != '\0' && is_html_space(*cursor)) {
            ++cursor;
        }

        /* A match must be followed by a separator or the end of input. */
        if (strncmp(cursor, tok, want) == 0) {
            char after = cursor[want];
            if (after == '\0' || is_html_space(after)) {
                return 1;
            }
        }

        /* Skip the remainder of this entry. */
        while (*cursor != '\0' && !is_html_space(*cursor)) {
            ++cursor;
        }
    }

    return 0;
}
|
|
52
|
+
|
|
53
|
+
static XNodeDocument *xnode_parse_html_real(const char *html, size_t length,
|
|
54
|
+
int copy_html) {
|
|
55
|
+
XNodeDocument *doc = xnew(XNodeDocument);
|
|
56
|
+
|
|
57
|
+
if (copy_html) {
|
|
58
|
+
doc->html = (char*) xmalloc(length + 1);
|
|
59
|
+
memcpy(doc->html, html, length);
|
|
60
|
+
doc->html[length] = '\0';
|
|
61
|
+
} else {
|
|
62
|
+
doc->html = (char*) html;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
doc->html_length = length;
|
|
66
|
+
doc->parse_options = kGumboDefaultOptions;
|
|
67
|
+
doc->output = gumbo_parse_with_options(&doc->parse_options, doc->html, length);
|
|
68
|
+
|
|
69
|
+
return doc;
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
/*
 * Public parsing entry point: parses `length` bytes of `html`, always
 * asking the internal parser to work on its own copy of the input.
 */
XNodeDocument *xnode_parse_html(const char *html, size_t length) {
    const int copy_input = 1;

    return xnode_parse_html_real(html, length, copy_input);
}
|
|
75
|
+
|
|
76
|
+
const XNode *xnode_document_root(const XNodeDocument *document) {
|
|
77
|
+
return document->output->root;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
/*
 * Destroys a document: the Gumbo parse output, the stored HTML buffer and
 * the document structure itself. Passing NULL is a no-op.
 *
 * NOTE(review): doc->html is released unconditionally, i.e. the document
 * is assumed to own that buffer.
 */
void xnode_document_free(XNodeDocument *doc) {
    if (!doc) {
        return;
    }

    if (doc->output) {
        gumbo_destroy_output(&doc->parse_options, doc->output);
    }
    xfree(doc->html);
    xfree(doc);
}
|
|
85
|
+
|
|
86
|
+
const char *xnode_attr(const XNode *node, const char *name) {
|
|
87
|
+
GumboAttribute *attr;
|
|
88
|
+
if (node->type == GUMBO_NODE_ELEMENT || node->type == GUMBO_NODE_TEMPLATE) {
|
|
89
|
+
if ((attr = gumbo_get_attribute(&node->v.element.attributes, name))) {
|
|
90
|
+
return attr->value;
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
return NULL;
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
int xnode_match_node(GumboNode *node, XNodeSelector *selector);
|
|
97
|
+
|
|
98
|
+
/*
 * Returns the first element/template child of `node` that satisfies
 * `selector`; a NULL selector accepts any element/template child.
 * Returns NULL when the node has no children or nothing qualifies.
 */
XNode *xnode_first_child(XNode *node, XNodeSelector *selector) {
    GumboVector *kids = xnode_children(node);
    uint32_t idx;

    if (kids == NULL) {
        return NULL;
    }

    for (idx = 0; idx < kids->length; idx++) {
        GumboNode *candidate = (GumboNode*) kids->data[idx];
        int is_element = candidate->type == GUMBO_NODE_ELEMENT ||
                         candidate->type == GUMBO_NODE_TEMPLATE;

        if (is_element &&
            (selector == NULL || xnode_match_node(candidate, selector))) {
            return candidate;
        }
    }

    return NULL;
}
|
|
117
|
+
|
|
118
|
+
/*
 * Takes the very first element/template child of `node` and returns it
 * only if that particular child matches `selector`; otherwise NULL.
 * Later children are not considered.
 */
XNode *xnode_first_child_check(XNode *node, XNodeSelector *selector) {
    XNode *first = xnode_first_child(node, NULL);

    if (first == NULL) {
        return NULL;
    }

    return xnode_match_node(first, selector) ? first : NULL;
}
|
|
122
|
+
|
|
123
|
+
/* Returns the node's parent pointer as stored on the node. */
XNode *xnode_parent(XNode *node) {
    XNode *owner = node->parent;

    return owner;
}
|
|
126
|
+
|
|
127
|
+
/*
 * Walks backwards through the siblings preceding `node` and returns the
 * nearest element/template sibling that satisfies `selector` (any such
 * sibling when `selector` is NULL). Returns NULL when the parent exposes
 * no child list or no preceding sibling qualifies.
 */
XNode *xnode_prev(XNode *node, XNodeSelector *selector) {
    GumboVector *siblings = xnode_children(node->parent);
    size_t pos;

    if (siblings == NULL) {
        return NULL;
    }

    pos = node->index_within_parent;
    while (pos > 0) {
        GumboNode *candidate = (GumboNode*) siblings->data[--pos];

        if (candidate->type != GUMBO_NODE_ELEMENT &&
            candidate->type != GUMBO_NODE_TEMPLATE) {
            continue;
        }
        if (selector == NULL || xnode_match_node(candidate, selector)) {
            return candidate;
        }
    }

    return NULL;
}
|
|
147
|
+
|
|
148
|
+
/*
 * Takes the closest preceding element/template sibling of `node` and
 * returns it only if that sibling matches `selector`; otherwise NULL.
 */
XNode *xnode_prev_check(XNode *node, XNodeSelector *selector) {
    XNode *before = xnode_prev(node, NULL);

    if (before == NULL) {
        return NULL;
    }

    return xnode_match_node(before, selector) ? before : NULL;
}
|
|
152
|
+
|
|
153
|
+
/*
 * Walks forwards through the siblings following `node` and returns the
 * nearest element/template sibling that satisfies `selector` (any such
 * sibling when `selector` is NULL). Returns NULL when the parent exposes
 * no child list or no following sibling qualifies.
 */
XNode *xnode_next(XNode *node, XNodeSelector *selector) {
    GumboVector *siblings = xnode_children(node->parent);
    size_t pos;

    if (siblings == NULL) {
        return NULL;
    }

    for (pos = node->index_within_parent + 1; pos < siblings->length; pos++) {
        GumboNode *candidate = (GumboNode*) siblings->data[pos];

        if (candidate->type != GUMBO_NODE_ELEMENT &&
            candidate->type != GUMBO_NODE_TEMPLATE) {
            continue;
        }
        if (selector == NULL || xnode_match_node(candidate, selector)) {
            return candidate;
        }
    }

    return NULL;
}
|
|
173
|
+
|
|
174
|
+
/*
 * Takes the closest following element/template sibling of `node` and
 * returns it only if that sibling matches `selector`; otherwise NULL.
 */
XNode *xnode_next_check(XNode *node, XNodeSelector *selector) {
    XNode *after = xnode_next(node, NULL);

    if (after == NULL) {
        return NULL;
    }

    return xnode_match_node(after, selector) ? after : NULL;
}
|
|
178
|
+
|
|
179
|
+
static const char *max_ptr(const char *a, const char *b) {
|
|
180
|
+
if (!a) {
|
|
181
|
+
return b;
|
|
182
|
+
}
|
|
183
|
+
if (!b) {
|
|
184
|
+
return a;
|
|
185
|
+
}
|
|
186
|
+
return b > a ? b : a;
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
static const char *min_ptr(const char *a, const char *b) {
|
|
190
|
+
if (!a) {
|
|
191
|
+
return b;
|
|
192
|
+
}
|
|
193
|
+
if (!b) {
|
|
194
|
+
return a;
|
|
195
|
+
}
|
|
196
|
+
return b < a ? b : a;
|
|
197
|
+
}
|
|
198
|
+
|
|
199
|
+
/*
 * Returns the sibling stored immediately after `node` in its parent's
 * child list, whatever its node type. Returns NULL for a NULL or
 * parentless node, when the parent exposes no child list, or when `node`
 * is the last child.
 */
static XNode *xnode_next_any(XNode *node) {
    GumboVector *siblings;
    size_t following;

    if (node == NULL || node->parent == NULL) {
        return NULL;
    }

    siblings = xnode_children(node->parent);
    if (siblings == NULL) {
        return NULL;
    }

    following = node->index_within_parent + 1;

    return following < siblings->length ? (XNode*) siblings->data[following]
                                        : NULL;
}
|
|
213
|
+
|
|
214
|
+
/*
 * Returns a pointer to where the node's original source text begins:
 * the start tag for element/template nodes, the original text for the
 * remaining (text-like) node types.
 *
 * Returns NULL for a NULL node and for document nodes. Document nodes keep
 * their payload in v.document, so reading v.text for them would interpret
 * unrelated bytes as a pointer.
 */
static const char *xnode_source_start(XNode *node) {
    if (!node) {
        return NULL;
    }

    switch (node->type) {
    case GUMBO_NODE_DOCUMENT:
        return NULL;
    case GUMBO_NODE_ELEMENT:
    case GUMBO_NODE_TEMPLATE:
        return node->v.element.original_tag.data;
    default:
        return node->v.text.original_text.data;
    }
}
|
|
227
|
+
|
|
228
|
+
/*
 * Computes an upper bound for the source extent of an element: the
 * smallest of (a) the source start of the next sibling and (b) the end-tag
 * position of every element/template ancestor whose end tag lies after
 * this element's start tag. Returns NULL when none of these exists.
 */
static const char *xnode_implicit_end_boundary(XNode *node) {
    const char *tag_start = node->v.element.original_tag.data;
    const char *limit = xnode_source_start(xnode_next_any(node));
    XNode *ancestor;

    for (ancestor = node->parent; ancestor != NULL; ancestor = ancestor->parent) {
        const char *close_tag;

        if (ancestor->type != GUMBO_NODE_ELEMENT &&
            ancestor->type != GUMBO_NODE_TEMPLATE) {
            continue;
        }

        close_tag = ancestor->v.element.original_end_tag.data;
        if (close_tag != NULL && close_tag > tag_start) {
            limit = min_ptr(limit, close_tag);
        }
    }

    return limit;
}
|
|
245
|
+
|
|
246
|
+
/*
 * Returns the furthest source position reached by `node` or any of its
 * descendants, or NULL when no position is known.
 *
 * - document: the maximum over all children;
 * - element/template: the maximum of the end of the start tag, every
 *   child's subtree end and the end of the end tag (where present);
 * - any other node: the end of its original text.
 */
static const char *xnode_subtree_end(XNode *node) {
    const char *furthest = NULL;
    GumboVector *kids;
    uint32_t idx;
    int has_tags;

    if (node == NULL) {
        return NULL;
    }

    if (node->type != GUMBO_NODE_DOCUMENT &&
        node->type != GUMBO_NODE_ELEMENT &&
        node->type != GUMBO_NODE_TEMPLATE) {
        /* Text-like node: the extent is just its original text. */
        if (node->v.text.original_text.data == NULL) {
            return NULL;
        }
        return node->v.text.original_text.data +
               node->v.text.original_text.length;
    }

    /* Only element/template nodes carry start and end tags. */
    has_tags = node->type != GUMBO_NODE_DOCUMENT;

    if (has_tags && node->v.element.original_tag.data != NULL) {
        furthest = node->v.element.original_tag.data +
                   node->v.element.original_tag.length;
    }

    kids = xnode_children(node);
    if (kids != NULL) {
        for (idx = 0; idx < kids->length; idx++) {
            furthest = max_ptr(furthest,
                               xnode_subtree_end((XNode*) kids->data[idx]));
        }
    }

    if (has_tags && node->v.element.original_end_tag.data != NULL) {
        furthest = max_ptr(furthest,
                           node->v.element.original_end_tag.data +
                           node->v.element.original_end_tag.length);
    }

    return furthest;
}
|
|
290
|
+
|
|
291
|
+
/*
 * Locates the original source text of an element.
 *
 * Returns a pointer to the element's start tag and stores the number of
 * bytes belonging to the element in *length. An element without an
 * original start tag yields "" and a length of 0.
 *
 * The end position is taken from the recorded start/end offsets when the
 * end offset is greater than the start offset, otherwise from the extent
 * of the subtree. Without an end tag the result is additionally capped by
 * xnode_implicit_end_boundary(). If no usable end is found, the length
 * falls back to that of the start tag alone.
 */
static const char *xnode_html_element(XNode *node, size_t *length) {
    GumboElement *el = &node->v.element;
    const char *begin = el->original_tag.data;
    const char *stop;

    *length = 0;

    if (el->original_tag.length == 0) {
        return "";
    }

    if (el->end_pos.offset > el->start_pos.offset) {
        stop = begin + (el->end_pos.offset - el->start_pos.offset);
        if (el->original_end_tag.length > 0) {
            stop += el->original_end_tag.length;
        }
    } else {
        stop = xnode_subtree_end(node);
    }

    if (el->original_end_tag.length == 0) {
        stop = min_ptr(stop, xnode_implicit_end_boundary(node));
    }

    if (stop != NULL && stop >= begin) {
        *length = stop - begin;
    }

    if (*length == 0) {
        *length = el->original_tag.length;
    }

    return begin;
}
|
|
321
|
+
|
|
322
|
+
/*
 * Returns the original source text of a node and stores its byte length
 * in *length. The returned text is not necessarily NUL-terminated at
 * *length.
 *
 * - document: "" with length 0;
 * - element and template: the element's source as computed by
 *   xnode_html_element() (template nodes store their payload in v.element
 *   just like elements, so they must not fall through to the text branch);
 * - any other node: its original text.
 */
const char *xnode_html(XNode *node, size_t *length) {
    switch (node->type) {
    case GUMBO_NODE_DOCUMENT:
        *length = 0;
        return "";
    case GUMBO_NODE_ELEMENT:
    case GUMBO_NODE_TEMPLATE:
        return xnode_html_element(node, length);
    default:
        *length = node->v.text.original_text.length;
        return node->v.text.original_text.data;
    }
}
|
|
334
|
+
|
|
335
|
+
/**
|
|
336
|
+
* Returns the text content of the first GUMBO_NODE_TEXT child.
|
|
337
|
+
*/
|
|
338
|
+
const char *xnode_text(XNode *node) {
|
|
339
|
+
uint32_t i;
|
|
340
|
+
|
|
341
|
+
for (i = 0; i < node->v.element.children.length; ++i) {
|
|
342
|
+
GumboNode* child = (GumboNode*) node->v.element.children.data[i];
|
|
343
|
+
if (child->type == GUMBO_NODE_TEXT) {
|
|
344
|
+
return child->v.text.text;
|
|
345
|
+
}
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
return NULL;
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
const char *xnode_get_tag_name(GumboNode *node, size_t *length);
|
|
352
|
+
|
|
353
|
+
/*
 * Thin alias for xnode_get_tag_name(): forwards both arguments and returns
 * its result unchanged.
 */
const char *xnode_type(XNode *node, size_t *length) {
    const char *tag_name = xnode_get_tag_name(node, length);

    return tag_name;
}
|
|
356
|
+
|
|
357
|
+
/*
 * Returns 1 when the element/template node's "class" attribute contains
 * `class_name` as one of its whitespace-separated entries, 0 otherwise
 * (including for other node types and for nodes without a class
 * attribute).
 */
int xnode_has_class(XNode *node, const char *class_name) {
    GumboAttribute *class_attr;

    if (node->type != GUMBO_NODE_ELEMENT &&
        node->type != GUMBO_NODE_TEMPLATE) {
        return 0;
    }

    class_attr = gumbo_get_attribute(&node->v.element.attributes, "class");
    if (class_attr == NULL) {
        return 0;
    }

    return string_in_space_list(class_attr->value, class_name) != 0;
}
|
|
366
|
+
|
|
367
|
+
/*
 * Returns 1 when the element/template node carries an attribute called
 * `name`, 0 otherwise (including for other node types).
 */
int xnode_has_attr(XNode *node, const char *name) {
    if (node->type != GUMBO_NODE_ELEMENT &&
        node->type != GUMBO_NODE_TEMPLATE) {
        return 0;
    }

    return gumbo_get_attribute(&node->v.element.attributes, name) != NULL;
}
|