gumbo-html 0.2.3 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -10
- package/binding.gyp +49 -0
- package/examples/example.js +87 -0
- package/examples/scrape.js +301 -0
- package/index.d.ts +58 -3
- package/index.js +7 -2
- package/lib/wrapper.js +385 -0
- package/package.json +36 -5
- package/src/addon.cc +19 -0
- package/src/gumbo-parser/COPYING +201 -0
- package/src/gumbo-parser/README.md +8 -0
- package/src/gumbo-parser/src/attribute.c +44 -0
- package/src/gumbo-parser/src/attribute.h +37 -0
- package/src/gumbo-parser/src/char_ref.c +23069 -0
- package/src/gumbo-parser/src/char_ref.h +60 -0
- package/src/gumbo-parser/src/error.c +279 -0
- package/src/gumbo-parser/src/error.h +225 -0
- package/src/gumbo-parser/src/gumbo.h +671 -0
- package/src/gumbo-parser/src/insertion_mode.h +57 -0
- package/src/gumbo-parser/src/parser.c +4192 -0
- package/src/gumbo-parser/src/parser.h +57 -0
- package/src/gumbo-parser/src/string_buffer.c +110 -0
- package/src/gumbo-parser/src/string_buffer.h +84 -0
- package/src/gumbo-parser/src/string_piece.c +48 -0
- package/src/gumbo-parser/src/string_piece.h +38 -0
- package/src/gumbo-parser/src/tag.c +95 -0
- package/src/gumbo-parser/src/tag_enum.h +153 -0
- package/src/gumbo-parser/src/tag_gperf.h +105 -0
- package/src/gumbo-parser/src/tag_sizes.h +4 -0
- package/src/gumbo-parser/src/tag_strings.h +153 -0
- package/src/gumbo-parser/src/token_type.h +41 -0
- package/src/gumbo-parser/src/tokenizer.c +2897 -0
- package/src/gumbo-parser/src/tokenizer.h +123 -0
- package/src/gumbo-parser/src/tokenizer_states.h +103 -0
- package/src/gumbo-parser/src/utf8.c +270 -0
- package/src/gumbo-parser/src/utf8.h +132 -0
- package/src/gumbo-parser/src/util.c +58 -0
- package/src/gumbo-parser/src/util.h +60 -0
- package/src/gumbo-parser/src/vector.c +123 -0
- package/src/gumbo-parser/src/vector.h +67 -0
- package/src/html_document.cc +411 -0
- package/src/html_document.h +56 -0
- package/src/html_element.cc +963 -0
- package/src/html_element.h +70 -0
- package/src/include/win/strings.h +11 -0
- package/src/jsa.c +182 -0
- package/src/jsa.h +44 -0
- package/src/xnode.c +372 -0
- package/src/xnode_query.c +330 -0
- package/src/xnode_query.h +186 -0
- package/src/xnode_query_parser.c +414 -0
- package/install.js +0 -15
package/README.md
CHANGED
|
@@ -1,26 +1,43 @@
|
|
|
1
|
+
# gumbo-html
|
|
2
|
+
|
|
1
3
|
CSS selector based on Gumbo HTML parser.
|
|
2
4
|
|
|
3
5
|
## Installation
|
|
6
|
+
|
|
7
|
+
```sh
|
|
8
|
+
npm install gumbo-html
|
|
4
9
|
```
|
|
5
|
-
|
|
6
|
-
|
|
10
|
+
|
|
11
|
+
`gumbo-html` is a native Node.js addon and is compiled from source on install.
|
|
12
|
+
You'll need the standard `node-gyp` toolchain:
|
|
13
|
+
|
|
14
|
+
- Python 3
|
|
15
|
+
- A C/C++ compiler (Xcode Command Line Tools on macOS, `build-essential` on
|
|
16
|
+
Linux, or Visual Studio Build Tools on Windows)
|
|
17
|
+
|
|
18
|
+
See the [node-gyp docs](https://github.com/nodejs/node-gyp#installation) for
|
|
19
|
+
platform-specific setup details.
|
|
7
20
|
|
|
8
21
|
## Usage
|
|
9
22
|
|
|
10
|
-
Example:
|
|
11
23
|
```ts
|
|
12
|
-
import {parse} from 'gumbo-html';
|
|
24
|
+
import { parse } from 'gumbo-html';
|
|
13
25
|
|
|
14
|
-
const html = `
|
|
26
|
+
const html = `
|
|
15
27
|
<html>
|
|
16
28
|
<p class="foo bar blah">Foo</p>
|
|
17
29
|
<p class="bar">Bar</p>
|
|
18
30
|
</html>
|
|
19
|
-
|
|
31
|
+
`;
|
|
20
32
|
|
|
21
|
-
const
|
|
33
|
+
const doc = parse(html);
|
|
22
34
|
|
|
23
|
-
|
|
24
|
-
console.log(el.innerText)
|
|
35
|
+
doc.find('.bar').forEach((el) => {
|
|
36
|
+
console.log(el.innerText);
|
|
25
37
|
});
|
|
26
|
-
```
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## License
|
|
41
|
+
|
|
42
|
+
MIT. Bundles [google/gumbo-parser](https://github.com/google/gumbo-parser)
|
|
43
|
+
(Apache-2.0) under `src/gumbo-parser/`.
|
package/binding.gyp
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"targets": [
|
|
3
|
+
{
|
|
4
|
+
"target_name": "html",
|
|
5
|
+
"sources": [
|
|
6
|
+
"src/addon.cc",
|
|
7
|
+
"src/html_document.cc",
|
|
8
|
+
"src/html_element.cc",
|
|
9
|
+
"src/xnode.c",
|
|
10
|
+
"src/xnode_query.c",
|
|
11
|
+
"src/xnode_query_parser.c",
|
|
12
|
+
"src/jsa.c",
|
|
13
|
+
"src/gumbo-parser/src/attribute.c",
|
|
14
|
+
"src/gumbo-parser/src/error.c",
|
|
15
|
+
"src/gumbo-parser/src/string_buffer.c",
|
|
16
|
+
"src/gumbo-parser/src/tag.c",
|
|
17
|
+
"src/gumbo-parser/src/utf8.c",
|
|
18
|
+
"src/gumbo-parser/src/vector.c",
|
|
19
|
+
"src/gumbo-parser/src/char_ref.c",
|
|
20
|
+
"src/gumbo-parser/src/parser.c",
|
|
21
|
+
"src/gumbo-parser/src/string_piece.c",
|
|
22
|
+
"src/gumbo-parser/src/tokenizer.c",
|
|
23
|
+
"src/gumbo-parser/src/util.c"
|
|
24
|
+
],
|
|
25
|
+
"include_dirs": [
|
|
26
|
+
"<!@(node -p \"require('node-addon-api').include\")",
|
|
27
|
+
"src",
|
|
28
|
+
"src/gumbo-parser/src"
|
|
29
|
+
],
|
|
30
|
+
"cflags!": ["-fno-exceptions"],
|
|
31
|
+
"cflags_cc!": ["-fno-exceptions"],
|
|
32
|
+
"xcode_settings": {
|
|
33
|
+
"GCC_ENABLE_CPP_EXCEPTIONS": "YES",
|
|
34
|
+
"CLANG_CXX_LIBRARY": "libc++",
|
|
35
|
+
"MACOSX_DEPLOYMENT_TARGET": "10.15"
|
|
36
|
+
},
|
|
37
|
+
"msvs_settings": {
|
|
38
|
+
"VCCLCompilerTool": {
|
|
39
|
+
"ExceptionHandling": 1
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
"conditions": [
|
|
43
|
+
["OS==\"win\"", {
|
|
44
|
+
"include_dirs": ["src/include/win"]
|
|
45
|
+
}]
|
|
46
|
+
]
|
|
47
|
+
}
|
|
48
|
+
]
|
|
49
|
+
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
const { parse } = require('..');
|
|
4
|
+
|
|
5
|
+
const html = `
|
|
6
|
+
<!doctype html>
|
|
7
|
+
<html>
|
|
8
|
+
<body>
|
|
9
|
+
<main id="content">
|
|
10
|
+
<article class="post featured" data-slug="hello-world">
|
|
11
|
+
<h1>Hello world</h1>
|
|
12
|
+
<p class="summary">A short introduction.</p>
|
|
13
|
+
<a class="cta primary" href="/hello">Read more</a>
|
|
14
|
+
</article>
|
|
15
|
+
|
|
16
|
+
<article class="post" data-slug="second-post">
|
|
17
|
+
<h1>Second post</h1>
|
|
18
|
+
<p class="summary">A follow-up note.</p>
|
|
19
|
+
<a class="cta" href="/second">Open post</a>
|
|
20
|
+
</article>
|
|
21
|
+
</main>
|
|
22
|
+
</body>
|
|
23
|
+
</html>
|
|
24
|
+
`;
|
|
25
|
+
|
|
26
|
+
const doc = parse(html);
|
|
27
|
+
|
|
28
|
+
// documentElement returns the parsed <html> element.
|
|
29
|
+
console.log('Root tag:', doc.documentElement.tagName);
|
|
30
|
+
|
|
31
|
+
// find(selector) returns every matching element under the document or element.
|
|
32
|
+
const posts = doc.find('article.post');
|
|
33
|
+
console.log('Post count:', posts.length);
|
|
34
|
+
|
|
35
|
+
posts.forEach((post, index) => {
|
|
36
|
+
// attr(name) returns undefined when the attribute is missing.
|
|
37
|
+
console.log(`Post ${index + 1}:`, post.attr('data-slug'));
|
|
38
|
+
|
|
39
|
+
// first(selector) returns the first match or null.
|
|
40
|
+
const title = post.first('h1');
|
|
41
|
+
console.log(' title:', title ? title.innerText : '(missing)');
|
|
42
|
+
|
|
43
|
+
// hasClass(name) and hasAttribute(name) are convenience checks.
|
|
44
|
+
console.log(' featured:', post.hasClass('featured'));
|
|
45
|
+
console.log(' has slug:', post.hasAttribute('data-slug'));
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
// first_s(selector) is the throwing version of first(selector).
|
|
49
|
+
// Use it when the element is required for the rest of your code.
|
|
50
|
+
const content = doc.first_s('#content');
|
|
51
|
+
console.log('Main outerHTML starts with:', content.outerHTML.slice(0, 20));
|
|
52
|
+
|
|
53
|
+
// only(selector) returns the match only when exactly one element is found.
|
|
54
|
+
const featuredPost = doc.only('article.featured');
|
|
55
|
+
console.log('Featured slug:', featuredPost.attr_s('data-slug'));
|
|
56
|
+
|
|
57
|
+
// only_s(selector) throws unless exactly one element is found.
|
|
58
|
+
try {
|
|
59
|
+
doc.only_s('article.post');
|
|
60
|
+
} catch (error) {
|
|
61
|
+
console.log('only_s on many posts:', error.message);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
// Element-scoped queries search only inside that element.
|
|
65
|
+
const firstPost = doc.first_s('article.post');
|
|
66
|
+
console.log('CTA href:', firstPost.first_s('a.cta').attr_s('href'));
|
|
67
|
+
|
|
68
|
+
// next(selector) and prev(selector) walk element siblings.
|
|
69
|
+
const secondPost = firstPost.next('article.post');
|
|
70
|
+
console.log('Next post slug:', secondPost.attr_s('data-slug'));
|
|
71
|
+
console.log('Previous post slug:', secondPost.prev('article.post').attr_s('data-slug'));
|
|
72
|
+
|
|
73
|
+
// childNodes includes text/whitespace nodes as well as element nodes.
|
|
74
|
+
// nodeType helps distinguish them.
|
|
75
|
+
const childTypes = content.childNodes.map((node) => node.nodeType);
|
|
76
|
+
console.log('Main child node types:', childTypes.join(', '));
|
|
77
|
+
|
|
78
|
+
// parent returns the parent element, or null for the document root.
|
|
79
|
+
console.log('First post parent:', firstPost.parent.tagName);
|
|
80
|
+
console.log('Document root parent:', doc.documentElement.parent);
|
|
81
|
+
|
|
82
|
+
// attr_s(name) throws when an attribute is required but missing.
|
|
83
|
+
try {
|
|
84
|
+
firstPost.attr_s('missing');
|
|
85
|
+
} catch (error) {
|
|
86
|
+
console.log('attr_s on missing attribute:', error.message);
|
|
87
|
+
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* gumbo-html examples
|
|
5
|
+
*
|
|
6
|
+
* Demonstrates all the new features including:
|
|
7
|
+
* - Friendly aliases (firstOrThrow, onlyOrThrow, attrOrThrow)
|
|
8
|
+
* - Convenience methods (exists, count, text, attr with selector)
|
|
9
|
+
* - Traversal (closest, children, siblings, matches, is)
|
|
10
|
+
* - Table extraction (rows, table)
|
|
11
|
+
* - Text normalization
|
|
12
|
+
* - Structured extraction
|
|
13
|
+
* - URL resolution with baseUrl
|
|
14
|
+
* - Common extractors (meta, links, images, title, etc.)
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
const html = require('..');
|
|
18
|
+
|
|
19
|
+
// ============================================================
|
|
20
|
+
// Parse an HTML page
|
|
21
|
+
// ============================================================
|
|
22
|
+
const HTML = `
|
|
23
|
+
<!DOCTYPE html>
|
|
24
|
+
<html>
|
|
25
|
+
<head>
|
|
26
|
+
<title>Example Blog</title>
|
|
27
|
+
<meta name="description" content="A blog about web scraping">
|
|
28
|
+
<meta property="og:title" content="Example Blog OG">
|
|
29
|
+
<meta property="og:image" content="/images/og.png">
|
|
30
|
+
<link rel="canonical" href="https://example.com/blog/">
|
|
31
|
+
</head>
|
|
32
|
+
<body>
|
|
33
|
+
<article class="post featured">
|
|
34
|
+
<h1> Getting Started with Web Scraping </h1>
|
|
35
|
+
<p>First paragraph of content.</p>
|
|
36
|
+
<a href="/post/getting-started">Read More</a>
|
|
37
|
+
<img src="/images/scraping.png" alt="Web Scraping">
|
|
38
|
+
<ul class="tags">
|
|
39
|
+
<li>scraping</li>
|
|
40
|
+
<li>html</li>
|
|
41
|
+
<li>tutorial</li>
|
|
42
|
+
</ul>
|
|
43
|
+
</article>
|
|
44
|
+
|
|
45
|
+
<article class="post">
|
|
46
|
+
<h1>Advanced CSS Selectors</h1>
|
|
47
|
+
<p>Learn about complex selectors.</p>
|
|
48
|
+
<a href="/post/advanced-selectors">Read More</a>
|
|
49
|
+
<ul class="tags">
|
|
50
|
+
<li>css</li>
|
|
51
|
+
<li>selectors</li>
|
|
52
|
+
</ul>
|
|
53
|
+
</article>
|
|
54
|
+
|
|
55
|
+
<section class="sidebar">
|
|
56
|
+
<h2>Popular Posts</h2>
|
|
57
|
+
<ul>
|
|
58
|
+
<li><a href="/popular/1">How to Use CSS Selectors</a></li>
|
|
59
|
+
<li><a href="/popular/2">HTML Parsing Guide</a></li>
|
|
60
|
+
</ul>
|
|
61
|
+
</section>
|
|
62
|
+
|
|
63
|
+
<table class="pricing">
|
|
64
|
+
<thead>
|
|
65
|
+
<tr><th>Plan</th><th>Price</th><th>Features</th></tr>
|
|
66
|
+
</thead>
|
|
67
|
+
<tbody>
|
|
68
|
+
<tr><td>Basic</td><td>$10/mo</td><td>100 requests</td></tr>
|
|
69
|
+
<tr><td>Pro</td><td>$30/mo</td><td>1000 requests</td></tr>
|
|
70
|
+
<tr><td>Enterprise</td><td>$100/mo</td><td>Unlimited</td></tr>
|
|
71
|
+
</tbody>
|
|
72
|
+
</table>
|
|
73
|
+
|
|
74
|
+
<form action="/search" method="get">
|
|
75
|
+
<input type="text" name="q" placeholder="Search...">
|
|
76
|
+
<button type="submit">Go</button>
|
|
77
|
+
</form>
|
|
78
|
+
</body>
|
|
79
|
+
</html>
|
|
80
|
+
`;
|
|
81
|
+
|
|
82
|
+
const doc = html.parse(HTML, { baseUrl: 'https://example.com/blog/' });
|
|
83
|
+
|
|
84
|
+
// ============================================================
|
|
85
|
+
// 1. Friendly Required/Optional Aliases
|
|
86
|
+
// ============================================================
|
|
87
|
+
console.log('=== 1. Friendly Aliases ===');
|
|
88
|
+
|
|
89
|
+
// firstOrThrow - like first_s but more readable
|
|
90
|
+
const firstArticle = doc.firstOrThrow('article');
|
|
91
|
+
console.log('firstOrThrow article text:', firstArticle.text('h1'));
|
|
92
|
+
|
|
93
|
+
// onlyOrThrow - returns single element or throws
|
|
94
|
+
const sidebar = doc.onlyOrThrow('.sidebar');
|
|
95
|
+
console.log('onlyOrThrow .sidebar heading:', sidebar.text('h2'));
|
|
96
|
+
|
|
97
|
+
// attrOrThrow - get attribute or throw
|
|
98
|
+
const firstLink = doc.firstOrThrow('a');
|
|
99
|
+
console.log('attrOrThrow href:', firstLink.attrOrThrow('href'));
|
|
100
|
+
|
|
101
|
+
// textOrThrow - find element and get text or throw
|
|
102
|
+
console.log('textOrThrow h1:', doc.textOrThrow('h1'));
|
|
103
|
+
|
|
104
|
+
// ============================================================
|
|
105
|
+
// 2. Selector-Scoped Convenience Methods
|
|
106
|
+
// ============================================================
|
|
107
|
+
console.log('\n=== 2. Convenience Methods ===');
|
|
108
|
+
|
|
109
|
+
// exists(selector) - check if any element matches
|
|
110
|
+
console.log('exists .featured:', doc.exists('.featured')); // true
|
|
111
|
+
console.log('exists .missing:', doc.exists('.missing')); // false
|
|
112
|
+
|
|
113
|
+
// count(selector) - count matching elements
|
|
114
|
+
console.log('count article:', doc.count('article')); // 2
|
|
115
|
+
console.log('count li:', doc.count('li')); // 7 (3 + 2 tag items, plus 2 sidebar items)
|
|
116
|
+
|
|
117
|
+
// text(selector) - get text of first match (null if not found)
|
|
118
|
+
console.log('text h1:', JSON.stringify(doc.text('h1'))); // "Getting Started..."
|
|
119
|
+
|
|
120
|
+
// attr(selector, name) - get attribute of first match
|
|
121
|
+
console.log('attr meta[property] content:', doc.attr('meta[property="og:title"]', 'content'));
|
|
122
|
+
|
|
123
|
+
// attrOrThrow(selector, name) - get attribute or throw
|
|
124
|
+
console.log('attrOrThrow a href:', doc.attrOrThrow('a', 'href'));
|
|
125
|
+
|
|
126
|
+
// ============================================================
|
|
127
|
+
// 3. Text Normalization
|
|
128
|
+
// ============================================================
|
|
129
|
+
console.log('\n=== 3. Text Normalization ===');
|
|
130
|
+
|
|
131
|
+
const h1 = doc.firstOrThrow('h1');
|
|
132
|
+
|
|
133
|
+
// raw text (default)
|
|
134
|
+
console.log('raw text:', JSON.stringify(h1.text()));
|
|
135
|
+
|
|
136
|
+
// normalized (trimmed + collapsed whitespace)
|
|
137
|
+
console.log('normalized:', JSON.stringify(h1.text({ normalize: true })));
|
|
138
|
+
|
|
139
|
+
// separator (join descendant text with custom separator)
|
|
140
|
+
console.log('separator:', JSON.stringify(h1.text({ separator: ' | ' })));
|
|
141
|
+
|
|
142
|
+
// ============================================================
|
|
143
|
+
// 4. Traversal
|
|
144
|
+
// ============================================================
|
|
145
|
+
console.log('\n=== 4. Traversal ===');
|
|
146
|
+
|
|
147
|
+
// closest - find nearest ancestor matching selector
|
|
148
|
+
const firstH1 = doc.firstOrThrow('h1');
|
|
149
|
+
const article = firstH1.closest('article');
|
|
150
|
+
console.log('h1.closest(article) tag:', article.tagName);
|
|
151
|
+
console.log('h1.closest(article) class:', article.attr('class'));
|
|
152
|
+
|
|
153
|
+
// children - get direct element children (optionally filtered)
|
|
154
|
+
const allChildren = article.children();
|
|
155
|
+
console.log('article children count:', allChildren.length);
|
|
156
|
+
|
|
157
|
+
const headingChildren = article.children('h1');
|
|
158
|
+
console.log('article children(h1):', headingChildren.length);
|
|
159
|
+
|
|
160
|
+
// siblings - get sibling elements (optionally filtered)
|
|
161
|
+
const firstP = article.firstOrThrow('p');
|
|
162
|
+
const siblingElements = firstP.siblings();
|
|
163
|
+
console.log('p siblings count:', siblingElements.length);
|
|
164
|
+
|
|
165
|
+
const linkSiblings = firstP.siblings('a');
|
|
166
|
+
console.log('p siblings(a):', linkSiblings.length);
|
|
167
|
+
|
|
168
|
+
// matches / is - check if element matches a selector
|
|
169
|
+
console.log('h1 matches article h1:', firstH1.matches('article h1'));
|
|
170
|
+
console.log('h1 matches div:', firstH1.matches('div'));
|
|
171
|
+
console.log('h1 is(h1):', firstH1.is('h1'));
|
|
172
|
+
|
|
173
|
+
// ============================================================
|
|
174
|
+
// 5. Table Extraction
|
|
175
|
+
// ============================================================
|
|
176
|
+
console.log('\n=== 5. Table Extraction ===');
|
|
177
|
+
|
|
178
|
+
// rows() - extract table rows as objects
|
|
179
|
+
const pricingRows = doc.firstOrThrow('.pricing').rows();
|
|
180
|
+
console.log('pricing table rows:');
|
|
181
|
+
for (const row of pricingRows) {
|
|
182
|
+
console.log(` ${row.Plan}: ${row.Price} (${row.Features})`);
|
|
183
|
+
}
|
|
184
|
+
|
|
185
|
+
// doc.table() - convenience: find table and extract rows
|
|
186
|
+
const tableData = doc.table('.pricing');
|
|
187
|
+
console.log('doc.table direct:', JSON.stringify(tableData));
|
|
188
|
+
|
|
189
|
+
// ============================================================
|
|
190
|
+
// 6. Structured Extraction
|
|
191
|
+
// ============================================================
|
|
192
|
+
console.log('\n=== 6. Structured Extraction ===');
|
|
193
|
+
|
|
194
|
+
const extracted = doc.extract({
|
|
195
|
+
title: ['h1', 'text'],
|
|
196
|
+
canonicalUrl: ['link[rel="canonical"]', 'href'],
|
|
197
|
+
hasFeatured: ['.featured', 'exists'],
|
|
198
|
+
|
|
199
|
+
articles: ['article.post', {
|
|
200
|
+
heading: ['h1', 'text'],
|
|
201
|
+
link: ['a', 'href'],
|
|
202
|
+
hasImage: ['img', 'exists'],
|
|
203
|
+
tags: ['ul.tags li', 'text'],
|
|
204
|
+
}],
|
|
205
|
+
});
|
|
206
|
+
|
|
207
|
+
console.log(JSON.stringify(extracted, null, 2));
|
|
208
|
+
|
|
209
|
+
// Element-level extract
|
|
210
|
+
const articleEl = doc.firstOrThrow('article');
|
|
211
|
+
const articleData = articleEl.extract({
|
|
212
|
+
heading: ['h1', 'text'],
|
|
213
|
+
tagCount: ['li', 'count'],
|
|
214
|
+
});
|
|
215
|
+
console.log('elem.extract:', JSON.stringify(articleData));
|
|
216
|
+
|
|
217
|
+
// ============================================================
|
|
218
|
+
// 7. URL Resolution
|
|
219
|
+
// ============================================================
|
|
220
|
+
console.log('\n=== 7. URL Resolution ===');
|
|
221
|
+
|
|
222
|
+
// baseUrl resolves relative URLs in links() and images()
|
|
223
|
+
const links = doc.links();
|
|
224
|
+
console.log('links (with baseUrl):');
|
|
225
|
+
for (const l of links) {
|
|
226
|
+
console.log(` "${l.text}" -> ${l.href}`);
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
const imgs = doc.images();
|
|
230
|
+
console.log('images (with baseUrl):');
|
|
231
|
+
for (const img of imgs) {
|
|
232
|
+
console.log(` alt="${img.alt}" src="${img.src}"`);
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
// doc.url(selector, attr) - resolve a specific URL
|
|
236
|
+
console.log('url(a, href):', doc.url('a', 'href'));
|
|
237
|
+
|
|
238
|
+
// ============================================================
|
|
239
|
+
// 8. Common Extractors
|
|
240
|
+
// ============================================================
|
|
241
|
+
console.log('\n=== 8. Common Extractors ===');
|
|
242
|
+
|
|
243
|
+
// Page title
|
|
244
|
+
console.log('title():', doc.title());
|
|
245
|
+
|
|
246
|
+
// Meta description
|
|
247
|
+
console.log('description():', doc.description());
|
|
248
|
+
|
|
249
|
+
// Canonical URL
|
|
250
|
+
console.log('canonicalUrl():', doc.canonicalUrl());
|
|
251
|
+
|
|
252
|
+
// Meta tags object
|
|
253
|
+
console.log('meta():', JSON.stringify(doc.meta()));
|
|
254
|
+
|
|
255
|
+
// Forms
|
|
256
|
+
console.log('forms count:', doc.forms().length);
|
|
257
|
+
|
|
258
|
+
// Tables
|
|
259
|
+
console.log('tables count:', doc.tables().length);
|
|
260
|
+
|
|
261
|
+
// ============================================================
|
|
262
|
+
// 9. Worked Example: Real-World Scraping
|
|
263
|
+
// ============================================================
|
|
264
|
+
console.log('\n=== 9. Real-World Scraping Example ===');
|
|
265
|
+
|
|
266
|
+
/**
|
|
267
|
+
* Extract structured data from a blog page with one call.
|
|
268
|
+
*/
|
|
269
|
+
function scrapeBlogPage(htmlContent, pageUrl) {
|
|
270
|
+
const doc = html.parse(htmlContent, { baseUrl: pageUrl });
|
|
271
|
+
|
|
272
|
+
return doc.extract({
|
|
273
|
+
// Page-level info
|
|
274
|
+
pageTitle: ['title', 'text'],
|
|
275
|
+
metaDescription: ['meta[name="description"]', 'content'],
|
|
276
|
+
ogImage: ['meta[property="og:image"]', 'content'],
|
|
277
|
+
canonicalUrl: ['link[rel="canonical"]', 'href'],
|
|
278
|
+
|
|
279
|
+
// Content
|
|
280
|
+
posts: ['article.post', {
|
|
281
|
+
heading: ['h1', 'text'],
|
|
282
|
+
link: ['a', 'href'],
|
|
283
|
+
hasImage: ['img', 'exists'],
|
|
284
|
+
isFeatured: ['.featured', 'exists'],
|
|
285
|
+
tagCount: ['ul.tags li', 'count'],
|
|
286
|
+
}],
|
|
287
|
+
|
|
288
|
+
// Sidebar links
|
|
289
|
+
sidebarLinks: ['.sidebar a', 'text'],
|
|
290
|
+
|
|
291
|
+
// Stats
|
|
292
|
+
totalArticles: ['article.post', 'count'],
|
|
293
|
+
hasForm: ['form', 'exists'],
|
|
294
|
+
hasPricing: ['.pricing', 'exists'],
|
|
295
|
+
});
|
|
296
|
+
}
|
|
297
|
+
|
|
298
|
+
const result = scrapeBlogPage(HTML, 'https://example.com/blog/');
|
|
299
|
+
console.log(JSON.stringify(result, null, 2));
|
|
300
|
+
|
|
301
|
+
console.log('\nAll examples completed successfully!');
|
package/index.d.ts
CHANGED
|
@@ -1,11 +1,21 @@
|
|
|
1
1
|
export declare type NodeType = 'DOCUMENT' | 'ELEMENT' | 'TEXT' | 'CDATA' | 'COMMENT' | 'WHITESPACE' | 'TEMPLATE' | 'UNKNOWN';
|
|
2
2
|
|
|
3
|
+
export declare type TextOptions = {
|
|
4
|
+
normalize?: boolean;
|
|
5
|
+
separator?: string;
|
|
6
|
+
};
|
|
7
|
+
|
|
8
|
+
export declare type ExtractSchema = {
|
|
9
|
+
[key: string]: [string, string | ExtractSchema | 'exists' | 'text' | 'count'];
|
|
10
|
+
};
|
|
11
|
+
|
|
3
12
|
export declare type XElement = {
|
|
4
13
|
childNodes: XElement[];
|
|
5
14
|
nodeType: NodeType;
|
|
6
15
|
parent: XElement | null;
|
|
7
16
|
outerHTML: string;
|
|
8
17
|
innerText: string;
|
|
18
|
+
textContent: string;
|
|
9
19
|
tagName: string | null;
|
|
10
20
|
|
|
11
21
|
attr: (name: string) => string | undefined;
|
|
@@ -13,24 +23,69 @@ export declare type XElement = {
|
|
|
13
23
|
find: (selector: string) => XElement[];
|
|
14
24
|
first: (selector: string) => XElement | null;
|
|
15
25
|
first_s: (selector: string) => XElement;
|
|
26
|
+
firstOrThrow: (selector: string) => XElement;
|
|
16
27
|
only: (selector: string) => XElement | null;
|
|
17
28
|
only_s: (selector: string) => XElement;
|
|
29
|
+
onlyOrThrow: (selector: string) => XElement;
|
|
18
30
|
hasClass: (name: string) => boolean;
|
|
19
31
|
hasAttribute: (name: string) => boolean;
|
|
20
32
|
prev: (selector?: string) => XElement | null;
|
|
21
33
|
next: (selector?: string) => XElement | null;
|
|
34
|
+
|
|
35
|
+
// New methods
|
|
36
|
+
attrOrThrow: (name: string) => string;
|
|
37
|
+
/**
 * Overloaded text accessor (an intersection of both call signatures, so either form type-checks):
 * - `text(opts?)` returns a string for this element.
 * - `text(selector, opts?)` returns a string for the first match, or null when nothing matches.
 */
text: ((opts?: TextOptions) => string) & ((selector: string, opts?: TextOptions) => string | null);
|
|
38
|
+
textOrThrow: (selector: string) => string;
|
|
39
|
+
exists: (selector: string) => boolean;
|
|
40
|
+
count: (selector: string) => number;
|
|
41
|
+
closest: (selector: string) => XElement | null;
|
|
42
|
+
children: (selector?: string) => XElement[];
|
|
43
|
+
siblings: (selector?: string) => XElement[];
|
|
44
|
+
matches: (selector: string) => boolean;
|
|
45
|
+
is: (selector: string) => boolean;
|
|
46
|
+
rows: () => Array<{ [header: string]: string }>;
|
|
47
|
+
urlAttr: (attrName: string) => string | undefined;
|
|
48
|
+
extract: (schema: ExtractSchema) => any;
|
|
22
49
|
};
|
|
23
50
|
|
|
24
51
|
export declare type XDocument = {
|
|
25
52
|
documentElement: XElement;
|
|
26
|
-
outerHTML: string;
|
|
27
53
|
innerText: string;
|
|
54
|
+
textContent: string;
|
|
55
|
+
outerHTML: string;
|
|
28
56
|
tagName: string | null;
|
|
29
|
-
|
|
57
|
+
nodeType: NodeType;
|
|
58
|
+
|
|
59
|
+
find: (selector: string) => XElement[];
|
|
30
60
|
first: (selector: string) => XElement | null;
|
|
31
61
|
first_s: (selector: string) => XElement;
|
|
62
|
+
firstOrThrow: (selector: string) => XElement;
|
|
32
63
|
only: (selector: string) => XElement | null;
|
|
33
64
|
only_s: (selector: string) => XElement;
|
|
65
|
+
onlyOrThrow: (selector: string) => XElement;
|
|
66
|
+
|
|
67
|
+
// New convenience methods
|
|
68
|
+
text: (selector: string, opts?: TextOptions) => string | null;
|
|
69
|
+
textOrThrow: (selector: string) => string;
|
|
70
|
+
attr: (selector: string, name: string) => string | undefined;
|
|
71
|
+
attrOrThrow: (selector: string, name: string) => string;
|
|
72
|
+
exists: (selector: string) => boolean;
|
|
73
|
+
count: (selector: string) => number;
|
|
74
|
+
|
|
75
|
+
// URL helpers
|
|
76
|
+
url: (selector: string, attr: string) => string | undefined;
|
|
77
|
+
|
|
78
|
+
// High-level extractors
|
|
79
|
+
extract: (schema: ExtractSchema) => any;
|
|
80
|
+
meta: () => { [key: string]: string };
|
|
81
|
+
links: () => Array<{ text: string; href: string }>;
|
|
82
|
+
images: () => Array<{ alt: string; src: string }>;
|
|
83
|
+
forms: () => XElement[];
|
|
84
|
+
tables: () => XElement[];
|
|
85
|
+
table: (selector?: string) => Array<{ [header: string]: string }>;
|
|
86
|
+
title: () => string | null;
|
|
87
|
+
description: () => string | undefined;
|
|
88
|
+
canonicalUrl: () => string | undefined;
|
|
34
89
|
};
|
|
35
90
|
|
|
36
|
-
export declare function parse(html: string): XDocument;
|
|
91
|
+
export declare function parse(html: string, options?: { baseUrl?: string }): XDocument;
|