html-fetch-parser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/publish.yml +53 -0
- package/LICENSE +21 -0
- package/README.md +280 -0
- package/example/advanced-scraping.js +150 -0
- package/example/basic-usage.js +57 -0
- package/example/fetch-example.js +27 -0
- package/example/manipulator-example.js +39 -0
- package/index.d.ts +97 -0
- package/index.js +95 -0
- package/lib/fetcher.js +106 -0
- package/lib/manipulator.js +151 -0
- package/lib/parser.js +196 -0
- package/package.json +30 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# Auto-publish workflow: runs on pushes to main that touch package.json
# (or this workflow file), publishes to npm, and cuts a GitHub release
# tagged with the version read from package.json.
name: Auto Publish to NPM

on:
  push:
    paths:
      - "package.json"
      - ".github/workflows/publish.yml"
    branches:
      - main

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "18"
          registry-url: "https://registry.npmjs.org"

      # NOTE(review): this step only *reads* the version; it does not verify
      # the version actually changed, so re-runs on an already-published
      # version will fail at `npm publish` — confirm that is acceptable.
      - name: Check version update
        id: version-check
        run: |
          CURRENT_VERSION=$(node -pe "require('./package.json').version")
          echo "CURRENT_VERSION=$CURRENT_VERSION" >> $GITHUB_OUTPUT
          echo "📦 Current Version: $CURRENT_VERSION"

      - name: Publish to NPM
        run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

      # NOTE(review): actions/create-release is archived/unmaintained;
      # consider migrating to softprops/action-gh-release.
      - name: Create GitHub Release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
        with:
          tag_name: v${{ steps.version-check.outputs.CURRENT_VERSION }}
          release_name: Release v${{ steps.version-check.outputs.CURRENT_VERSION }}
          body: |
            # 🚀 New Release v${{ steps.version-check.outputs.CURRENT_VERSION }}
          draft: false
          prerelease: false

      - name: Success notification
        run: echo "✅ Published html-fetch-parser@${{ steps.version-check.outputs.CURRENT_VERSION }} to NPM"
|
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Kaze
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,280 @@
|
|
|
1
|
+
# HTML Fetch Parser
|
|
2
|
+
|
|
3
|
+
Lightweight and powerful HTML fetching, parsing, and manipulation library for Node.js. Combines the best features of fetch, axios, and cheerio in one simple package.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
- **Easy HTML Fetching** - Built-in HTTP client with timeout support
|
|
8
|
+
- **Powerful Parsing** - CSS selector-based HTML parsing
|
|
9
|
+
- **Simple API** - Intuitive chainable methods
|
|
10
|
+
- **Zero Heavy Dependencies** - Uses lightweight `node-html-parser`
|
|
11
|
+
- **TypeScript Support** - Full TypeScript definitions included
|
|
12
|
+
- **Utility Functions** - HTML manipulation helpers built-in
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
npm install html-fetch-parser
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Quick Start
|
|
21
|
+
|
|
22
|
+
### Fetch and Parse Remote HTML
|
|
23
|
+
|
|
24
|
+
```javascript
|
|
25
|
+
const { fetch } = require('html-fetch-parser');
|
|
26
|
+
|
|
27
|
+
const parser = await fetch('https://example.com');
|
|
28
|
+
console.log(parser.getTitle());
|
|
29
|
+
console.log(parser.text('h1'));
|
|
30
|
+
console.log(parser.getLinks());
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Load and Parse Local HTML
|
|
34
|
+
|
|
35
|
+
```javascript
|
|
36
|
+
const HtmlFetchParser = require('html-fetch-parser');
|
|
37
|
+
|
|
38
|
+
const html = '<h1>Hello World</h1><p>Welcome</p>';
|
|
39
|
+
const parser = new HtmlFetchParser();
|
|
40
|
+
parser.load(html);
|
|
41
|
+
|
|
42
|
+
console.log(parser.text('h1'));
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## API Reference
|
|
46
|
+
|
|
47
|
+
### Main Class
|
|
48
|
+
|
|
49
|
+
#### `new HtmlFetchParser(options)`
|
|
50
|
+
|
|
51
|
+
Create a new instance.
|
|
52
|
+
|
|
53
|
+
**Options:**
|
|
54
|
+
- `headers` - Default HTTP headers
|
|
55
|
+
- `timeout` - Request timeout in milliseconds (default: 10000)
|
|
56
|
+
|
|
57
|
+
#### Methods
|
|
58
|
+
|
|
59
|
+
**Fetching:**
|
|
60
|
+
- `fetch(url, options)` - Fetch HTML from URL
|
|
61
|
+
- `post(url, data, options)` - POST request
|
|
62
|
+
- `load(html)` - Load HTML string
|
|
63
|
+
|
|
64
|
+
**Querying:**
|
|
65
|
+
- `$(selector)` - Get single element (alias for querySelector)
|
|
66
|
+
- `$$(selector)` - Get all elements (alias for querySelectorAll)
|
|
67
|
+
- `text(selector)` - Get text content
|
|
68
|
+
- `textAll(selector)` - Get all text contents
|
|
69
|
+
- `attr(selector, attr)` - Get attribute value
|
|
70
|
+
- `attrAll(selector, attr)` - Get all attribute values
|
|
71
|
+
- `html(selector)` - Get inner HTML
|
|
72
|
+
|
|
73
|
+
**Data Extraction:**
|
|
74
|
+
- `extract(schema)` - Extract data using schema
|
|
75
|
+
- `getTitle()` - Get page title
|
|
76
|
+
- `getMeta(name)` - Get meta tag content
|
|
77
|
+
- `getLinks()` - Get all links
|
|
78
|
+
- `getImages()` - Get all images
|
|
79
|
+
- `getRawHtml()` - Get raw HTML string
|
|
80
|
+
|
|
81
|
+
### Extract Schema
|
|
82
|
+
|
|
83
|
+
Extract structured data easily:
|
|
84
|
+
|
|
85
|
+
```javascript
|
|
86
|
+
const data = parser.extract({
|
|
87
|
+
title: 'h1',
|
|
88
|
+
description: '.intro',
|
|
89
|
+
links: {
|
|
90
|
+
selector: 'a',
|
|
91
|
+
attr: 'href',
|
|
92
|
+
multiple: true
|
|
93
|
+
},
|
|
94
|
+
prices: {
|
|
95
|
+
selector: '.price',
|
|
96
|
+
multiple: true,
|
|
97
|
+
transform: (value) => parseFloat(value.replace('$', ''))
|
|
98
|
+
}
|
|
99
|
+
});
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Schema Options:**
|
|
103
|
+
- `selector` (required) - CSS selector
|
|
104
|
+
- `attr` - Attribute name to extract
|
|
105
|
+
- `multiple` - Extract from all matching elements
|
|
106
|
+
- `transform` - Transform function
|
|
107
|
+
|
|
108
|
+
### Manipulator Class
|
|
109
|
+
|
|
110
|
+
Static utility methods for HTML manipulation:
|
|
111
|
+
|
|
112
|
+
```javascript
|
|
113
|
+
const { Manipulator } = require('html-fetch-parser');
|
|
114
|
+
|
|
115
|
+
Manipulator.stripTags(html);
|
|
116
|
+
Manipulator.decodeEntities(html);
|
|
117
|
+
Manipulator.extractUrls(html, baseUrl);
|
|
118
|
+
Manipulator.extractEmails(html);
|
|
119
|
+
Manipulator.cleanWhitespace(text);
|
|
120
|
+
Manipulator.truncate(text, length, suffix);
|
|
121
|
+
Manipulator.toAbsoluteUrl(url, baseUrl);
|
|
122
|
+
Manipulator.removeScriptsAndStyles(html);
|
|
123
|
+
Manipulator.wordCount(text);
|
|
124
|
+
Manipulator.sanitizeFilename(filename);
|
|
125
|
+
Manipulator.extractStructuredData(html);
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Examples
|
|
129
|
+
|
|
130
|
+
### Basic Usage
|
|
131
|
+
|
|
132
|
+
```javascript
|
|
133
|
+
const HtmlFetchParser = require('html-fetch-parser');
|
|
134
|
+
|
|
135
|
+
const html = `
|
|
136
|
+
<div>
|
|
137
|
+
<h1>Products</h1>
|
|
138
|
+
<div class="product">
|
|
139
|
+
<h2>Product 1</h2>
|
|
140
|
+
<span class="price">$19.99</span>
|
|
141
|
+
</div>
|
|
142
|
+
<div class="product">
|
|
143
|
+
<h2>Product 2</h2>
|
|
144
|
+
<span class="price">$29.99</span>
|
|
145
|
+
</div>
|
|
146
|
+
</div>
|
|
147
|
+
`;
|
|
148
|
+
|
|
149
|
+
const parser = new HtmlFetchParser();
|
|
150
|
+
parser.load(html);
|
|
151
|
+
|
|
152
|
+
const products = parser.extract({
|
|
153
|
+
title: 'h1',
|
|
154
|
+
products: {
|
|
155
|
+
selector: '.product h2',
|
|
156
|
+
multiple: true
|
|
157
|
+
},
|
|
158
|
+
prices: {
|
|
159
|
+
selector: '.price',
|
|
160
|
+
multiple: true
|
|
161
|
+
}
|
|
162
|
+
});
|
|
163
|
+
|
|
164
|
+
console.log(products);
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
### Fetch Remote HTML
|
|
168
|
+
|
|
169
|
+
```javascript
|
|
170
|
+
const { fetch } = require('html-fetch-parser');
|
|
171
|
+
|
|
172
|
+
async function scrapeWebsite() {
|
|
173
|
+
const parser = await fetch('https://example.com', {
|
|
174
|
+
headers: {
|
|
175
|
+
'User-Agent': 'My Scraper Bot'
|
|
176
|
+
}
|
|
177
|
+
});
|
|
178
|
+
|
|
179
|
+
const data = parser.extract({
|
|
180
|
+
title: 'h1',
|
|
181
|
+
description: 'meta[name="description"]',
|
|
182
|
+
links: {
|
|
183
|
+
selector: 'a',
|
|
184
|
+
attr: 'href',
|
|
185
|
+
multiple: true
|
|
186
|
+
}
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
return data;
|
|
190
|
+
}
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
### Custom Fetcher
|
|
194
|
+
|
|
195
|
+
```javascript
|
|
196
|
+
const { Fetcher } = require('html-fetch-parser');
|
|
197
|
+
|
|
198
|
+
const fetcher = new Fetcher({
|
|
199
|
+
timeout: 5000,
|
|
200
|
+
headers: {
|
|
201
|
+
'User-Agent': 'Custom Bot'
|
|
202
|
+
}
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
const html = await fetcher.get('https://example.com');
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### HTML Manipulation
|
|
209
|
+
|
|
210
|
+
```javascript
|
|
211
|
+
const { Manipulator, load } = require('html-fetch-parser');
|
|
212
|
+
|
|
213
|
+
const html = '<p>Hello & welcome!</p>';
|
|
214
|
+
|
|
215
|
+
const clean = Manipulator.decodeEntities(html);
|
|
216
|
+
const text = Manipulator.stripTags(clean);
|
|
217
|
+
const truncated = Manipulator.truncate(text, 10);
|
|
218
|
+
|
|
219
|
+
console.log(truncated);
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
## Advanced Usage
|
|
223
|
+
|
|
224
|
+
### Chaining Methods
|
|
225
|
+
|
|
226
|
+
```javascript
|
|
227
|
+
const data = await fetch('https://example.com')
|
|
228
|
+
.then(parser => parser.extract({
|
|
229
|
+
title: 'h1',
|
|
230
|
+
content: '.content'
|
|
231
|
+
}));
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
### Error Handling
|
|
235
|
+
|
|
236
|
+
```javascript
|
|
237
|
+
try {
|
|
238
|
+
const parser = await fetch('https://example.com');
|
|
239
|
+
console.log(parser.getTitle());
|
|
240
|
+
} catch (error) {
|
|
241
|
+
console.error('Failed to fetch:', error.message);
|
|
242
|
+
}
|
|
243
|
+
```
|
|
244
|
+
|
|
245
|
+
### Custom Timeout
|
|
246
|
+
|
|
247
|
+
```javascript
|
|
248
|
+
const parser = new HtmlFetchParser({ timeout: 30000 });
|
|
249
|
+
await parser.fetch('https://slow-website.com');
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
## TypeScript
|
|
253
|
+
|
|
254
|
+
Full TypeScript support included:
|
|
255
|
+
|
|
256
|
+
```typescript
|
|
257
|
+
import HtmlFetchParser, { fetch, Manipulator } from 'html-fetch-parser';
|
|
258
|
+
|
|
259
|
+
const parser: HtmlFetchParser = await fetch('https://example.com');
|
|
260
|
+
const title: string = parser.getTitle();
|
|
261
|
+
```
|
|
262
|
+
|
|
263
|
+
## Performance
|
|
264
|
+
|
|
265
|
+
- Lightweight with minimal dependencies
|
|
266
|
+
- Fast HTML parsing using node-html-parser
|
|
267
|
+
- Native fetch API for HTTP requests
|
|
268
|
+
- Memory efficient
|
|
269
|
+
|
|
270
|
+
## License
|
|
271
|
+
|
|
272
|
+
MIT
|
|
273
|
+
|
|
274
|
+
## Contributing
|
|
275
|
+
|
|
276
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
277
|
+
|
|
278
|
+
## Support
|
|
279
|
+
|
|
280
|
+
For issues and questions, please open an issue on GitHub.
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
const HtmlFetchParser = require('../index');
|
|
2
|
+
const { Manipulator } = require('../index');
|
|
3
|
+
|
|
4
|
+
async function advancedScrapingExample() {
|
|
5
|
+
console.log('=== Advanced Web Scraping Example ===\n');
|
|
6
|
+
|
|
7
|
+
const html = `
|
|
8
|
+
<!DOCTYPE html>
|
|
9
|
+
<html lang="en">
|
|
10
|
+
<head>
|
|
11
|
+
<title>E-Commerce Store</title>
|
|
12
|
+
<meta name="description" content="Best products online">
|
|
13
|
+
<script type="application/ld+json">
|
|
14
|
+
{
|
|
15
|
+
"@context": "https://schema.org",
|
|
16
|
+
"@type": "Product",
|
|
17
|
+
"name": "Example Product"
|
|
18
|
+
}
|
|
19
|
+
</script>
|
|
20
|
+
</head>
|
|
21
|
+
<body>
|
|
22
|
+
<header>
|
|
23
|
+
<nav>
|
|
24
|
+
<a href="/home">Home</a>
|
|
25
|
+
<a href="/products">Products</a>
|
|
26
|
+
<a href="/about">About</a>
|
|
27
|
+
</nav>
|
|
28
|
+
</header>
|
|
29
|
+
|
|
30
|
+
<main>
|
|
31
|
+
<h1>Featured Products</h1>
|
|
32
|
+
|
|
33
|
+
<article class="product" data-id="1">
|
|
34
|
+
<h2 class="product-name">Laptop Pro</h2>
|
|
35
|
+
<p class="description">High-performance laptop for professionals</p>
|
|
36
|
+
<span class="price" data-currency="USD">$1,299.99</span>
|
|
37
|
+
<span class="stock">In Stock</span>
|
|
38
|
+
<a href="/products/laptop-pro" class="btn">View Details</a>
|
|
39
|
+
</article>
|
|
40
|
+
|
|
41
|
+
<article class="product" data-id="2">
|
|
42
|
+
<h2 class="product-name">Wireless Mouse</h2>
|
|
43
|
+
<p class="description">Ergonomic wireless mouse with long battery life</p>
|
|
44
|
+
<span class="price" data-currency="USD">$49.99</span>
|
|
45
|
+
<span class="stock">Low Stock</span>
|
|
46
|
+
<a href="/products/wireless-mouse" class="btn">View Details</a>
|
|
47
|
+
</article>
|
|
48
|
+
|
|
49
|
+
<article class="product" data-id="3">
|
|
50
|
+
<h2 class="product-name">USB-C Cable</h2>
|
|
51
|
+
<p class="description">Fast charging USB-C cable, 6ft length</p>
|
|
52
|
+
<span class="price" data-currency="USD">$12.99</span>
|
|
53
|
+
<span class="stock">In Stock</span>
|
|
54
|
+
<a href="/products/usb-c-cable" class="btn">View Details</a>
|
|
55
|
+
</article>
|
|
56
|
+
</main>
|
|
57
|
+
|
|
58
|
+
<footer>
|
|
59
|
+
<p>Contact: sales@example.com | support@example.com</p>
|
|
60
|
+
<p>Visit our blog: https://blog.example.com</p>
|
|
61
|
+
</footer>
|
|
62
|
+
</body>
|
|
63
|
+
</html>
|
|
64
|
+
`;
|
|
65
|
+
|
|
66
|
+
const parser = new HtmlFetchParser();
|
|
67
|
+
parser.load(html);
|
|
68
|
+
|
|
69
|
+
console.log('=== Page Metadata ===');
|
|
70
|
+
console.log('Title:', parser.getTitle());
|
|
71
|
+
console.log('Description:', parser.getMeta('description'));
|
|
72
|
+
console.log();
|
|
73
|
+
|
|
74
|
+
console.log('=== Navigation Links ===');
|
|
75
|
+
const navLinks = parser.extract({
|
|
76
|
+
links: {
|
|
77
|
+
selector: 'nav a',
|
|
78
|
+
attr: 'href',
|
|
79
|
+
multiple: true
|
|
80
|
+
}
|
|
81
|
+
});
|
|
82
|
+
console.log(navLinks);
|
|
83
|
+
console.log();
|
|
84
|
+
|
|
85
|
+
console.log('=== Extract All Products ===');
|
|
86
|
+
const products = [];
|
|
87
|
+
const productElements = parser.$$('.product');
|
|
88
|
+
|
|
89
|
+
productElements.forEach(productEl => {
|
|
90
|
+
const tempParser = new HtmlFetchParser();
|
|
91
|
+
tempParser.load(productEl.outerHTML);
|
|
92
|
+
|
|
93
|
+
products.push({
|
|
94
|
+
id: productEl.getAttribute('data-id'),
|
|
95
|
+
name: tempParser.text('.product-name'),
|
|
96
|
+
description: tempParser.text('.description'),
|
|
97
|
+
price: tempParser.text('.price'),
|
|
98
|
+
priceNumeric: parseFloat(tempParser.text('.price').replace(/[^0-9.]/g, '')),
|
|
99
|
+
stock: tempParser.text('.stock'),
|
|
100
|
+
url: tempParser.attr('.btn', 'href')
|
|
101
|
+
});
|
|
102
|
+
});
|
|
103
|
+
|
|
104
|
+
console.log(JSON.stringify(products, null, 2));
|
|
105
|
+
console.log();
|
|
106
|
+
|
|
107
|
+
console.log('=== Statistics ===');
|
|
108
|
+
const totalProducts = products.length;
|
|
109
|
+
const inStockProducts = products.filter(p => p.stock === 'In Stock').length;
|
|
110
|
+
const avgPrice = products.reduce((sum, p) => sum + p.priceNumeric, 0) / totalProducts;
|
|
111
|
+
|
|
112
|
+
console.log(`Total Products: ${totalProducts}`);
|
|
113
|
+
console.log(`In Stock: ${inStockProducts}`);
|
|
114
|
+
console.log(`Average Price: $${avgPrice.toFixed(2)}`);
|
|
115
|
+
console.log();
|
|
116
|
+
|
|
117
|
+
console.log('=== Extract Contact Information ===');
|
|
118
|
+
const footerText = parser.text('footer');
|
|
119
|
+
const emails = Manipulator.extractEmails(footerText);
|
|
120
|
+
const urls = Manipulator.extractUrls(footerText);
|
|
121
|
+
|
|
122
|
+
console.log('Email addresses:', emails);
|
|
123
|
+
console.log('URLs:', urls);
|
|
124
|
+
console.log();
|
|
125
|
+
|
|
126
|
+
console.log('=== Structured Data ===');
|
|
127
|
+
const structuredData = Manipulator.extractStructuredData(parser.getRawHtml());
|
|
128
|
+
console.log(JSON.stringify(structuredData, null, 2));
|
|
129
|
+
console.log();
|
|
130
|
+
|
|
131
|
+
console.log('=== Text Analysis ===');
|
|
132
|
+
const mainContent = parser.text('main');
|
|
133
|
+
const cleanText = Manipulator.cleanWhitespace(mainContent);
|
|
134
|
+
const wordCount = Manipulator.wordCount(cleanText);
|
|
135
|
+
const preview = Manipulator.truncate(cleanText, 100);
|
|
136
|
+
|
|
137
|
+
console.log(`Word Count: ${wordCount}`);
|
|
138
|
+
console.log(`Preview: ${preview}`);
|
|
139
|
+
console.log();
|
|
140
|
+
|
|
141
|
+
console.log('=== Convert URLs to Absolute ===');
|
|
142
|
+
const baseUrl = 'https://example.com';
|
|
143
|
+
const absoluteUrls = products.map(p => ({
|
|
144
|
+
name: p.name,
|
|
145
|
+
url: Manipulator.toAbsoluteUrl(p.url, baseUrl)
|
|
146
|
+
}));
|
|
147
|
+
console.log(absoluteUrls);
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
advancedScrapingExample().catch(console.error);
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
const HtmlFetchParser = require('../index');
|
|
2
|
+
|
|
3
|
+
async function basicExample() {
|
|
4
|
+
console.log('=== HTML Fetch Parser - Basic Usage ===\n');
|
|
5
|
+
|
|
6
|
+
const html = `
|
|
7
|
+
<!DOCTYPE html>
|
|
8
|
+
<html>
|
|
9
|
+
<head>
|
|
10
|
+
<title>Sample Page</title>
|
|
11
|
+
<meta name="description" content="This is a sample page">
|
|
12
|
+
</head>
|
|
13
|
+
<body>
|
|
14
|
+
<h1>Welcome to HTML Fetch Parser</h1>
|
|
15
|
+
<p class="intro">A lightweight library for HTML manipulation</p>
|
|
16
|
+
<div class="content">
|
|
17
|
+
<h2>Features</h2>
|
|
18
|
+
<ul>
|
|
19
|
+
<li>Easy HTML fetching</li>
|
|
20
|
+
<li>Powerful parsing</li>
|
|
21
|
+
<li>Simple manipulation</li>
|
|
22
|
+
</ul>
|
|
23
|
+
</div>
|
|
24
|
+
<a href="/docs" title="Documentation">Read Docs</a>
|
|
25
|
+
<img src="/logo.png" alt="Logo">
|
|
26
|
+
</body>
|
|
27
|
+
</html>
|
|
28
|
+
`;
|
|
29
|
+
|
|
30
|
+
const parser = new HtmlFetchParser();
|
|
31
|
+
parser.load(html);
|
|
32
|
+
|
|
33
|
+
console.log('Title:', parser.getTitle());
|
|
34
|
+
console.log('Meta description:', parser.getMeta('description'));
|
|
35
|
+
console.log('H1 text:', parser.text('h1'));
|
|
36
|
+
console.log('Intro text:', parser.text('.intro'));
|
|
37
|
+
console.log('All list items:', parser.textAll('li'));
|
|
38
|
+
console.log('Links:', parser.getLinks());
|
|
39
|
+
console.log('Images:', parser.getImages());
|
|
40
|
+
|
|
41
|
+
console.log('\n=== Extract with Schema ===');
|
|
42
|
+
const data = parser.extract({
|
|
43
|
+
title: 'h1',
|
|
44
|
+
intro: '.intro',
|
|
45
|
+
features: {
|
|
46
|
+
selector: 'li',
|
|
47
|
+
multiple: true
|
|
48
|
+
},
|
|
49
|
+
linkHref: {
|
|
50
|
+
selector: 'a',
|
|
51
|
+
attr: 'href'
|
|
52
|
+
}
|
|
53
|
+
});
|
|
54
|
+
console.log(data);
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
basicExample().catch(console.error);
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
const { fetch } = require('../index');
|
|
2
|
+
|
|
3
|
+
async function fetchExample() {
|
|
4
|
+
console.log('=== Fetch Remote HTML Example ===\n');
|
|
5
|
+
|
|
6
|
+
try {
|
|
7
|
+
const parser = await fetch('https://freepublicapisss.vercel.app/');
|
|
8
|
+
|
|
9
|
+
console.log('Title:', parser.getTitle());
|
|
10
|
+
console.log('First paragraph:', parser.text('p'));
|
|
11
|
+
console.log('All links:', parser.getLinks());
|
|
12
|
+
|
|
13
|
+
const data = parser.extract({
|
|
14
|
+
title: 'h1',
|
|
15
|
+
paragraphs: {
|
|
16
|
+
selector: 'p',
|
|
17
|
+
multiple: true
|
|
18
|
+
}
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
console.log('\nExtracted data:', data);
|
|
22
|
+
} catch (error) {
|
|
23
|
+
console.error('Error:', error.message);
|
|
24
|
+
}
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
fetchExample();
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
const { Manipulator } = require('../index');
|
|
2
|
+
|
|
3
|
+
console.log('=== Manipulator Example ===\n');
|
|
4
|
+
|
|
5
|
+
const html = `
|
|
6
|
+
<div>
|
|
7
|
+
<h1>Hello & Welcome!</h1>
|
|
8
|
+
<p>Contact us at: info@example.com or support@test.com</p>
|
|
9
|
+
<script>alert('test');</script>
|
|
10
|
+
<p>Visit https://example.com for more info</p>
|
|
11
|
+
</div>
|
|
12
|
+
`;
|
|
13
|
+
|
|
14
|
+
console.log('Original HTML:');
|
|
15
|
+
console.log(html);
|
|
16
|
+
|
|
17
|
+
console.log('\nStrip tags:');
|
|
18
|
+
console.log(Manipulator.stripTags(html));
|
|
19
|
+
|
|
20
|
+
console.log('\nDecode entities:');
|
|
21
|
+
console.log(Manipulator.decodeEntities('Hello & Welcome!'));
|
|
22
|
+
|
|
23
|
+
console.log('\nExtract emails:');
|
|
24
|
+
console.log(Manipulator.extractEmails(html));
|
|
25
|
+
|
|
26
|
+
console.log('\nExtract URLs:');
|
|
27
|
+
console.log(Manipulator.extractUrls(html));
|
|
28
|
+
|
|
29
|
+
console.log('\nRemove scripts:');
|
|
30
|
+
console.log(Manipulator.removeScriptsAndStyles(html));
|
|
31
|
+
|
|
32
|
+
console.log('\nTruncate text:');
|
|
33
|
+
console.log(Manipulator.truncate('This is a very long text that needs truncating', 20));
|
|
34
|
+
|
|
35
|
+
console.log('\nWord count:');
|
|
36
|
+
console.log(Manipulator.wordCount('Hello world this is a test'));
|
|
37
|
+
|
|
38
|
+
console.log('\nConvert to absolute URL:');
|
|
39
|
+
console.log(Manipulator.toAbsoluteUrl('/path/to/page', 'https://example.com'));
|
package/index.d.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
declare module 'html-fetch-parser' {
|
|
2
|
+
export interface FetcherOptions {
|
|
3
|
+
headers?: Record<string, string>;
|
|
4
|
+
timeout?: number;
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
export interface FetchOptions extends RequestInit {
|
|
8
|
+
headers?: Record<string, string>;
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export interface ExtractConfig {
|
|
12
|
+
selector: string;
|
|
13
|
+
attr?: string;
|
|
14
|
+
multiple?: boolean;
|
|
15
|
+
transform?: (value: any) => any;
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
export interface ExtractSchema {
|
|
19
|
+
[key: string]: string | ExtractConfig;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export interface LinkObject {
|
|
23
|
+
text: string;
|
|
24
|
+
href: string | null;
|
|
25
|
+
title: string | null;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
export interface ImageObject {
|
|
29
|
+
src: string | null;
|
|
30
|
+
alt: string | null;
|
|
31
|
+
title: string | null;
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
export class Fetcher {
|
|
35
|
+
constructor(options?: FetcherOptions);
|
|
36
|
+
get(url: string, options?: FetchOptions): Promise<string>;
|
|
37
|
+
post(url: string, data?: any, options?: FetchOptions): Promise<string>;
|
|
38
|
+
setHeaders(headers: Record<string, string>): void;
|
|
39
|
+
setTimeout(ms: number): void;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export class Parser {
|
|
43
|
+
constructor(html?: string);
|
|
44
|
+
load(html: string): Parser;
|
|
45
|
+
querySelector(selector: string): any;
|
|
46
|
+
querySelectorAll(selector: string): any[];
|
|
47
|
+
text(selector: string): string;
|
|
48
|
+
textAll(selector: string): string[];
|
|
49
|
+
attr(selector: string, attr: string): string | null;
|
|
50
|
+
attrAll(selector: string, attr: string): string[];
|
|
51
|
+
html(selector: string): string;
|
|
52
|
+
outerHtml(selector: string): string;
|
|
53
|
+
getTitle(): string;
|
|
54
|
+
getMeta(name: string): string;
|
|
55
|
+
getLinks(): LinkObject[];
|
|
56
|
+
getImages(): ImageObject[];
|
|
57
|
+
extract(schema: ExtractSchema): Record<string, any>;
|
|
58
|
+
getRawHtml(): string;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
export class Manipulator {
|
|
62
|
+
static stripTags(html: string): string;
|
|
63
|
+
static decodeEntities(html: string): string;
|
|
64
|
+
static extractUrls(html: string, baseUrl?: string): string[];
|
|
65
|
+
static cleanWhitespace(text: string): string;
|
|
66
|
+
static extractEmails(html: string): string[];
|
|
67
|
+
static truncate(text: string, length: number, suffix?: string): string;
|
|
68
|
+
static toAbsoluteUrl(url: string, baseUrl: string): string;
|
|
69
|
+
static extractStructuredData(html: string): object[];
|
|
70
|
+
static removeScriptsAndStyles(html: string): string;
|
|
71
|
+
static wordCount(text: string): number;
|
|
72
|
+
static sanitizeFilename(filename: string): string;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
export default class HtmlFetchParser {
|
|
76
|
+
constructor(options?: FetcherOptions);
|
|
77
|
+
fetch(url: string, options?: FetchOptions): Promise<HtmlFetchParser>;
|
|
78
|
+
post(url: string, data?: any, options?: FetchOptions): Promise<HtmlFetchParser>;
|
|
79
|
+
load(html: string): HtmlFetchParser;
|
|
80
|
+
$(selector: string): any;
|
|
81
|
+
$$(selector: string): any[];
|
|
82
|
+
text(selector: string): string;
|
|
83
|
+
textAll(selector: string): string[];
|
|
84
|
+
attr(selector: string, attr: string): string | null;
|
|
85
|
+
attrAll(selector: string, attr: string): string[];
|
|
86
|
+
html(selector: string): string;
|
|
87
|
+
extract(schema: ExtractSchema): Record<string, any>;
|
|
88
|
+
getTitle(): string;
|
|
89
|
+
getMeta(name: string): string;
|
|
90
|
+
getLinks(): LinkObject[];
|
|
91
|
+
getImages(): ImageObject[];
|
|
92
|
+
getRawHtml(): string;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
export function fetch(url: string, options?: FetcherOptions): Promise<HtmlFetchParser>;
|
|
96
|
+
export function load(html: string): HtmlFetchParser;
|
|
97
|
+
}
|
package/index.js
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
const Fetcher = require('./lib/fetcher');
const Parser = require('./lib/parser');
const Manipulator = require('./lib/manipulator');

/**
 * Facade combining the Fetcher (HTTP) and Parser (HTML querying) modules.
 *
 * All querying methods are thin delegations to the underlying Parser; the
 * fetch/post/load methods return `this` so calls can be chained, e.g.
 * `(await new HtmlFetchParser().fetch(url)).getTitle()`.
 */
class HtmlFetchParser {
  /**
   * @param {object} [options] - Fetcher options.
   * @param {Record<string, string>} [options.headers] - Default HTTP headers.
   * @param {number} [options.timeout] - Request timeout in milliseconds.
   */
  constructor(options = {}) {
    this.fetcher = new Fetcher(options);
    this.parser = new Parser();
  }

  /**
   * Fetch HTML from a URL (GET) and load it into the parser.
   * @param {string} url - URL to fetch.
   * @param {object} [options] - Per-request fetch options.
   * @returns {Promise<HtmlFetchParser>} this, for chaining.
   */
  async fetch(url, options = {}) {
    const html = await this.fetcher.get(url, options);
    this.parser.load(html);
    return this;
  }

  /**
   * POST data to a URL and load the response body into the parser.
   * @param {string} url - URL to post to.
   * @param {object} [data] - JSON-serializable request body.
   * @param {object} [options] - Per-request fetch options.
   * @returns {Promise<HtmlFetchParser>} this, for chaining.
   */
  async post(url, data = {}, options = {}) {
    const html = await this.fetcher.post(url, data, options);
    this.parser.load(html);
    return this;
  }

  /**
   * Load an HTML string directly (no network).
   * @param {string} html - Raw HTML markup.
   * @returns {HtmlFetchParser} this, for chaining.
   */
  load(html) {
    this.parser.load(html);
    return this;
  }

  /** Single-element query (alias for Parser#querySelector). */
  $(selector) {
    return this.parser.querySelector(selector);
  }

  /** Multi-element query (alias for Parser#querySelectorAll). */
  $$(selector) {
    return this.parser.querySelectorAll(selector);
  }

  /** Text content of the first element matching `selector`. */
  text(selector) {
    return this.parser.text(selector);
  }

  /** Text contents of all elements matching `selector`. */
  textAll(selector) {
    return this.parser.textAll(selector);
  }

  /** Attribute value of the first element matching `selector`. */
  attr(selector, attr) {
    return this.parser.attr(selector, attr);
  }

  /** Attribute values of all elements matching `selector`. */
  attrAll(selector, attr) {
    return this.parser.attrAll(selector, attr);
  }

  /** Inner HTML of the first element matching `selector`. */
  html(selector) {
    return this.parser.html(selector);
  }

  /** Extract structured data via a schema (see Parser#extract). */
  extract(schema) {
    return this.parser.extract(schema);
  }

  /** Page <title> text. */
  getTitle() {
    return this.parser.getTitle();
  }

  /** Content of the <meta name="..."> tag with the given name. */
  getMeta(name) {
    return this.parser.getMeta(name);
  }

  /** All anchor elements as {text, href, title} objects. */
  getLinks() {
    return this.parser.getLinks();
  }

  /** All image elements as {src, alt, title} objects. */
  getImages() {
    return this.parser.getImages();
  }

  /** The raw HTML string currently loaded. */
  getRawHtml() {
    return this.parser.getRawHtml();
  }
}

module.exports = HtmlFetchParser;
module.exports.Fetcher = Fetcher;
module.exports.Parser = Parser;
module.exports.Manipulator = Manipulator;
module.exports.default = HtmlFetchParser;

/**
 * Convenience one-shot fetch: creates an instance and fetches `url`.
 *
 * Fix: `options` is now forwarded to the request as well as the constructor.
 * Previously only the constructor received it, so per-call fetch options
 * (e.g. extra RequestInit fields) were silently dropped.
 */
module.exports.fetch = async (url, options = {}) => {
  const instance = new HtmlFetchParser(options);
  return instance.fetch(url, options);
};

/** Convenience one-shot load: creates an instance preloaded with `html`. */
module.exports.load = (html) => {
  const instance = new HtmlFetchParser();
  return instance.load(html);
};
|
package/lib/fetcher.js
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
 * HTTP Fetcher Module
 * Lightweight HTTP client for fetching HTML content.
 *
 * Uses the global fetch API with an AbortController-based timeout.
 */

class Fetcher {
  /**
   * @param {object} [options]
   * @param {Record<string, string>} [options.headers] - Default headers merged into every request.
   * @param {number} [options.timeout=10000] - Per-request timeout in milliseconds.
   */
  constructor(options = {}) {
    this.defaultHeaders = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      ...options.headers
    };
    this.timeout = options.timeout || 10000;
  }

  /**
   * Shared request core for get()/post(): applies the timeout via
   * AbortController, checks the HTTP status, and returns the body text.
   *
   * Note: `init.signal` is always the controller's signal — caller-supplied
   * options must not clobber it, or the timeout silently stops working
   * (this was a bug in the original, which spread `...options` last).
   *
   * @param {string} url - Target URL.
   * @param {object} init - Fully-assembled fetch init (method/headers/body).
   * @returns {Promise<string>} Response body text.
   * @throws {Error} On non-2xx status or timeout.
   */
  async #request(url, init) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      const response = await fetch(url, { ...init, signal: controller.signal });

      if (!response.ok) {
        throw new Error(`HTTP Error: ${response.status} ${response.statusText}`);
      }

      return await response.text();
    } catch (error) {
      if (error.name === 'AbortError') {
        throw new Error(`Request timeout after ${this.timeout}ms`);
      }
      throw error;
    } finally {
      // Always release the timer, on success and on every error path.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Fetch HTML content from a URL.
   * @param {string} url - URL to fetch.
   * @param {object} [options] - Extra fetch options; `options.headers` is
   *   merged over the instance defaults (other keys are passed through, but
   *   can no longer override `method`, merged `headers`, or the timeout signal).
   * @returns {Promise<string>} HTML content.
   */
  async get(url, options = {}) {
    const { headers, ...rest } = options;
    return this.#request(url, {
      ...rest,
      method: 'GET',
      headers: { ...this.defaultHeaders, ...headers }
    });
  }

  /**
   * POST JSON data to a URL.
   * @param {string} url - URL to post to.
   * @param {object} [data] - JSON-serializable payload.
   * @param {object} [options] - Extra fetch options; `options.headers` is
   *   merged over 'Content-Type: application/json' and the instance defaults.
   * @returns {Promise<string>} Response content.
   */
  async post(url, data = {}, options = {}) {
    const { headers, ...rest } = options;
    return this.#request(url, {
      ...rest,
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        ...this.defaultHeaders,
        ...headers
      },
      body: JSON.stringify(data)
    });
  }

  /**
   * Merge additional default headers into the instance defaults.
   * @param {Record<string, string>} headers - Headers object.
   */
  setHeaders(headers) {
    this.defaultHeaders = { ...this.defaultHeaders, ...headers };
  }

  /**
   * Set the per-request timeout.
   * @param {number} ms - Timeout in milliseconds.
   */
  setTimeout(ms) {
    this.timeout = ms;
  }
}
|
|
105
|
+
|
|
106
|
+
module.exports = Fetcher;
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML Manipulator Module
|
|
3
|
+
* Utilities for HTML manipulation and transformation
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
class Manipulator {
  /**
   * Remove HTML tags from text.
   * @param {string} html - HTML string
   * @returns {string} Plain text
   */
  static stripTags(html) {
    return html.replace(/<[^>]*>/g, '');
  }

  /**
   * Decode common HTML entities.
   *
   * Note: the lookup keys must be the ESCAPED entity forms so they can
   * actually match the regex below (the previous map used the decoded
   * characters as keys, making this method a no-op).
   *
   * @param {string} html - HTML string with entities
   * @returns {string} Decoded string
   */
  static decodeEntities(html) {
    const entities = {
      '&amp;': '&',
      '&lt;': '<',
      '&gt;': '>',
      '&quot;': '"',
      '&#39;': "'",
      '&apos;': "'",
      '&nbsp;': ' '
    };
    // Unknown entities are left untouched.
    return html.replace(/&[#\w]+;/g, entity => entities[entity] || entity);
  }

  /**
   * Extract absolute URLs from HTML.
   * @param {string} html - HTML string
   * @param {string} baseUrl - Base URL for relative links
   *   (NOTE(review): currently unused — only absolute http(s) URLs are
   *   matched; kept for interface compatibility)
   * @returns {Array<string>} Array of unique URLs
   */
  static extractUrls(html, baseUrl = '') {
    const urlRegex = /https?:\/\/[^\s<>"]+/g;
    const urls = html.match(urlRegex) || [];
    return [...new Set(urls)];
  }

  /**
   * Collapse runs of whitespace and trim.
   * @param {string} text - Text to clean
   * @returns {string} Cleaned text
   */
  static cleanWhitespace(text) {
    return text
      .replace(/\s+/g, ' ')
      .replace(/\n\s*\n/g, '\n')
      .trim();
  }

  /**
   * Extract email addresses from HTML.
   * @param {string} html - HTML string
   * @returns {Array<string>} Array of unique email addresses
   */
  static extractEmails(html) {
    const emailRegex = /[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+/g;
    const emails = html.match(emailRegex) || [];
    return [...new Set(emails)];
  }

  /**
   * Truncate text to at most `length` characters, appending a suffix.
   * @param {string} text - Text to truncate
   * @param {number} length - Max length of the result
   * @param {string} suffix - Suffix (default: '...')
   * @returns {string} Truncated text
   */
  static truncate(text, length, suffix = '...') {
    if (text.length <= length) return text;
    // Guard: when the budget is smaller than the suffix, a negative
    // substring index would have produced a result LONGER than `length`.
    if (length <= suffix.length) return text.substring(0, length);
    return text.substring(0, length - suffix.length) + suffix;
  }

  /**
   * Convert relative URL to absolute.
   * @param {string} url - Relative URL
   * @param {string} baseUrl - Base URL
   * @returns {string} Absolute URL (or the input unchanged if resolution fails)
   */
  static toAbsoluteUrl(url, baseUrl) {
    if (!url) return '';
    if (url.startsWith('http://') || url.startsWith('https://')) return url;

    try {
      return new URL(url, baseUrl).href;
    } catch {
      // Invalid base or URL: return the original, best-effort behavior.
      return url;
    }
  }

  /**
   * Extract structured data (JSON-LD) from <script type="application/ld+json"> blocks.
   * @param {string} html - HTML string
   * @returns {Array<object>} Array of successfully-parsed structured data objects
   */
  static extractStructuredData(html) {
    const jsonLdRegex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>(.*?)<\/script>/gis;
    const matches = [...html.matchAll(jsonLdRegex)];

    return matches
      .map(match => {
        try {
          return JSON.parse(match[1]);
        } catch {
          // Malformed JSON-LD blocks are skipped, not fatal.
          return null;
        }
      })
      .filter(Boolean);
  }

  /**
   * Remove <script> and <style> elements (including their contents).
   * @param {string} html - HTML string
   * @returns {string} Cleaned HTML
   */
  static removeScriptsAndStyles(html) {
    return html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
  }

  /**
   * Get word count (whitespace-delimited tokens).
   * @param {string} text - Text to count
   * @returns {number} Word count
   */
  static wordCount(text) {
    return text.trim().split(/\s+/).filter(Boolean).length;
  }

  /**
   * Sanitize filename: replace unsafe characters with '_' and lowercase.
   * @param {string} filename - Filename to sanitize
   * @returns {string} Safe filename
   */
  static sanitizeFilename(filename) {
    return filename
      .replace(/[^a-z0-9.-]/gi, '_')
      .replace(/_+/g, '_')
      .toLowerCase();
  }
}
|
|
150
|
+
|
|
151
|
+
module.exports = Manipulator;
|
package/lib/parser.js
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML Parser Module
|
|
3
|
+
* Parse and query HTML content
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
const { parse } = require('node-html-parser');
|
|
7
|
+
|
|
8
|
+
class Parser {
  /**
   * DOM-style wrapper around node-html-parser for querying HTML content.
   * @param {string} [html] - Optional HTML to load immediately.
   */
  constructor(html = '') {
    this.rawHtml = html;
    // No root until some HTML is loaded; query methods degrade gracefully.
    this.root = html ? parse(html) : null;
  }

  /**
   * Load HTML content, replacing whatever was loaded before.
   * @param {string} html - HTML string
   * @returns {Parser} this, for chaining
   */
  load(html) {
    this.rawHtml = html;
    this.root = parse(html);
    return this;
  }

  /**
   * Find the first element matching a CSS selector.
   * @param {string} selector - CSS selector
   * @returns {object|null} Element, or null when nothing is loaded / no match
   */
  querySelector(selector) {
    return this.root ? this.root.querySelector(selector) : null;
  }

  /**
   * Find all elements matching a CSS selector.
   * @param {string} selector - CSS selector
   * @returns {Array} Matching elements (empty when nothing is loaded)
   */
  querySelectorAll(selector) {
    return this.root ? this.root.querySelectorAll(selector) : [];
  }

  /**
   * Get trimmed text content of the first matching element.
   * @param {string} selector - CSS selector
   * @returns {string} Text content, or '' when no match
   */
  text(selector) {
    const node = this.querySelector(selector);
    if (!node) return '';
    return node.text.trim();
  }

  /**
   * Get trimmed, non-empty text of every matching element.
   * @param {string} selector - CSS selector
   * @returns {Array<string>} Array of text content
   */
  textAll(selector) {
    return this.querySelectorAll(selector)
      .map((node) => node.text.trim())
      .filter((value) => value);
  }

  /**
   * Get an attribute of the first matching element.
   * @param {string} selector - CSS selector
   * @param {string} attr - Attribute name
   * @returns {string|null} Attribute value, or null when no match
   */
  attr(selector, attr) {
    const node = this.querySelector(selector);
    if (!node) return null;
    return node.getAttribute(attr);
  }

  /**
   * Get a given attribute from every matching element, skipping empty values.
   * @param {string} selector - CSS selector
   * @param {string} attr - Attribute name
   * @returns {Array<string>} Array of attribute values
   */
  attrAll(selector, attr) {
    return this.querySelectorAll(selector)
      .map((node) => node.getAttribute(attr))
      .filter((value) => value);
  }

  /**
   * Get inner HTML of the first matching element.
   * @param {string} selector - CSS selector
   * @returns {string} HTML content, or '' when no match
   */
  html(selector) {
    const node = this.querySelector(selector);
    if (!node) return '';
    return node.innerHTML;
  }

  /**
   * Get outer HTML of the first matching element.
   * @param {string} selector - CSS selector
   * @returns {string} Outer HTML, or '' when no match
   */
  outerHtml(selector) {
    const node = this.querySelector(selector);
    if (!node) return '';
    return node.outerHTML;
  }

  /**
   * Get page title.
   * @returns {string} Page title
   */
  getTitle() {
    return this.text('title');
  }

  /**
   * Get a meta tag's content, matching either name= or property=.
   * @param {string} name - Meta name or property
   * @returns {string} Meta content, or '' when not found
   */
  getMeta(name) {
    const node = this.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
    if (!node) return '';
    return node.getAttribute('content');
  }

  /**
   * Get all anchor links.
   * @returns {Array<object>} Array of { text, href, title } objects
   */
  getLinks() {
    return this.querySelectorAll('a').map((anchor) => ({
      text: anchor.text.trim(),
      href: anchor.getAttribute('href'),
      title: anchor.getAttribute('title')
    }));
  }

  /**
   * Get all images.
   * @returns {Array<object>} Array of { src, alt, title } objects
   */
  getImages() {
    return this.querySelectorAll('img').map((image) => ({
      src: image.getAttribute('src'),
      alt: image.getAttribute('alt'),
      title: image.getAttribute('title')
    }));
  }

  /**
   * Extract data using a custom mapping. Each schema value is either a
   * selector string (extracts text) or a config object:
   * { selector, attr?, multiple?, transform? }.
   * @param {object} schema - Extraction schema
   * @returns {object} Extracted data keyed like the schema
   */
  extract(schema) {
    const extracted = {};

    for (const [field, spec] of Object.entries(schema)) {
      // Shorthand: a bare string means "text of this selector".
      if (typeof spec === 'string') {
        extracted[field] = this.text(spec);
        continue;
      }
      if (typeof spec !== 'object') continue;

      const { selector, attr, multiple, transform } = spec;
      let value;
      if (multiple) {
        value = attr ? this.attrAll(selector, attr) : this.textAll(selector);
      } else {
        value = attr ? this.attr(selector, attr) : this.text(selector);
      }

      // Optional post-processing hook.
      if (typeof transform === 'function') {
        value = transform(value);
      }
      extracted[field] = value;
    }

    return extracted;
  }

  /**
   * Get raw HTML as originally loaded.
   * @returns {string} Raw HTML
   */
  getRawHtml() {
    return this.rawHtml;
  }
}
|
|
195
|
+
|
|
196
|
+
module.exports = Parser;
|
package/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "html-fetch-parser",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Lightweight HTML fetching and parsing library - combines fetch, parsing, and manipulation in one simple package",
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"types": "index.d.ts",
|
|
7
|
+
"keywords": [
|
|
8
|
+
"html",
|
|
9
|
+
"parser",
|
|
10
|
+
"fetch",
|
|
11
|
+
"scraper",
|
|
12
|
+
"cheerio",
|
|
13
|
+
"axios",
|
|
14
|
+
"dom",
|
|
15
|
+
"manipulation"
|
|
16
|
+
],
|
|
17
|
+
"author": "KazeDevID",
|
|
18
|
+
"license": "MIT",
|
|
19
|
+
"dependencies": {
|
|
20
|
+
"node-html-parser": "^6.1.12"
|
|
21
|
+
},
|
|
22
|
+
"devDependencies": {},
|
|
23
|
+
"repository": {
|
|
24
|
+
"type": "git",
|
|
25
|
+
"url": "https://github.com/KazeDevID/html-fetch-parser"
|
|
26
|
+
},
|
|
27
|
+
"scripts": {
|
|
28
|
+
"test": "node examples/basic-usage.js"
|
|
29
|
+
}
|
|
30
|
+
}
|