html-fetch-parser 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
name: Auto Publish to NPM

on:
  push:
    paths:
      - "package.json"
      - ".github/workflows/publish.yml"
    branches:
      - main

jobs:
  publish:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: "18"
          registry-url: "https://registry.npmjs.org"

      # BUG FIX: previously this step only echoed the version and gated
      # nothing, so pushing package.json without bumping the version made
      # `npm publish` fail the whole workflow. Now we check the registry
      # and skip the publish/release steps when the version already exists.
      - name: Check version update
        id: version-check
        run: |
          CURRENT_VERSION=$(node -pe "require('./package.json').version")
          echo "CURRENT_VERSION=$CURRENT_VERSION" >> "$GITHUB_OUTPUT"
          if npm view "html-fetch-parser@$CURRENT_VERSION" version > /dev/null 2>&1; then
            echo "ALREADY_PUBLISHED=true" >> "$GITHUB_OUTPUT"
          else
            echo "ALREADY_PUBLISHED=false" >> "$GITHUB_OUTPUT"
          fi
          echo "📦 Current Version: $CURRENT_VERSION"

      - name: Publish to NPM
        if: steps.version-check.outputs.ALREADY_PUBLISHED == 'false'
        run: npm publish
        env:
          NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }}

      # NOTE(review): actions/create-release@v1 is archived/unmaintained;
      # consider migrating to softprops/action-gh-release.
      - name: Create GitHub Release
        if: steps.version-check.outputs.ALREADY_PUBLISHED == 'false'
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
        with:
          tag_name: v${{ steps.version-check.outputs.CURRENT_VERSION }}
          release_name: Release v${{ steps.version-check.outputs.CURRENT_VERSION }}
          body: |
            # 🚀 New Release v${{ steps.version-check.outputs.CURRENT_VERSION }}
          draft: false
          prerelease: false

      - name: Success notification
        if: steps.version-check.outputs.ALREADY_PUBLISHED == 'false'
        run: echo "✅ Published html-fetch-parser@${{ steps.version-check.outputs.CURRENT_VERSION }} to NPM"
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kaze
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,280 @@
1
+ # HTML Fetch Parser
2
+
3
+ Lightweight and powerful HTML fetching, parsing, and manipulation library for Node.js. Combines the best features of fetch, axios, and cheerio in one simple package.
4
+
5
+ ## Features
6
+
7
+ - **Easy HTML Fetching** - Built-in HTTP client with timeout support
8
+ - **Powerful Parsing** - CSS selector-based HTML parsing
9
+ - **Simple API** - Intuitive chainable methods
10
+ - **Zero Heavy Dependencies** - Uses lightweight `node-html-parser`
11
+ - **TypeScript Support** - Full TypeScript definitions included
12
+ - **Utility Functions** - HTML manipulation helpers built-in
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ npm install html-fetch-parser
18
+ ```
19
+
20
+ ## Quick Start
21
+
22
+ ### Fetch and Parse Remote HTML
23
+
24
+ ```javascript
25
+ const { fetch } = require('html-fetch-parser');
26
+
27
+ const parser = await fetch('https://example.com');
28
+ console.log(parser.getTitle());
29
+ console.log(parser.text('h1'));
30
+ console.log(parser.getLinks());
31
+ ```
32
+
33
+ ### Load and Parse Local HTML
34
+
35
+ ```javascript
36
+ const HtmlFetchParser = require('html-fetch-parser');
37
+
38
+ const html = '<h1>Hello World</h1><p>Welcome</p>';
39
+ const parser = new HtmlFetchParser();
40
+ parser.load(html);
41
+
42
+ console.log(parser.text('h1'));
43
+ ```
44
+
45
+ ## API Reference
46
+
47
+ ### Main Class
48
+
49
+ #### `new HtmlFetchParser(options)`
50
+
51
+ Create a new instance.
52
+
53
+ **Options:**
54
+ - `headers` - Default HTTP headers
55
+ - `timeout` - Request timeout in milliseconds (default: 10000)
56
+
57
+ #### Methods
58
+
59
+ **Fetching:**
60
+ - `fetch(url, options)` - Fetch HTML from URL
61
+ - `post(url, data, options)` - POST request
62
+ - `load(html)` - Load HTML string
63
+
64
+ **Querying:**
65
+ - `$(selector)` - Get single element (alias for querySelector)
66
+ - `$$(selector)` - Get all elements (alias for querySelectorAll)
67
+ - `text(selector)` - Get text content
68
+ - `textAll(selector)` - Get all text contents
69
+ - `attr(selector, attr)` - Get attribute value
70
+ - `attrAll(selector, attr)` - Get all attribute values
71
+ - `html(selector)` - Get inner HTML
72
+
73
+ **Data Extraction:**
74
+ - `extract(schema)` - Extract data using schema
75
+ - `getTitle()` - Get page title
76
+ - `getMeta(name)` - Get meta tag content
77
+ - `getLinks()` - Get all links
78
+ - `getImages()` - Get all images
79
+ - `getRawHtml()` - Get raw HTML string
80
+
81
+ ### Extract Schema
82
+
83
+ Extract structured data easily:
84
+
85
+ ```javascript
86
+ const data = parser.extract({
87
+ title: 'h1',
88
+ description: '.intro',
89
+ links: {
90
+ selector: 'a',
91
+ attr: 'href',
92
+ multiple: true
93
+ },
94
+ prices: {
95
+ selector: '.price',
96
+ multiple: true,
97
+ transform: (value) => parseFloat(value.replace('$', ''))
98
+ }
99
+ });
100
+ ```
101
+
102
+ **Schema Options:**
103
+ - `selector` (required) - CSS selector
104
+ - `attr` - Attribute name to extract
105
+ - `multiple` - Extract from all matching elements
106
+ - `transform` - Transform function
107
+
108
+ ### Manipulator Class
109
+
110
+ Static utility methods for HTML manipulation:
111
+
112
+ ```javascript
113
+ const { Manipulator } = require('html-fetch-parser');
114
+
115
+ Manipulator.stripTags(html);
116
+ Manipulator.decodeEntities(html);
117
+ Manipulator.extractUrls(html, baseUrl);
118
+ Manipulator.extractEmails(html);
119
+ Manipulator.cleanWhitespace(text);
120
+ Manipulator.truncate(text, length, suffix);
121
+ Manipulator.toAbsoluteUrl(url, baseUrl);
122
+ Manipulator.removeScriptsAndStyles(html);
123
+ Manipulator.wordCount(text);
124
+ Manipulator.sanitizeFilename(filename);
125
+ Manipulator.extractStructuredData(html);
126
+ ```
127
+
128
+ ## Examples
129
+
130
+ ### Basic Usage
131
+
132
+ ```javascript
133
+ const HtmlFetchParser = require('html-fetch-parser');
134
+
135
+ const html = `
136
+ <div>
137
+ <h1>Products</h1>
138
+ <div class="product">
139
+ <h2>Product 1</h2>
140
+ <span class="price">$19.99</span>
141
+ </div>
142
+ <div class="product">
143
+ <h2>Product 2</h2>
144
+ <span class="price">$29.99</span>
145
+ </div>
146
+ </div>
147
+ `;
148
+
149
+ const parser = new HtmlFetchParser();
150
+ parser.load(html);
151
+
152
+ const products = parser.extract({
153
+ title: 'h1',
154
+ products: {
155
+ selector: '.product h2',
156
+ multiple: true
157
+ },
158
+ prices: {
159
+ selector: '.price',
160
+ multiple: true
161
+ }
162
+ });
163
+
164
+ console.log(products);
165
+ ```
166
+
167
+ ### Fetch Remote HTML
168
+
169
+ ```javascript
170
+ const { fetch } = require('html-fetch-parser');
171
+
172
+ async function scrapeWebsite() {
173
+ const parser = await fetch('https://example.com', {
174
+ headers: {
175
+ 'User-Agent': 'My Scraper Bot'
176
+ }
177
+ });
178
+
179
+ const data = parser.extract({
180
+ title: 'h1',
181
+ description: 'meta[name="description"]',
182
+ links: {
183
+ selector: 'a',
184
+ attr: 'href',
185
+ multiple: true
186
+ }
187
+ });
188
+
189
+ return data;
190
+ }
191
+ ```
192
+
193
+ ### Custom Fetcher
194
+
195
+ ```javascript
196
+ const { Fetcher } = require('html-fetch-parser');
197
+
198
+ const fetcher = new Fetcher({
199
+ timeout: 5000,
200
+ headers: {
201
+ 'User-Agent': 'Custom Bot'
202
+ }
203
+ });
204
+
205
+ const html = await fetcher.get('https://example.com');
206
+ ```
207
+
208
+ ### HTML Manipulation
209
+
210
+ ```javascript
211
+ const { Manipulator } = require('html-fetch-parser');
212
+
213
+ const html = '<p>Hello &amp; welcome!</p>';
214
+
215
+ const clean = Manipulator.decodeEntities(html);
216
+ const text = Manipulator.stripTags(clean);
217
+ const truncated = Manipulator.truncate(text, 10);
218
+
219
+ console.log(truncated);
220
+ ```
221
+
222
+ ## Advanced Usage
223
+
224
+ ### Chaining Methods
225
+
226
+ ```javascript
227
+ const data = await fetch('https://example.com')
228
+ .then(parser => parser.extract({
229
+ title: 'h1',
230
+ content: '.content'
231
+ }));
232
+ ```
233
+
234
+ ### Error Handling
235
+
236
+ ```javascript
237
+ try {
238
+ const parser = await fetch('https://example.com');
239
+ console.log(parser.getTitle());
240
+ } catch (error) {
241
+ console.error('Failed to fetch:', error.message);
242
+ }
243
+ ```
244
+
245
+ ### Custom Timeout
246
+
247
+ ```javascript
248
+ const parser = new HtmlFetchParser({ timeout: 30000 });
249
+ await parser.fetch('https://slow-website.com');
250
+ ```
251
+
252
+ ## TypeScript
253
+
254
+ Full TypeScript support included:
255
+
256
+ ```typescript
257
+ import HtmlFetchParser, { fetch, Manipulator } from 'html-fetch-parser';
258
+
259
+ const parser: HtmlFetchParser = await fetch('https://example.com');
260
+ const title: string = parser.getTitle();
261
+ ```
262
+
263
+ ## Performance
264
+
265
+ - Lightweight with minimal dependencies
266
+ - Fast HTML parsing using node-html-parser
267
+ - Native fetch API for HTTP requests
268
+ - Memory efficient
269
+
270
+ ## License
271
+
272
+ MIT
273
+
274
+ ## Contributing
275
+
276
+ Contributions are welcome! Please feel free to submit a Pull Request.
277
+
278
+ ## Support
279
+
280
+ For issues and questions, please open an issue on GitHub.
@@ -0,0 +1,150 @@
1
+ const HtmlFetchParser = require('../index');
2
+ const { Manipulator } = require('../index');
3
+
4
+ async function advancedScrapingExample() {
5
+ console.log('=== Advanced Web Scraping Example ===\n');
6
+
7
+ const html = `
8
+ <!DOCTYPE html>
9
+ <html lang="en">
10
+ <head>
11
+ <title>E-Commerce Store</title>
12
+ <meta name="description" content="Best products online">
13
+ <script type="application/ld+json">
14
+ {
15
+ "@context": "https://schema.org",
16
+ "@type": "Product",
17
+ "name": "Example Product"
18
+ }
19
+ </script>
20
+ </head>
21
+ <body>
22
+ <header>
23
+ <nav>
24
+ <a href="/home">Home</a>
25
+ <a href="/products">Products</a>
26
+ <a href="/about">About</a>
27
+ </nav>
28
+ </header>
29
+
30
+ <main>
31
+ <h1>Featured Products</h1>
32
+
33
+ <article class="product" data-id="1">
34
+ <h2 class="product-name">Laptop Pro</h2>
35
+ <p class="description">High-performance laptop for professionals</p>
36
+ <span class="price" data-currency="USD">$1,299.99</span>
37
+ <span class="stock">In Stock</span>
38
+ <a href="/products/laptop-pro" class="btn">View Details</a>
39
+ </article>
40
+
41
+ <article class="product" data-id="2">
42
+ <h2 class="product-name">Wireless Mouse</h2>
43
+ <p class="description">Ergonomic wireless mouse with long battery life</p>
44
+ <span class="price" data-currency="USD">$49.99</span>
45
+ <span class="stock">Low Stock</span>
46
+ <a href="/products/wireless-mouse" class="btn">View Details</a>
47
+ </article>
48
+
49
+ <article class="product" data-id="3">
50
+ <h2 class="product-name">USB-C Cable</h2>
51
+ <p class="description">Fast charging USB-C cable, 6ft length</p>
52
+ <span class="price" data-currency="USD">$12.99</span>
53
+ <span class="stock">In Stock</span>
54
+ <a href="/products/usb-c-cable" class="btn">View Details</a>
55
+ </article>
56
+ </main>
57
+
58
+ <footer>
59
+ <p>Contact: sales@example.com | support@example.com</p>
60
+ <p>Visit our blog: https://blog.example.com</p>
61
+ </footer>
62
+ </body>
63
+ </html>
64
+ `;
65
+
66
+ const parser = new HtmlFetchParser();
67
+ parser.load(html);
68
+
69
+ console.log('=== Page Metadata ===');
70
+ console.log('Title:', parser.getTitle());
71
+ console.log('Description:', parser.getMeta('description'));
72
+ console.log();
73
+
74
+ console.log('=== Navigation Links ===');
75
+ const navLinks = parser.extract({
76
+ links: {
77
+ selector: 'nav a',
78
+ attr: 'href',
79
+ multiple: true
80
+ }
81
+ });
82
+ console.log(navLinks);
83
+ console.log();
84
+
85
+ console.log('=== Extract All Products ===');
86
+ const products = [];
87
+ const productElements = parser.$$('.product');
88
+
89
+ productElements.forEach(productEl => {
90
+ const tempParser = new HtmlFetchParser();
91
+ tempParser.load(productEl.outerHTML);
92
+
93
+ products.push({
94
+ id: productEl.getAttribute('data-id'),
95
+ name: tempParser.text('.product-name'),
96
+ description: tempParser.text('.description'),
97
+ price: tempParser.text('.price'),
98
+ priceNumeric: parseFloat(tempParser.text('.price').replace(/[^0-9.]/g, '')),
99
+ stock: tempParser.text('.stock'),
100
+ url: tempParser.attr('.btn', 'href')
101
+ });
102
+ });
103
+
104
+ console.log(JSON.stringify(products, null, 2));
105
+ console.log();
106
+
107
+ console.log('=== Statistics ===');
108
+ const totalProducts = products.length;
109
+ const inStockProducts = products.filter(p => p.stock === 'In Stock').length;
110
+ const avgPrice = products.reduce((sum, p) => sum + p.priceNumeric, 0) / totalProducts;
111
+
112
+ console.log(`Total Products: ${totalProducts}`);
113
+ console.log(`In Stock: ${inStockProducts}`);
114
+ console.log(`Average Price: $${avgPrice.toFixed(2)}`);
115
+ console.log();
116
+
117
+ console.log('=== Extract Contact Information ===');
118
+ const footerText = parser.text('footer');
119
+ const emails = Manipulator.extractEmails(footerText);
120
+ const urls = Manipulator.extractUrls(footerText);
121
+
122
+ console.log('Email addresses:', emails);
123
+ console.log('URLs:', urls);
124
+ console.log();
125
+
126
+ console.log('=== Structured Data ===');
127
+ const structuredData = Manipulator.extractStructuredData(parser.getRawHtml());
128
+ console.log(JSON.stringify(structuredData, null, 2));
129
+ console.log();
130
+
131
+ console.log('=== Text Analysis ===');
132
+ const mainContent = parser.text('main');
133
+ const cleanText = Manipulator.cleanWhitespace(mainContent);
134
+ const wordCount = Manipulator.wordCount(cleanText);
135
+ const preview = Manipulator.truncate(cleanText, 100);
136
+
137
+ console.log(`Word Count: ${wordCount}`);
138
+ console.log(`Preview: ${preview}`);
139
+ console.log();
140
+
141
+ console.log('=== Convert URLs to Absolute ===');
142
+ const baseUrl = 'https://example.com';
143
+ const absoluteUrls = products.map(p => ({
144
+ name: p.name,
145
+ url: Manipulator.toAbsoluteUrl(p.url, baseUrl)
146
+ }));
147
+ console.log(absoluteUrls);
148
+ }
149
+
150
+ advancedScrapingExample().catch(console.error);
@@ -0,0 +1,57 @@
1
// Basic usage example: load a local HTML string, query it with CSS
// selectors, and extract structured data via a schema.
const HtmlFetchParser = require('../index');

async function basicExample() {
  console.log('=== HTML Fetch Parser - Basic Usage ===\n');

  // Sample document exercised by all of the queries below.
  const html = `
  <!DOCTYPE html>
  <html>
  <head>
    <title>Sample Page</title>
    <meta name="description" content="This is a sample page">
  </head>
  <body>
    <h1>Welcome to HTML Fetch Parser</h1>
    <p class="intro">A lightweight library for HTML manipulation</p>
    <div class="content">
      <h2>Features</h2>
      <ul>
        <li>Easy HTML fetching</li>
        <li>Powerful parsing</li>
        <li>Simple manipulation</li>
      </ul>
    </div>
    <a href="/docs" title="Documentation">Read Docs</a>
    <img src="/logo.png" alt="Logo">
  </body>
  </html>
  `;

  // load() parses the string in place; no network involved.
  const parser = new HtmlFetchParser();
  parser.load(html);

  // Single-element and multi-element accessors.
  console.log('Title:', parser.getTitle());
  console.log('Meta description:', parser.getMeta('description'));
  console.log('H1 text:', parser.text('h1'));
  console.log('Intro text:', parser.text('.intro'));
  console.log('All list items:', parser.textAll('li'));
  console.log('Links:', parser.getLinks());
  console.log('Images:', parser.getImages());

  // Schema-driven extraction: a string value is a selector for one
  // element's text; an object value allows attribute extraction and
  // multi-element collection.
  console.log('\n=== Extract with Schema ===');
  const data = parser.extract({
    title: 'h1',
    intro: '.intro',
    features: {
      selector: 'li',
      multiple: true
    },
    linkHref: {
      selector: 'a',
      attr: 'href'
    }
  });
  console.log(data);
}

// Surface unexpected failures rather than an unhandled rejection.
basicExample().catch(console.error);
@@ -0,0 +1,27 @@
1
const { fetch } = require('../index');

/**
 * Demonstrates fetching a remote page and extracting data with a schema.
 *
 * BUG FIX: the example previously pointed at
 * 'https://freepublicapisss.vercel.app/', a misspelled third-party host
 * that may disappear or change hands; the demo now uses the stable,
 * documentation-reserved example.com domain.
 */
async function fetchExample() {
  console.log('=== Fetch Remote HTML Example ===\n');

  try {
    const parser = await fetch('https://example.com/');

    console.log('Title:', parser.getTitle());
    console.log('First paragraph:', parser.text('p'));
    console.log('All links:', parser.getLinks());

    // Collect the heading plus the text of every paragraph.
    const data = parser.extract({
      title: 'h1',
      paragraphs: {
        selector: 'p',
        multiple: true
      }
    });

    console.log('\nExtracted data:', data);
  } catch (error) {
    // Network failures (offline CI, DNS errors) are expected sometimes;
    // report them instead of crashing the example.
    console.error('Error:', error.message);
  }
}

fetchExample();
@@ -0,0 +1,39 @@
1
// Demonstrates the static HTML-manipulation helpers on Manipulator.
const { Manipulator } = require('../index');

console.log('=== Manipulator Example ===\n');

// Sample markup containing entities, email addresses, a script tag,
// and an absolute URL — one input for every helper below.
const html = `
<div>
  <h1>Hello &amp; Welcome!</h1>
  <p>Contact us at: info@example.com or support@test.com</p>
  <script>alert('test');</script>
  <p>Visit https://example.com for more info</p>
</div>
`;

console.log('Original HTML:');
console.log(html);

// Remove all tags, keeping text content only.
console.log('\nStrip tags:');
console.log(Manipulator.stripTags(html));

// Convert HTML entities back to literal characters.
console.log('\nDecode entities:');
console.log(Manipulator.decodeEntities('Hello &amp; Welcome!'));

// Collect unique email addresses found anywhere in the markup.
console.log('\nExtract emails:');
console.log(Manipulator.extractEmails(html));

// Collect unique absolute http(s) URLs.
console.log('\nExtract URLs:');
console.log(Manipulator.extractUrls(html));

// Drop <script> and <style> blocks entirely.
console.log('\nRemove scripts:');
console.log(Manipulator.removeScriptsAndStyles(html));

// Shorten long text, appending the default '...' suffix.
console.log('\nTruncate text:');
console.log(Manipulator.truncate('This is a very long text that needs truncating', 20));

// Count whitespace-separated words.
console.log('\nWord count:');
console.log(Manipulator.wordCount('Hello world this is a test'));

// Resolve a relative path against a base URL.
console.log('\nConvert to absolute URL:');
console.log(Manipulator.toAbsoluteUrl('/path/to/page', 'https://example.com'));
package/index.d.ts ADDED
@@ -0,0 +1,97 @@
1
+ declare module 'html-fetch-parser' {
2
+ export interface FetcherOptions {
3
+ headers?: Record<string, string>;
4
+ timeout?: number;
5
+ }
6
+
7
+ export interface FetchOptions extends RequestInit {
8
+ headers?: Record<string, string>;
9
+ }
10
+
11
+ export interface ExtractConfig {
12
+ selector: string;
13
+ attr?: string;
14
+ multiple?: boolean;
15
+ transform?: (value: any) => any;
16
+ }
17
+
18
+ export interface ExtractSchema {
19
+ [key: string]: string | ExtractConfig;
20
+ }
21
+
22
+ export interface LinkObject {
23
+ text: string;
24
+ href: string | null;
25
+ title: string | null;
26
+ }
27
+
28
+ export interface ImageObject {
29
+ src: string | null;
30
+ alt: string | null;
31
+ title: string | null;
32
+ }
33
+
34
+ export class Fetcher {
35
+ constructor(options?: FetcherOptions);
36
+ get(url: string, options?: FetchOptions): Promise<string>;
37
+ post(url: string, data?: any, options?: FetchOptions): Promise<string>;
38
+ setHeaders(headers: Record<string, string>): void;
39
+ setTimeout(ms: number): void;
40
+ }
41
+
42
+ export class Parser {
43
+ constructor(html?: string);
44
+ load(html: string): Parser;
45
+ querySelector(selector: string): any;
46
+ querySelectorAll(selector: string): any[];
47
+ text(selector: string): string;
48
+ textAll(selector: string): string[];
49
+ attr(selector: string, attr: string): string | null;
50
+ attrAll(selector: string, attr: string): string[];
51
+ html(selector: string): string;
52
+ outerHtml(selector: string): string;
53
+ getTitle(): string;
54
+ getMeta(name: string): string;
55
+ getLinks(): LinkObject[];
56
+ getImages(): ImageObject[];
57
+ extract(schema: ExtractSchema): Record<string, any>;
58
+ getRawHtml(): string;
59
+ }
60
+
61
+ export class Manipulator {
62
+ static stripTags(html: string): string;
63
+ static decodeEntities(html: string): string;
64
+ static extractUrls(html: string, baseUrl?: string): string[];
65
+ static cleanWhitespace(text: string): string;
66
+ static extractEmails(html: string): string[];
67
+ static truncate(text: string, length: number, suffix?: string): string;
68
+ static toAbsoluteUrl(url: string, baseUrl: string): string;
69
+ static extractStructuredData(html: string): object[];
70
+ static removeScriptsAndStyles(html: string): string;
71
+ static wordCount(text: string): number;
72
+ static sanitizeFilename(filename: string): string;
73
+ }
74
+
75
+ export default class HtmlFetchParser {
76
+ constructor(options?: FetcherOptions);
77
+ fetch(url: string, options?: FetchOptions): Promise<HtmlFetchParser>;
78
+ post(url: string, data?: any, options?: FetchOptions): Promise<HtmlFetchParser>;
79
+ load(html: string): HtmlFetchParser;
80
+ $(selector: string): any;
81
+ $$(selector: string): any[];
82
+ text(selector: string): string;
83
+ textAll(selector: string): string[];
84
+ attr(selector: string, attr: string): string | null;
85
+ attrAll(selector: string, attr: string): string[];
86
+ html(selector: string): string;
87
+ extract(schema: ExtractSchema): Record<string, any>;
88
+ getTitle(): string;
89
+ getMeta(name: string): string;
90
+ getLinks(): LinkObject[];
91
+ getImages(): ImageObject[];
92
+ getRawHtml(): string;
93
+ }
94
+
95
+ export function fetch(url: string, options?: FetcherOptions): Promise<HtmlFetchParser>;
96
+ export function load(html: string): HtmlFetchParser;
97
+ }
package/index.js ADDED
@@ -0,0 +1,95 @@
1
+ const Fetcher = require('./lib/fetcher');
2
+ const Parser = require('./lib/parser');
3
+ const Manipulator = require('./lib/manipulator');
4
+
5
/**
 * Facade combining an HTTP fetcher and an HTML parser behind one API.
 * fetch()/post() load the response body into the internal Parser and
 * return `this` for chaining; every query method delegates to Parser.
 */
class HtmlFetchParser {
  /**
   * @param {object} [options] - Forwarded to Fetcher (headers, timeout).
   */
  constructor(options = {}) {
    this.fetcher = new Fetcher(options);
    this.parser = new Parser();
  }

  /**
   * GET a URL and load the response HTML into the parser.
   * @param {string} url - URL to fetch.
   * @param {object} [options] - Per-request fetch options.
   * @returns {Promise<HtmlFetchParser>} this, for chaining.
   */
  async fetch(url, options = {}) {
    const html = await this.fetcher.get(url, options);
    this.parser.load(html);
    return this;
  }

  /**
   * POST data (serialized as JSON) and load the response HTML.
   * @param {string} url - URL to post to.
   * @param {object} [data] - Request payload.
   * @param {object} [options] - Per-request fetch options.
   * @returns {Promise<HtmlFetchParser>} this, for chaining.
   */
  async post(url, data = {}, options = {}) {
    const html = await this.fetcher.post(url, data, options);
    this.parser.load(html);
    return this;
  }

  /**
   * Load an HTML string directly (no network).
   * @param {string} html - HTML to parse.
   * @returns {HtmlFetchParser} this, for chaining.
   */
  load(html) {
    this.parser.load(html);
    return this;
  }

  // querySelector alias: first element matching the selector, or null.
  $(selector) {
    return this.parser.querySelector(selector);
  }

  // querySelectorAll alias: all elements matching the selector.
  $$(selector) {
    return this.parser.querySelectorAll(selector);
  }

  // Trimmed text of the first match ('' when nothing matches).
  text(selector) {
    return this.parser.text(selector);
  }

  // Trimmed, non-empty text of every match.
  textAll(selector) {
    return this.parser.textAll(selector);
  }

  // Attribute value of the first match (null when absent).
  attr(selector, attr) {
    return this.parser.attr(selector, attr);
  }

  // Attribute values across every match.
  attrAll(selector, attr) {
    return this.parser.attrAll(selector, attr);
  }

  // Inner HTML of the first match.
  html(selector) {
    return this.parser.html(selector);
  }

  // Schema-driven extraction; see Parser.extract for the schema shape.
  extract(schema) {
    return this.parser.extract(schema);
  }

  // <title> text of the loaded document.
  getTitle() {
    return this.parser.getTitle();
  }

  // Content of the <meta name="..."> tag with the given name.
  getMeta(name) {
    return this.parser.getMeta(name);
  }

  // All anchors as { text, href, title } objects.
  getLinks() {
    return this.parser.getLinks();
  }

  // All images as { src, alt, title } objects.
  getImages() {
    return this.parser.getImages();
  }

  // The raw HTML string most recently loaded.
  getRawHtml() {
    return this.parser.getRawHtml();
  }
}
80
+
81
module.exports = HtmlFetchParser;
module.exports.Fetcher = Fetcher;
module.exports.Parser = Parser;
module.exports.Manipulator = Manipulator;
module.exports.default = HtmlFetchParser;

/**
 * Convenience helper: construct a parser and fetch a URL in one call.
 *
 * BUG FIX: `options` was previously dropped when calling
 * `instance.fetch(url)`, so per-request fetch options (anything beyond
 * constructor headers/timeout) never reached the underlying request.
 * @param {string} url - URL to fetch.
 * @param {object} [options] - Constructor options (headers, timeout),
 *   also forwarded as per-request fetch options.
 * @returns {Promise<HtmlFetchParser>} Parser loaded with the fetched HTML.
 */
module.exports.fetch = (url, options = {}) => {
  const instance = new HtmlFetchParser(options);
  return instance.fetch(url, options);
};

/**
 * Convenience helper: construct a parser pre-loaded with an HTML string.
 * @param {string} html - HTML to parse.
 * @returns {HtmlFetchParser} Parser instance, ready to query.
 */
module.exports.load = (html) => new HtmlFetchParser().load(html);
package/lib/fetcher.js ADDED
@@ -0,0 +1,106 @@
1
+ /**
2
+ * HTTP Fetcher Module
3
+ * Lightweight HTTP client for fetching HTML content
4
+ */
5
+
6
/**
 * Lightweight HTTP client (built on the global fetch API) with default
 * headers and a per-request timeout enforced via AbortController.
 */
class Fetcher {
  /**
   * @param {object} [options]
   * @param {object} [options.headers] - Headers merged into every request.
   * @param {number} [options.timeout] - Timeout in ms (default: 10000).
   */
  constructor(options = {}) {
    this.defaultHeaders = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
      ...options.headers
    };
    this.timeout = options.timeout || 10000;
  }

  /**
   * Fetch content from a URL with a GET request.
   * @param {string} url - URL to fetch
   * @param {object} [options] - Fetch options; options.headers merge over defaults
   * @returns {Promise<string>} Response body as text
   * @throws {Error} On non-2xx status or timeout
   */
  async get(url, options = {}) {
    return this.#request(url, { ...options, method: 'GET' });
  }

  /**
   * POST JSON data to a URL.
   * @param {string} url - URL to post to
   * @param {object} [data] - Payload, serialized as the JSON request body
   * @param {object} [options] - Fetch options; options.headers merge over defaults
   * @returns {Promise<string>} Response body as text
   * @throws {Error} On non-2xx status or timeout
   */
  async post(url, data = {}, options = {}) {
    return this.#request(url, {
      ...options,
      method: 'POST',
      // Callers may override Content-Type through options.headers.
      headers: { 'Content-Type': 'application/json', ...options.headers },
      body: JSON.stringify(data)
    });
  }

  /**
   * Shared request core with timeout handling.
   *
   * BUG FIX: the original spread `...options` AFTER the explicit
   * `method`, `headers`, and `signal` keys, so any caller-supplied
   * option silently clobbered the merged default headers and the
   * timeout AbortSignal. Options are now spread first and the computed
   * keys win; duplicated get/post bodies are also unified here.
   * @param {string} url - Target URL
   * @param {object} init - Fetch init; init.headers merge over defaults
   * @returns {Promise<string>} Response body as text
   */
  async #request(url, { headers, ...init }) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), this.timeout);

    try {
      const response = await fetch(url, {
        ...init,
        headers: { ...this.defaultHeaders, ...headers },
        signal: controller.signal
      });

      if (!response.ok) {
        throw new Error(`HTTP Error: ${response.status} ${response.statusText}`);
      }

      return await response.text();
    } catch (error) {
      // Translate our own abort into a descriptive timeout error.
      if (error.name === 'AbortError') {
        throw new Error(`Request timeout after ${this.timeout}ms`);
      }
      throw error;
    } finally {
      // Always release the timer — success and every error path alike.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Merge additional default headers.
   * @param {object} headers - Headers object
   */
  setHeaders(headers) {
    this.defaultHeaders = { ...this.defaultHeaders, ...headers };
  }

  /**
   * Set the request timeout.
   * @param {number} ms - Timeout in milliseconds
   */
  setTimeout(ms) {
    this.timeout = ms;
  }
}
105
+
106
+ module.exports = Fetcher;
@@ -0,0 +1,151 @@
1
+ /**
2
+ * HTML Manipulator Module
3
+ * Utilities for HTML manipulation and transformation
4
+ */
5
+
6
class Manipulator {
  /**
   * Remove HTML tags from text.
   * @param {string} html - HTML string
   * @returns {string} Plain text with all <...> tags removed
   */
  static stripTags(html) {
    return html.replace(/<[^>]*>/g, '');
  }

  /**
   * Decode HTML entities.
   *
   * GENERALIZED: the original only decoded a fixed table of named
   * entities plus the literal string '&#039;'. This version decodes the
   * common named entities (case-insensitively) plus ANY decimal
   * (&#39;) or hexadecimal (&#x27;) numeric character reference.
   * Unknown entities are left untouched, as before.
   * @param {string} html - HTML string with entities
   * @returns {string} Decoded string
   */
  static decodeEntities(html) {
    const named = {
      '&amp;': '&',
      '&lt;': '<',
      '&gt;': '>',
      '&quot;': '"',
      '&apos;': "'",
      '&nbsp;': ' '
    };
    return html.replace(/&(?:#x?[0-9a-f]+|\w+);/gi, (entity) => {
      const known = named[entity.toLowerCase()];
      if (known !== undefined) return known;
      const numeric = /^&#(x?)([0-9a-f]+);$/i.exec(entity);
      if (numeric) {
        const code = Number.parseInt(numeric[2], numeric[1] ? 16 : 10);
        // Guard the valid Unicode range; fromCodePoint throws outside it.
        if (code >= 0 && code <= 0x10ffff) return String.fromCodePoint(code);
      }
      return entity;
    });
  }

  /**
   * Extract unique absolute http(s) URLs from HTML.
   * @param {string} html - HTML string
   * @param {string} [baseUrl] - Accepted for interface compatibility but
   *   currently UNUSED: relative links are not resolved. TODO: wire into
   *   toAbsoluteUrl if relative-link extraction is wanted.
   * @returns {Array<string>} De-duplicated URLs in order of first appearance
   */
  static extractUrls(html, baseUrl = '') {
    const urlRegex = /https?:\/\/[^\s<>"]+/g;
    return [...new Set(html.match(urlRegex) || [])];
  }

  /**
   * Collapse all whitespace runs (including newlines) to single spaces
   * and trim both ends.
   * @param {string} text - Text to clean
   * @returns {string} Cleaned text
   */
  static cleanWhitespace(text) {
    // The original then replaced /\n\s*\n/ too, but that step was
    // unreachable once \s+ had collapsed every newline; it is dropped.
    return text.replace(/\s+/g, ' ').trim();
  }

  /**
   * Extract unique email addresses from HTML.
   * @param {string} html - HTML string
   * @returns {Array<string>} De-duplicated email addresses
   */
  static extractEmails(html) {
    const emailRegex = /[a-zA-Z0-9._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+/g;
    return [...new Set(html.match(emailRegex) || [])];
  }

  /**
   * Truncate text to a maximum total length, appending a suffix.
   * When length <= suffix.length, only the suffix is returned.
   * @param {string} text - Text to truncate
   * @param {number} length - Max length of the result, suffix included
   * @param {string} [suffix] - Suffix (default: '...')
   * @returns {string} Truncated text
   */
  static truncate(text, length, suffix = '...') {
    if (text.length <= length) return text;
    return text.substring(0, length - suffix.length) + suffix;
  }

  /**
   * Convert a relative URL to an absolute one.
   * @param {string} url - Relative (or already absolute) URL
   * @param {string} baseUrl - Base URL to resolve against
   * @returns {string} Absolute URL; the input unchanged if resolution fails
   */
  static toAbsoluteUrl(url, baseUrl) {
    if (!url) return '';
    if (url.startsWith('http://') || url.startsWith('https://')) return url;

    try {
      return new URL(url, baseUrl).href;
    } catch {
      return url;
    }
  }

  /**
   * Extract JSON-LD structured data blocks.
   * @param {string} html - HTML string
   * @returns {Array<object>} Parsed objects; malformed blocks are skipped
   */
  static extractStructuredData(html) {
    const jsonLdRegex = /<script[^>]+type=["']application\/ld\+json["'][^>]*>(.*?)<\/script>/gis;
    const matches = [...html.matchAll(jsonLdRegex)];

    return matches
      .map((match) => {
        try {
          return JSON.parse(match[1]);
        } catch {
          return null; // tolerate invalid JSON rather than throwing
        }
      })
      .filter(Boolean);
  }

  /**
   * Remove <script> and <style> blocks (tags and contents).
   * @param {string} html - HTML string
   * @returns {string} Cleaned HTML
   */
  static removeScriptsAndStyles(html) {
    return html
      .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, '')
      .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, '');
  }

  /**
   * Count whitespace-separated words.
   * @param {string} text - Text to count
   * @returns {number} Word count (0 for empty/whitespace-only input)
   */
  static wordCount(text) {
    return text.trim().split(/\s+/).filter(Boolean).length;
  }

  /**
   * Sanitize a filename: non [a-z0-9.-] runs become a single underscore,
   * result is lowercased.
   * @param {string} filename - Filename to sanitize
   * @returns {string} Safe filename
   */
  static sanitizeFilename(filename) {
    return filename
      .replace(/[^a-z0-9.-]/gi, '_')
      .replace(/_+/g, '_')
      .toLowerCase();
  }
}
150
+
151
+ module.exports = Manipulator;
package/lib/parser.js ADDED
@@ -0,0 +1,196 @@
1
+ /**
2
+ * HTML Parser Module
3
+ * Parse and query HTML content
4
+ */
5
+
6
+ const { parse } = require('node-html-parser');
7
+
8
+ class Parser {
9
+ constructor(html = '') {
10
+ this.root = html ? parse(html) : null;
11
+ this.rawHtml = html;
12
+ }
13
+
14
+ /**
15
+ * Load HTML content
16
+ * @param {string} html - HTML string
17
+ * @returns {Parser} Parser instance
18
+ */
19
+ load(html) {
20
+ this.rawHtml = html;
21
+ this.root = parse(html);
22
+ return this;
23
+ }
24
+
25
+ /**
26
+ * Find element by CSS selector
27
+ * @param {string} selector - CSS selector
28
+ * @returns {object|null} Element
29
+ */
30
+ querySelector(selector) {
31
+ if (!this.root) return null;
32
+ return this.root.querySelector(selector);
33
+ }
34
+
35
+ /**
36
+ * Find all elements by CSS selector
37
+ * @param {string} selector - CSS selector
38
+ * @returns {Array} Array of elements
39
+ */
40
+ querySelectorAll(selector) {
41
+ if (!this.root) return [];
42
+ return this.root.querySelectorAll(selector);
43
+ }
44
+
45
+ /**
46
+ * Get element text content
47
+ * @param {string} selector - CSS selector
48
+ * @returns {string} Text content
49
+ */
50
+ text(selector) {
51
+ const element = this.querySelector(selector);
52
+ return element ? element.text.trim() : '';
53
+ }
54
+
55
+ /**
56
+ * Get all text from elements
57
+ * @param {string} selector - CSS selector
58
+ * @returns {Array<string>} Array of text content
59
+ */
60
+ textAll(selector) {
61
+ const elements = this.querySelectorAll(selector);
62
+ return elements.map(el => el.text.trim()).filter(text => text);
63
+ }
64
+
65
+ /**
66
+ * Get element attribute
67
+ * @param {string} selector - CSS selector
68
+ * @param {string} attr - Attribute name
69
+ * @returns {string|null} Attribute value
70
+ */
71
+ attr(selector, attr) {
72
+ const element = this.querySelector(selector);
73
+ return element ? element.getAttribute(attr) : null;
74
+ }
75
+
76
+ /**
77
+ * Get attributes from all matching elements
78
+ * @param {string} selector - CSS selector
79
+ * @param {string} attr - Attribute name
80
+ * @returns {Array<string>} Array of attribute values
81
+ */
82
+ attrAll(selector, attr) {
83
+ const elements = this.querySelectorAll(selector);
84
+ return elements.map(el => el.getAttribute(attr)).filter(val => val);
85
+ }
86
+
87
+ /**
88
+ * Get element HTML
89
+ * @param {string} selector - CSS selector
90
+ * @returns {string} HTML content
91
+ */
92
+ html(selector) {
93
+ const element = this.querySelector(selector);
94
+ return element ? element.innerHTML : '';
95
+ }
96
+
97
+ /**
98
+ * Get outer HTML
99
+ * @param {string} selector - CSS selector
100
+ * @returns {string} Outer HTML
101
+ */
102
+ outerHtml(selector) {
103
+ const element = this.querySelector(selector);
104
+ return element ? element.outerHTML : '';
105
+ }
106
+
107
+ /**
108
+ * Get page title
109
+ * @returns {string} Page title
110
+ */
111
+ getTitle() {
112
+ return this.text('title');
113
+ }
114
+
115
+ /**
116
+ * Get meta tags
117
+ * @param {string} name - Meta name or property
118
+ * @returns {string} Meta content
119
+ */
120
+ getMeta(name) {
121
+ const meta = this.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
122
+ return meta ? meta.getAttribute('content') : '';
123
+ }
124
+
125
+ /**
126
+ * Get all links
127
+ * @returns {Array<object>} Array of links with text and href
128
+ */
129
+ getLinks() {
130
+ const links = this.querySelectorAll('a');
131
+ return links.map(link => ({
132
+ text: link.text.trim(),
133
+ href: link.getAttribute('href'),
134
+ title: link.getAttribute('title')
135
+ }));
136
+ }
137
+
138
+ /**
139
+ * Get all images
140
+ * @returns {Array<object>} Array of images with src and alt
141
+ */
142
+ getImages() {
143
+ const images = this.querySelectorAll('img');
144
+ return images.map(img => ({
145
+ src: img.getAttribute('src'),
146
+ alt: img.getAttribute('alt'),
147
+ title: img.getAttribute('title')
148
+ }));
149
+ }
150
+
151
+ /**
152
+ * Extract data using custom mapping
153
+ * @param {object} schema - Extraction schema
154
+ * @returns {object} Extracted data
155
+ */
156
+ extract(schema) {
157
+ const result = {};
158
+
159
+ for (const [key, config] of Object.entries(schema)) {
160
+ if (typeof config === 'string') {
161
+ // Simple selector
162
+ result[key] = this.text(config);
163
+ } else if (typeof config === 'object') {
164
+ // Advanced config
165
+ const { selector, attr, multiple, transform } = config;
166
+
167
+ if (multiple) {
168
+ result[key] = attr
169
+ ? this.attrAll(selector, attr)
170
+ : this.textAll(selector);
171
+ } else {
172
+ result[key] = attr
173
+ ? this.attr(selector, attr)
174
+ : this.text(selector);
175
+ }
176
+
177
+ // Apply transform function if provided
178
+ if (transform && typeof transform === 'function') {
179
+ result[key] = transform(result[key]);
180
+ }
181
+ }
182
+ }
183
+
184
+ return result;
185
+ }
186
+
187
+ /**
188
+ * Get raw HTML
189
+ * @returns {string} Raw HTML
190
+ */
191
+ getRawHtml() {
192
+ return this.rawHtml;
193
+ }
194
+ }
195
+
196
+ module.exports = Parser;
package/package.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "name": "html-fetch-parser",
3
+ "version": "1.0.0",
4
+ "description": "Lightweight HTML fetching and parsing library - combines fetch, parsing, and manipulation in one simple package",
5
+ "main": "index.js",
6
+ "types": "index.d.ts",
7
+ "keywords": [
8
+ "html",
9
+ "parser",
10
+ "fetch",
11
+ "scraper",
12
+ "cheerio",
13
+ "axios",
14
+ "dom",
15
+ "manipulation"
16
+ ],
17
+ "author": "KazeDevID",
18
+ "license": "MIT",
19
+ "dependencies": {
20
+ "node-html-parser": "^6.1.12"
21
+ },
22
+ "devDependencies": {},
23
+ "repository": {
24
+ "type": "git",
25
+ "url": "git+https://github.com/KazeDevID/html-fetch-parser.git"
26
+ },
27
+ "scripts": {
28
+ "test": "node examples/basic-usage.js"
29
+ }
30
+ }