@sharpapi/sharpapi-node-web-scraping 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +308 -0
- package/package.json +35 -0
- package/src/SharpApiWebScrapingService.js +112 -0
- package/src/index.js +6 -0
package/README.md
ADDED
@@ -0,0 +1,308 @@

# Web Scraping API for Node.js

## 🌐 Extract web content and data with ease — powered by SharpAPI.

[npm package](https://www.npmjs.com/package/@sharpapi/sharpapi-node-web-scraping)
[MIT license](https://github.com/sharpapi/sharpapi-node-client/blob/master/LICENSE.md)

**SharpAPI Web Scraping** provides powerful web scraping capabilities including HTML extraction, text content extraction, structured data parsing, link extraction, and screenshot capture. Perfect for data aggregation, monitoring, and content extraction applications.

---

## 📋 Table of Contents

1. [Requirements](#requirements)
2. [Installation](#installation)
3. [Usage](#usage)
4. [API Documentation](#api-documentation)
5. [Examples](#examples)
6. [License](#license)

---

## Requirements

- Node.js >= 16.x
- npm or yarn

---

## Installation

### Step 1. Install the package via npm:

```bash
npm install @sharpapi/sharpapi-node-web-scraping
```

### Step 2. Get your API key

Visit [SharpAPI.com](https://sharpapi.com/) to get your API key.

---

## Usage

```javascript
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');

const apiKey = process.env.SHARP_API_KEY;
const service = new SharpApiWebScrapingService(apiKey);

async function scrapeWebsite() {
  try {
    // Extract content from a URL
    const content = await service.scrapeUrl('https://example.com');
    console.log('Title:', content.title);
    console.log('Content:', content.content);

    // Extract text content
    const text = await service.extractText('https://example.com');
    console.log('Text:', text);

    // Extract all links
    const links = await service.extractLinks('https://example.com');
    console.log(`Found ${links.length} links`);
  } catch (error) {
    console.error('Error:', error.message);
  }
}

scrapeWebsite();
```

---

## API Documentation

### Methods

#### `scrapeUrl(url: string): Promise<object>`

Scrape a webpage and extract its content (synchronous).

**Parameters:**
- `url` (string, required): The URL to scrape

**Returns:**
- Scraped content including title, text, and metadata

#### `scrapeHtml(url: string, options?: object): Promise<object>`

Extract raw HTML content from a webpage.

**Options:**
- `javascript` (boolean): Execute JavaScript (default: false)
- `timeout` (number): Request timeout in ms (default: 30000)
- `userAgent` (string): Custom user agent
- `headers` (object): Custom HTTP headers
- `proxy` (string): Proxy server URL
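
A minimal sketch of a call with options (run inside an async function, as in the Usage section). The option values are illustrative, and the response shape is not documented here, so it is simply logged:

```javascript
// Sketch only: option values are illustrative; inspect the response to see its fields.
const page = await service.scrapeHtml('https://example.com/app', {
  javascript: true,        // render client-side content
  timeout: 60000,          // allow up to 60 s for heavy pages
  userAgent: 'Mozilla/5.0 (compatible; MyScraper/1.0)'
});
console.log(page);
```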

#### `extractText(url: string, options?: object): Promise<object>`

Extract clean text content from a webpage.

#### `extractLinks(url: string, options?: object): Promise<object>`

Extract all links from a webpage.
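
Both methods accept the same options as `scrapeHtml`. A short sketch (run inside an async function); the `internal` flag on each link mirrors the SEO Analysis example below and is an assumption about the response shape:

```javascript
// Sketch only: `link.internal` is assumed, not a documented field.
const links = await service.extractLinks('https://example.com', { javascript: true });
const internalLinks = links.filter((link) => link.internal);
console.log(`${internalLinks.length} of ${links.length} links point to the same site`);
```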

#### `extractStructuredData(url: string, options?: object): Promise<object>`

Extract structured data (JSON-LD, microdata, etc.) from a webpage.
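
A brief sketch (run inside an async function), assuming the result is an object keyed by structured-data type, as the SEO Analysis example below suggests:

```javascript
// Assumption: the response is an object whose keys are the detected data types.
const structured = await service.extractStructuredData('https://example.com/product');
console.log('Structured data types found:', Object.keys(structured));
```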

#### `takeScreenshot(url: string, options?: object): Promise<object>`

Capture a screenshot of a webpage.

**Options:**
- `fullPage` (boolean): Capture full page (default: false)
- `width` (number): Viewport width (default: 1280)
- `height` (number): Viewport height (default: 800)
- `javascript` (boolean): Execute JavaScript (default: true)

---

## Examples

### Price Monitoring

```javascript
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');

const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);

async function monitorPrice(productUrl) {
  const content = await service.scrapeUrl(productUrl);

  // Extract a price such as $19.99 from the scraped text
  const priceMatch = content.text.match(/\$(\d+\.\d{2})/);
  const price = priceMatch ? parseFloat(priceMatch[1]) : null;

  return {
    url: productUrl,
    price: price,
    title: content.title,
    timestamp: new Date().toISOString()
  };
}

monitorPrice('https://example.com/product')
  .then((priceData) => console.log('Price:', priceData.price))
  .catch((error) => console.error('Error:', error.message));
```

### Content Aggregation

```javascript
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');

const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);

async function aggregateNews(urls) {
  const articles = await Promise.all(
    urls.map(async (url) => {
      const content = await service.scrapeUrl(url);
      return {
        title: content.title,
        text: content.text.substring(0, 200) + '...',
        url: url,
        scrapedAt: new Date()
      };
    })
  );

  return articles;
}

const newsUrls = [
  'https://news-site.com/article-1',
  'https://news-site.com/article-2'
];

aggregateNews(newsUrls).then((articles) => {
  articles.forEach((article) => {
    console.log(`\n${article.title}`);
    console.log(article.text);
  });
});
```

### SEO Analysis

```javascript
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');

const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);

async function analyzeSEO(url) {
  const [html, links, structured] = await Promise.all([
    service.scrapeHtml(url),
    service.extractLinks(url),
    service.extractStructuredData(url)
  ]);

  const analysis = {
    url: url,
    title: html.title,
    meta: html.meta,
    wordCount: html.text.split(/\s+/).length,
    internalLinks: links.filter(l => l.internal).length,
    externalLinks: links.filter(l => !l.internal).length,
    hasStructuredData: Object.keys(structured).length > 0,
    structuredDataTypes: Object.keys(structured)
  };

  return analysis;
}

analyzeSEO('https://example.com')
  .then((seoReport) => console.log('SEO Analysis:', seoReport));
```

### Website Screenshots

```javascript
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');
const fs = require('fs');

const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);

async function captureWebsite(url, outputPath) {
  const screenshot = await service.takeScreenshot(url, {
    fullPage: true,
    width: 1920,
    height: 1080
  });

  // Save the screenshot (base64 encoded)
  const buffer = Buffer.from(screenshot.data, 'base64');
  fs.writeFileSync(outputPath, buffer);

  console.log(`Screenshot saved to ${outputPath}`);
}

captureWebsite('https://example.com', './screenshot.png').catch(console.error);
```

---

## Use Cases

- **Price Monitoring**: Track product prices across e-commerce sites
- **Content Aggregation**: Collect articles and news from multiple sources
- **SEO Analysis**: Analyze website structure and metadata
- **Lead Generation**: Extract contact information from websites
- **Competitive Intelligence**: Monitor competitor websites
- **Data Collection**: Gather research data from web sources
- **Website Monitoring**: Track website changes and updates
- **Screenshot Services**: Generate website previews

---

## Features

- **Synchronous Processing**: Instant results, no polling
- **JavaScript Execution**: Handle dynamic content
- **Custom Headers**: Full control over requests
- **Proxy Support**: Route requests through proxies
- **Screenshot Capture**: Visual website representation
- **Structured Data**: Extract JSON-LD and microdata
- **Link Extraction**: Discover internal and external links
- **Clean Text**: Remove HTML and extract readable content

---

## Best Practices

1. **Respect robots.txt**: Check website policies before scraping
2. **Rate Limiting**: Don't overwhelm target servers (see the sketch below)
3. **Error Handling**: Implement robust error handling
4. **Data Validation**: Validate extracted data
5. **Legal Compliance**: Ensure scraping is legal for your use case
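
A minimal sketch that applies points 2 and 3: scrape URLs sequentially with a fixed pause between requests and record per-URL failures instead of aborting the whole run. The delay helper and its default value are illustrative, not part of the SDK.

```javascript
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');

const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);

// Illustrative helper: pause between requests so target servers are not overwhelmed.
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

async function scrapeSequentially(urls, delayMs = 2000) {
  const results = [];
  for (const url of urls) {
    try {
      results.push({ url, content: await service.scrapeUrl(url) });
    } catch (error) {
      // Keep going: record the failure for this URL instead of stopping the run.
      results.push({ url, error: error.message });
    }
    await delay(delayMs);
  }
  return results;
}

scrapeSequentially(['https://example.com/a', 'https://example.com/b'])
  .then((results) => console.log(results));
```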

---

## API Endpoint

**GET** `/utilities/scrape_url`
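
If you prefer to call the endpoint without the SDK, a rough sketch using Node 18+'s built-in `fetch` is shown below. The `url` query parameter matches what the SDK sends; the base URL and the Bearer-token header are assumptions, so verify both against the Postman documentation linked below.

```javascript
// Sketch only: the base URL and auth header are assumptions to be verified.
async function scrapeUrlDirect(targetUrl) {
  const endpoint = new URL('https://sharpapi.com/api/v1/utilities/scrape_url');
  endpoint.searchParams.set('url', targetUrl);

  const response = await fetch(endpoint, {
    headers: { Authorization: `Bearer ${process.env.SHARP_API_KEY}` }
  });
  return response.json();
}
```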

For detailed API specifications, refer to:
- [Postman Documentation](https://documenter.getpostman.com/view/31106842/2sBXVeGsW6)
- [Product Page](https://sharpapi.com/en/catalog/utility/web-scraping)

---

## Related Packages

- [@sharpapi/sharpapi-node-detect-urls](https://www.npmjs.com/package/@sharpapi/sharpapi-node-detect-urls) - URL detection
- [@sharpapi/sharpapi-node-client](https://www.npmjs.com/package/@sharpapi/sharpapi-node-client) - Full SharpAPI SDK

---

## License

This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details.

---

## Support

- **Documentation**: [SharpAPI.com Documentation](https://sharpapi.com/documentation)
- **Issues**: [GitHub Issues](https://github.com/sharpapi/sharpapi-node-client/issues)
- **Email**: contact@sharpapi.com

---

**Powered by [SharpAPI](https://sharpapi.com/) - AI-Powered API Workflow Automation**
package/package.json
ADDED
@@ -0,0 +1,35 @@

{
  "name": "@sharpapi/sharpapi-node-web-scraping",
  "version": "1.0.0",
  "description": "SharpAPI.com Node.js SDK for Web Scraping API",
  "main": "src/index.js",
  "scripts": {
    "test": "jest"
  },
  "keywords": [
    "sharpapi",
    "api",
    "api integration",
    "restful api",
    "nodejs",
    "software development",
    "web scraping",
    "html extraction",
    "data extraction"
  ],
  "author": "Dawid Makowski <contact@sharpapi.com>",
  "license": "MIT",
  "dependencies": {
    "@sharpapi/sharpapi-node-core": "file:../sharpapi-node-core"
  },
  "devDependencies": {
    "jest": "^29.7.0"
  },
  "publishConfig": {
    "access": "public"
  },
  "repository": {
    "type": "git",
    "url": "https://github.com/sharpapi/sharpapi-node-web-scraping.git"
  }
}
package/src/SharpApiWebScrapingService.js
ADDED
@@ -0,0 +1,112 @@
const { SharpApiCoreService } = require('@sharpapi/sharpapi-node-core');

/**
 * Service for accessing Web Scraping API using SharpAPI.com
 */
class SharpApiWebScrapingService extends SharpApiCoreService {
  /**
   * Scrape a webpage URL and extract its content (synchronous endpoint)
   *
   * @param {string} url - The URL of the webpage to scrape
   * @returns {Promise<object>} - The scraped content
   */
  async scrapeUrl(url) {
    const response = await this.makeRequest('GET', '/utilities/scrape_url', { url });
    return response.data;
  }

  /**
   * Scrape a webpage and extract its HTML content
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The scraped HTML content and metadata
   */
  async scrapeHtml(url, options = {}) {
    const data = { url, ...options };
    const response = await this.makeRequest('POST', '/utility/web-scraping/html', data);
    return response.data;
  }

  /**
   * Extract structured data from a webpage
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The extracted structured data
   */
  async extractStructuredData(url, options = {}) {
    const data = { url, ...options };
    const response = await this.makeRequest('POST', '/utility/web-scraping/structured-data', data);
    return response.data;
  }

  /**
   * Take a screenshot of a webpage
   *
   * @param {string} url - The URL of the webpage to screenshot
   * @param {object} [options] - Additional options for screenshot
   * @param {boolean} [options.fullPage=false] - Whether to capture the full page or just the viewport
   * @param {number} [options.width=1280] - Viewport width
   * @param {number} [options.height=800] - Viewport height
   * @param {boolean} [options.javascript=true] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The screenshot data (base64 encoded)
   */
  async takeScreenshot(url, options = {}) {
    const data = { url, ...options };
    const response = await this.makeRequest('POST', '/utility/web-scraping/screenshot', data);
    return response.data;
  }

  /**
   * Extract text content from a webpage
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The extracted text content
   */
  async extractText(url, options = {}) {
    const data = { url, ...options };
    const response = await this.makeRequest('POST', '/utility/web-scraping/text', data);
    return response.data;
  }

  /**
   * Extract links from a webpage
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The extracted links
   */
  async extractLinks(url, options = {}) {
    const data = { url, ...options };
    const response = await this.makeRequest('POST', '/utility/web-scraping/links', data);
    return response.data;
  }
}

module.exports = { SharpApiWebScrapingService };