@sharpapi/sharpapi-node-web-scraping 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,308 @@
1
+ ![SharpAPI GitHub cover](https://sharpapi.com/sharpapi-github-php-bg.jpg "SharpAPI Node.js Client")
2
+
3
+ # Web Scraping API for Node.js
4
+
5
+ ## 🌐 Extract web content and data with ease — powered by SharpAPI.
6
+
7
+ [![npm version](https://img.shields.io/npm/v/@sharpapi/sharpapi-node-web-scraping.svg)](https://www.npmjs.com/package/@sharpapi/sharpapi-node-web-scraping)
8
+ [![License](https://img.shields.io/npm/l/@sharpapi/sharpapi-node-web-scraping.svg)](https://github.com/sharpapi/sharpapi-node-client/blob/master/LICENSE.md)
9
+
10
+ **SharpAPI Web Scraping** provides powerful web scraping capabilities including HTML extraction, text content extraction, structured data parsing, link extraction, and screenshot capture. Perfect for data aggregation, monitoring, and content extraction applications.
11
+
12
+ ---
13
+
14
+ ## 📋 Table of Contents
15
+
16
+ 1. [Requirements](#requirements)
17
+ 2. [Installation](#installation)
18
+ 3. [Usage](#usage)
19
+ 4. [API Documentation](#api-documentation)
20
+ 5. [Examples](#examples)
21
+ 6. [License](#license)
22
+
23
+ ---
24
+
25
+ ## Requirements
26
+
27
+ - Node.js >= 16.x
28
+ - npm or yarn
29
+
30
+ ---
31
+
32
+ ## Installation
33
+
34
+ ### Step 1. Install the package via npm:
35
+
36
+ ```bash
37
+ npm install @sharpapi/sharpapi-node-web-scraping
38
+ ```
39
+
40
+ ### Step 2. Get your API key
41
+
42
+ Visit [SharpAPI.com](https://sharpapi.com/) to get your API key.
43
+
44
+ ---
45
+
46
+ ## Usage
47
+
48
+ ```javascript
49
+ const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');
50
+
51
+ const apiKey = process.env.SHARP_API_KEY;
52
+ const service = new SharpApiWebScrapingService(apiKey);
53
+
54
+ async function scrapeWebsite() {
55
+ try {
56
+ // Extract content from a URL
57
+ const content = await service.scrapeUrl('https://example.com');
58
+ console.log('Title:', content.title);
59
+ console.log('Content:', content.content);
60
+
61
+ // Extract text content
62
+ const text = await service.extractText('https://example.com');
63
+ console.log('Text:', text);
64
+
65
+ // Extract all links
66
+ const links = await service.extractLinks('https://example.com');
67
+ console.log(`Found ${links.length} links`);
68
+ } catch (error) {
69
+ console.error('Error:', error.message);
70
+ }
71
+ }
72
+
73
+ scrapeWebsite();
74
+ ```
75
+
76
+ ---
77
+
78
+ ## API Documentation
79
+
80
+ ### Methods
81
+
82
+ #### `scrapeUrl(url: string): Promise<object>`
83
+
84
+ Scrape a webpage and extract its content (synchronous).
85
+
86
+ **Parameters:**
87
+ - `url` (string, required): The URL to scrape
88
+
89
+ **Returns:**
90
+ - Scraped content including title, text, and metadata
91
+
92
+ #### `scrapeHtml(url: string, options?: object): Promise<object>`
93
+
94
+ Extract raw HTML content from a webpage.
95
+
96
+ **Options:**
97
+ - `javascript` (boolean): Execute JavaScript (default: false)
98
+ - `timeout` (number): Request timeout in ms (default: 30000)
99
+ - `userAgent` (string): Custom user agent
100
+ - `headers` (object): Custom HTTP headers
101
+ - `proxy` (string): Proxy server URL
102
+
103
+ #### `extractText(url: string, options?: object): Promise<object>`
104
+
105
+ Extract clean text content from a webpage.
106
+
107
+ #### `extractLinks(url: string, options?: object): Promise<object>`
108
+
109
+ Extract all links from a webpage.
110
+
111
+ #### `extractStructuredData(url: string, options?: object): Promise<object>`
112
+
113
+ Extract structured data (JSON-LD, microdata, etc.) from a webpage.
114
+
115
+ #### `takeScreenshot(url: string, options?: object): Promise<object>`
116
+
117
+ Capture a screenshot of a webpage.
118
+
119
+ **Options:**
120
+ - `fullPage` (boolean): Capture full page (default: false)
121
+ - `width` (number): Viewport width (default: 1280)
122
+ - `height` (number): Viewport height (default: 800)
123
+ - `javascript` (boolean): Execute JavaScript (default: true)
124
+
125
+ ---
126
+
127
+ ## Examples
128
+
129
+ ### Price Monitoring
130
+
131
+ ```javascript
132
+ const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
133
+
134
+ async function monitorPrice(productUrl) {
135
+ const content = await service.scrapeUrl(productUrl);
136
+
137
+ // Extract price from content
138
+ const priceMatch = content.text.match(/\$(\d+\.\d{2})/);
139
+ const price = priceMatch ? parseFloat(priceMatch[1]) : null;
140
+
141
+ return {
142
+ url: productUrl,
143
+ price: price,
144
+ title: content.title,
145
+ timestamp: new Date().toISOString()
146
+ };
147
+ }
148
+
149
+ const priceData = await monitorPrice('https://example.com/product');
150
+ console.log('Price:', priceData.price);
151
+ ```
152
+
153
+ ### Content Aggregation
154
+
155
+ ```javascript
156
+ const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
157
+
158
+ async function aggregateNews(urls) {
159
+ const articles = await Promise.all(
160
+ urls.map(async (url) => {
161
+ const content = await service.scrapeUrl(url);
162
+ return {
163
+ title: content.title,
164
+ text: content.text.substring(0, 200) + '...',
165
+ url: url,
166
+ scrapedAt: new Date()
167
+ };
168
+ })
169
+ );
170
+
171
+ return articles;
172
+ }
173
+
174
+ const newsUrls = [
175
+ 'https://news-site.com/article-1',
176
+ 'https://news-site.com/article-2'
177
+ ];
178
+
179
+ const articles = await aggregateNews(newsUrls);
180
+ articles.forEach(article => {
181
+ console.log(`\n${article.title}`);
182
+ console.log(article.text);
183
+ });
184
+ ```
185
+
186
+ ### SEO Analysis
187
+
188
+ ```javascript
189
+ const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
190
+
191
+ async function analyzeSEO(url) {
192
+ const [html, links, structured] = await Promise.all([
193
+ service.scrapeHtml(url),
194
+ service.extractLinks(url),
195
+ service.extractStructuredData(url)
196
+ ]);
197
+
198
+ const analysis = {
199
+ url: url,
200
+ title: html.title,
201
+ meta: html.meta,
202
+ wordCount: html.text.split(/\s+/).length,
203
+ internalLinks: links.filter(l => l.internal).length,
204
+ externalLinks: links.filter(l => !l.internal).length,
205
+ hasStructuredData: Object.keys(structured).length > 0,
206
+ structuredDataTypes: Object.keys(structured)
207
+ };
208
+
209
+ return analysis;
210
+ }
211
+
212
+ const seoReport = await analyzeSEO('https://example.com');
213
+ console.log('SEO Analysis:', seoReport);
214
+ ```
215
+
216
+ ### Website Screenshots
217
+
218
+ ```javascript
219
+ const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
220
+ const fs = require('fs');
221
+
222
+ async function captureWebsite(url, outputPath) {
223
+ const screenshot = await service.takeScreenshot(url, {
224
+ fullPage: true,
225
+ width: 1920,
226
+ height: 1080
227
+ });
228
+
229
+ // Save screenshot (base64 encoded)
230
+ const buffer = Buffer.from(screenshot.data, 'base64');
231
+ fs.writeFileSync(outputPath, buffer);
232
+
233
+ console.log(`Screenshot saved to ${outputPath}`);
234
+ }
235
+
236
+ await captureWebsite('https://example.com', './screenshot.png');
237
+ ```
238
+
239
+ ---
240
+
241
+ ## Use Cases
242
+
243
+ - **Price Monitoring**: Track product prices across e-commerce sites
244
+ - **Content Aggregation**: Collect articles and news from multiple sources
245
+ - **SEO Analysis**: Analyze website structure and metadata
246
+ - **Lead Generation**: Extract contact information from websites
247
+ - **Competitive Intelligence**: Monitor competitor websites
248
+ - **Data Collection**: Gather research data from web sources
249
+ - **Website Monitoring**: Track website changes and updates
250
+ - **Screenshot Services**: Generate website previews
251
+
252
+ ---
253
+
254
+ ## Features
255
+
256
+ - **Synchronous Processing**: Instant results, no polling
257
+ - **JavaScript Execution**: Handle dynamic content
258
+ - **Custom Headers**: Full control over requests
259
+ - **Proxy Support**: Route requests through proxies
260
+ - **Screenshot Capture**: Visual website representation
261
+ - **Structured Data**: Extract JSON-LD and microdata
262
+ - **Link Extraction**: Discover internal and external links
263
+ - **Clean Text**: Remove HTML and extract readable content
264
+
265
+ ---
266
+
267
+ ## Best Practices
268
+
269
+ 1. **Respect robots.txt**: Check website policies before scraping
270
+ 2. **Rate Limiting**: Don't overwhelm target servers
271
+ 3. **Error Handling**: Implement robust error handling
272
+ 4. **Data Validation**: Validate extracted data
273
+ 5. **Legal Compliance**: Ensure scraping is legal for your use case
274
+
275
+ ---
276
+
277
+ ## API Endpoint
278
+
279
+ **GET** `/utilities/scrape_url`
280
+
281
+ For detailed API specifications, refer to:
282
+ - [Postman Documentation](https://documenter.getpostman.com/view/31106842/2sBXVeGsW6)
283
+ - [Product Page](https://sharpapi.com/en/catalog/utility/web-scraping)
284
+
285
+ ---
286
+
287
+ ## Related Packages
288
+
289
+ - [@sharpapi/sharpapi-node-detect-urls](https://www.npmjs.com/package/@sharpapi/sharpapi-node-detect-urls) - URL detection
290
+ - [@sharpapi/sharpapi-node-client](https://www.npmjs.com/package/@sharpapi/sharpapi-node-client) - Full SharpAPI SDK
291
+
292
+ ---
293
+
294
+ ## License
295
+
296
+ This project is licensed under the MIT License. See the [LICENSE.md](LICENSE.md) file for details.
297
+
298
+ ---
299
+
300
+ ## Support
301
+
302
+ - **Documentation**: [SharpAPI.com Documentation](https://sharpapi.com/documentation)
303
+ - **Issues**: [GitHub Issues](https://github.com/sharpapi/sharpapi-node-client/issues)
304
+ - **Email**: contact@sharpapi.com
305
+
306
+ ---
307
+
308
+ **Powered by [SharpAPI](https://sharpapi.com/) - AI-Powered API Workflow Automation**
package/package.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "name": "@sharpapi/sharpapi-node-web-scraping",
3
+ "version": "1.0.0",
4
+ "description": "SharpAPI.com Node.js SDK for Web Scraping API",
5
+ "main": "src/index.js",
6
+ "scripts": {
7
+ "test": "jest"
8
+ },
9
+ "keywords": [
10
+ "sharpapi",
11
+ "api",
12
+ "api integration",
13
+ "restful api",
14
+ "nodejs",
15
+ "software development",
16
+ "web scraping",
17
+ "html extraction",
18
+ "data extraction"
19
+ ],
20
+ "author": "Dawid Makowski <contact@sharpapi.com>",
21
+ "license": "MIT",
22
+ "dependencies": {
23
+ "@sharpapi/sharpapi-node-core": "^1.0.0"
24
+ },
25
+ "devDependencies": {
26
+ "jest": "^29.7.0"
27
+ },
28
+ "publishConfig": {
29
+ "access": "public"
30
+ },
31
+ "repository": {
32
+ "type": "git",
33
+ "url": "https://github.com/sharpapi/sharpapi-node-web-scraping.git"
34
+ }
35
+ }
@@ -0,0 +1,112 @@
1
+ const { SharpApiCoreService } = require('@sharpapi/sharpapi-node-core');
2
+
3
/**
 * Service for accessing the Web Scraping API at SharpAPI.com.
 *
 * All public methods resolve with the `data` portion of the HTTP response
 * returned by the core service's `makeRequest` (inherited from
 * SharpApiCoreService). Request construction and response unwrapping are
 * centralized in the private `_request` helper so each endpoint method is a
 * single, declarative line.
 */
class SharpApiWebScrapingService extends SharpApiCoreService {
  /**
   * Issue an API request and unwrap the response payload.
   *
   * @param {string} method - HTTP method ('GET' or 'POST')
   * @param {string} endpoint - API endpoint path
   * @param {object} payload - Request data (query params for GET, body for POST)
   * @returns {Promise<object>} - The `data` field of the API response
   * @private
   */
  async _request(method, endpoint, payload) {
    const response = await this.makeRequest(method, endpoint, payload);
    return response.data;
  }

  /**
   * Scrape a webpage URL and extract its content (synchronous endpoint).
   *
   * @param {string} url - The URL of the webpage to scrape
   * @returns {Promise<object>} - The scraped content
   */
  async scrapeUrl(url) {
    // NOTE: this is the only GET endpoint; the rest of the service uses POST.
    return this._request('GET', '/utilities/scrape_url', { url });
  }

  /**
   * Scrape a webpage and extract its HTML content.
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The scraped HTML content and metadata
   */
  async scrapeHtml(url, options = {}) {
    return this._request('POST', '/utility/web-scraping/html', { url, ...options });
  }

  /**
   * Extract structured data (JSON-LD, microdata, etc.) from a webpage.
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The extracted structured data
   */
  async extractStructuredData(url, options = {}) {
    return this._request('POST', '/utility/web-scraping/structured-data', { url, ...options });
  }

  /**
   * Take a screenshot of a webpage.
   *
   * @param {string} url - The URL of the webpage to screenshot
   * @param {object} [options] - Additional options for screenshot
   * @param {boolean} [options.fullPage=false] - Whether to capture the full page or just the viewport
   * @param {number} [options.width=1280] - Viewport width
   * @param {number} [options.height=800] - Viewport height
   * @param {boolean} [options.javascript=true] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The screenshot data (base64 encoded)
   */
  async takeScreenshot(url, options = {}) {
    return this._request('POST', '/utility/web-scraping/screenshot', { url, ...options });
  }

  /**
   * Extract clean text content from a webpage.
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The extracted text content
   */
  async extractText(url, options = {}) {
    return this._request('POST', '/utility/web-scraping/text', { url, ...options });
  }

  /**
   * Extract all links from a webpage.
   *
   * @param {string} url - The URL of the webpage to scrape
   * @param {object} [options] - Additional options for scraping
   * @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
   * @param {number} [options.timeout=30000] - Timeout in milliseconds
   * @param {string} [options.userAgent] - Custom User-Agent string
   * @param {object} [options.headers] - Custom headers to send with the request
   * @param {string} [options.proxy] - Proxy to use for the request
   * @returns {Promise<object>} - The extracted links
   */
  async extractLinks(url, options = {}) {
    return this._request('POST', '/utility/web-scraping/links', { url, ...options });
  }
}

module.exports = { SharpApiWebScrapingService };
package/src/index.js ADDED
@@ -0,0 +1,6 @@
1
+ // sharpapi-node-web-scraping/src/index.js
2
+ const { SharpApiWebScrapingService } = require('./SharpApiWebScrapingService');
3
+
4
+ module.exports = {
5
+ SharpApiWebScrapingService,
6
+ };