@sharpapi/sharpapi-node-web-scraping 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +276 -162
- package/package.json +2 -2
- package/sharpapi-node-airports/CLAUDE.md +7 -0
- package/src/CLAUDE.md +7 -0
- package/src/SharpApiWebScrapingService.js +30 -94
package/README.md
CHANGED
|
@@ -2,12 +2,12 @@
|
|
|
2
2
|
|
|
3
3
|
# Web Scraping API for Node.js
|
|
4
4
|
|
|
5
|
-
## 🌐
|
|
5
|
+
## 🌐 Scrape web content with ease — powered by SharpAPI.
|
|
6
6
|
|
|
7
7
|
[npm package](https://www.npmjs.com/package/@sharpapi/sharpapi-node-web-scraping)
|
|
8
8
|
[License](https://github.com/sharpapi/sharpapi-node-client/blob/master/LICENSE.md)
|
|
9
9
|
|
|
10
|
-
**SharpAPI Web Scraping**
|
|
10
|
+
**SharpAPI Web Scraping** fetches and extracts content from web pages, providing structured data including page metadata, content, links, and more in a machine-readable JSON format. Perfect for data collection, content aggregation, SEO analysis, and research.
|
|
11
11
|
|
|
12
12
|
---
|
|
13
13
|
|
|
@@ -18,7 +18,12 @@
|
|
|
18
18
|
3. [Usage](#usage)
|
|
19
19
|
4. [API Documentation](#api-documentation)
|
|
20
20
|
5. [Examples](#examples)
|
|
21
|
-
6. [
|
|
21
|
+
6. [Use Cases](#use-cases)
|
|
22
|
+
7. [Response Format](#response-format)
|
|
23
|
+
8. [AI Integration](#ai-integration)
|
|
24
|
+
9. [API Endpoint](#api-endpoint)
|
|
25
|
+
10. [Related Packages](#related-packages)
|
|
26
|
+
11. [License](#license)
|
|
22
27
|
|
|
23
28
|
---
|
|
24
29
|
|
|
@@ -33,9 +38,9 @@
|
|
|
33
38
|
|
|
34
39
|
### Step 1. Install the package via npm:
|
|
35
40
|
|
|
36
|
-
|
|
41
|
+
```bash
|
|
37
42
|
npm install @sharpapi/sharpapi-node-web-scraping
|
|
38
|
-
|
|
43
|
+
```
|
|
39
44
|
|
|
40
45
|
### Step 2. Get your API key
|
|
41
46
|
|
|
@@ -45,248 +50,357 @@ Visit [SharpAPI.com](https://sharpapi.com/) to get your API key.
|
|
|
45
50
|
|
|
46
51
|
## Usage
|
|
47
52
|
|
|
48
|
-
|
|
53
|
+
\`\`\`javascript
|
|
49
54
|
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');
|
|
50
55
|
|
|
51
|
-
const apiKey = process.env.SHARP_API_KEY;
|
|
56
|
+
const apiKey = process.env.SHARP_API_KEY; // Store your API key in environment variables
|
|
52
57
|
const service = new SharpApiWebScrapingService(apiKey);
|
|
53
58
|
|
|
54
59
|
async function scrapeWebsite() {
|
|
55
60
|
try {
|
|
56
|
-
//
|
|
57
|
-
const
|
|
58
|
-
|
|
59
|
-
console.log('
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
console.log('
|
|
64
|
-
|
|
65
|
-
// Extract all links
|
|
66
|
-
const links = await service.extractLinks('https://example.com');
|
|
67
|
-
console.log(`Found ${links.length} links`);
|
|
61
|
+
// Scrape a webpage
|
|
62
|
+
const data = await service.scrapeUrl('https://sharpapi.com');
|
|
63
|
+
|
|
64
|
+
console.log('Title:', data.title);
|
|
65
|
+
console.log('Description:', data.meta_description);
|
|
66
|
+
console.log('Keywords:', data.meta_keywords);
|
|
67
|
+
console.log('Content length:', data.content.length);
|
|
68
|
+
console.log('Links found:', data.links.length);
|
|
68
69
|
} catch (error) {
|
|
69
70
|
console.error('Error:', error.message);
|
|
70
71
|
}
|
|
71
72
|
}
|
|
72
73
|
|
|
73
74
|
scrapeWebsite();
|
|
74
|
-
|
|
75
|
+
\`\`\`
|
|
75
76
|
|
|
76
77
|
---
|
|
77
78
|
|
|
78
79
|
## API Documentation
|
|
79
80
|
|
|
80
|
-
|
|
81
|
+
This endpoint is **synchronous** and returns data immediately (no polling required).
|
|
81
82
|
|
|
82
|
-
|
|
83
|
+
### Method
|
|
83
84
|
|
|
84
|
-
|
|
85
|
+
#### `scrapeUrl(url)`
|
|
85
86
|
|
|
86
|
-
|
|
87
|
-
- `url` (string, required): The URL to scrape
|
|
88
|
-
|
|
89
|
-
**Returns:**
|
|
90
|
-
- Scraped content including title, text, and metadata
|
|
91
|
-
|
|
92
|
-
#### `scrapeHtml(url: string, options?: object): Promise<object>`
|
|
87
|
+
Scrape a webpage and extract its content in structured format.
|
|
93
88
|
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
-
|
|
101
|
-
-
|
|
102
|
-
|
|
103
|
-
|
|
89
|
+
**Parameters:**
|
|
90
|
+
- `url` (string, required): The URL to scrape (e.g., 'https://example.com' or 'example.com')
|
|
91
|
+
|
|
92
|
+
**Returns:** `Promise<object>` - Structured page data with metadata, content, and links
|
|
93
|
+
|
|
94
|
+
**Extracts:**
|
|
95
|
+
- **Page metadata**: Title, description, keywords, author
|
|
96
|
+
- **Open Graph tags**: OG:title, OG:description, OG:image, OG:type
|
|
97
|
+
- **Twitter Card tags**: Twitter:card, Twitter:title, Twitter:description
|
|
98
|
+
- **Content structure**: Headings (H1-H6), paragraphs, main content
|
|
99
|
+
- **Links**: Internal and external links with anchors
|
|
100
|
+
- **Meta information**: Language, charset, viewport, canonical URL
|
|
101
|
+
- **Timestamps**: Extraction date and time
|
|
102
|
+
|
|
103
|
+
**Example:**
|
|
104
|
+
```javascript
|
|
105
|
+
const data = await service.scrapeUrl('https://example.com');
|
|
106
|
+
console.log('Page Title:', data.title);
|
|
107
|
+
console.log('Main Content:', data.content);
|
|
108
|
+
console.log('All Links:', data.links);
|
|
109
|
+
```
|
|
104
110
|
|
|
105
|
-
|
|
111
|
+
---
|
|
106
112
|
|
|
107
|
-
|
|
113
|
+
## Examples
|
|
108
114
|
|
|
109
|
-
|
|
115
|
+
### Basic Web Scraping
|
|
110
116
|
|
|
111
|
-
|
|
117
|
+
\`\`\`javascript
|
|
118
|
+
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');
|
|
112
119
|
|
|
113
|
-
|
|
120
|
+
const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
|
|
114
121
|
|
|
115
|
-
|
|
122
|
+
async function scrapeExample() {
|
|
123
|
+
const result = await service.scrapeUrl('https://sharpapi.com');
|
|
116
124
|
|
|
117
|
-
|
|
125
|
+
console.log('=== Page Information ===');
|
|
126
|
+
console.log('Title:', result.title);
|
|
127
|
+
console.log('Description:', result.meta_description);
|
|
128
|
+
console.log('Language:', result.language);
|
|
129
|
+
console.log('Canonical URL:', result.canonical_url);
|
|
118
130
|
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
- `height` (number): Viewport height (default: 800)
|
|
123
|
-
- `javascript` (boolean): Execute JavaScript (default: true)
|
|
131
|
+
console.log('\\n=== Content ===');
|
|
132
|
+
console.log('Characters:', result.content.length);
|
|
133
|
+
console.log('Preview:', result.content.substring(0, 200) + '...');
|
|
124
134
|
|
|
125
|
-
|
|
135
|
+
console.log('\\n=== Links Found ===');
|
|
136
|
+
console.log('Total links:', result.links.length);
|
|
137
|
+
result.links.slice(0, 5).forEach(link => {
|
|
138
|
+
console.log(\`- \${link.text || 'No text'}: \${link.url}\`);
|
|
139
|
+
});
|
|
140
|
+
}
|
|
126
141
|
|
|
127
|
-
|
|
142
|
+
scrapeExample();
|
|
143
|
+
\`\`\`
|
|
128
144
|
|
|
129
|
-
###
|
|
145
|
+
### Extract Social Media Metadata
|
|
130
146
|
|
|
131
|
-
|
|
147
|
+
\`\`\`javascript
|
|
132
148
|
const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
|
|
133
149
|
|
|
134
|
-
async function
|
|
135
|
-
const
|
|
150
|
+
async function getSocialMetadata(url) {
|
|
151
|
+
const data = await service.scrapeUrl(url);
|
|
136
152
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
153
|
+
console.log('=== Open Graph Tags ===');
|
|
154
|
+
console.log('OG:Title:', data.og_title);
|
|
155
|
+
console.log('OG:Description:', data.og_description);
|
|
156
|
+
console.log('OG:Image:', data.og_image);
|
|
157
|
+
console.log('OG:Type:', data.og_type);
|
|
140
158
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
};
|
|
159
|
+
console.log('\\n=== Twitter Card ===');
|
|
160
|
+
console.log('Card Type:', data.twitter_card);
|
|
161
|
+
console.log('Title:', data.twitter_title);
|
|
162
|
+
console.log('Description:', data.twitter_description);
|
|
163
|
+
console.log('Image:', data.twitter_image);
|
|
147
164
|
}
|
|
148
165
|
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
```
|
|
166
|
+
getSocialMetadata('https://example.com/article');
|
|
167
|
+
\`\`\`
|
|
152
168
|
|
|
153
|
-
###
|
|
169
|
+
### SEO Analysis
|
|
154
170
|
|
|
155
|
-
|
|
171
|
+
\`\`\`javascript
|
|
156
172
|
const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
|
|
157
173
|
|
|
158
|
-
async function
|
|
159
|
-
const
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
)
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
}
|
|
174
|
+
async function analyzeSEO(url) {
|
|
175
|
+
const data = await service.scrapeUrl(url);
|
|
176
|
+
|
|
177
|
+
console.log('=== SEO Analysis ===');
|
|
178
|
+
console.log('Title:', data.title, \`(\${data.title.length} chars)\`);
|
|
179
|
+
console.log('Meta Description:', data.meta_description);
|
|
180
|
+
console.log('Keywords:', data.meta_keywords);
|
|
181
|
+
console.log('Canonical URL:', data.canonical_url);
|
|
182
|
+
console.log('Language:', data.language);
|
|
183
|
+
|
|
184
|
+
console.log('\\n=== Headings Structure ===');
|
|
185
|
+
if (data.headings) {
|
|
186
|
+
data.headings.forEach(heading => {
|
|
187
|
+
console.log(\`\${heading.level}: \${heading.text}\`);
|
|
188
|
+
});
|
|
189
|
+
}
|
|
173
190
|
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
191
|
+
console.log('\\n=== Link Analysis ===');
|
|
192
|
+
const internalLinks = data.links.filter(l => l.type === 'internal');
|
|
193
|
+
const externalLinks = data.links.filter(l => l.type === 'external');
|
|
194
|
+
console.log(\`Internal links: \${internalLinks.length}\`);
|
|
195
|
+
console.log(\`External links: \${externalLinks.length}\`);
|
|
196
|
+
}
|
|
178
197
|
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
console.log(`\n${article.title}`);
|
|
182
|
-
console.log(article.text);
|
|
183
|
-
});
|
|
184
|
-
```
|
|
198
|
+
analyzeSEO('https://your-website.com');
|
|
199
|
+
\`\`\`
|
|
185
200
|
|
|
186
|
-
###
|
|
201
|
+
### Content Extraction for AI Processing
|
|
187
202
|
|
|
188
|
-
|
|
203
|
+
\`\`\`javascript
|
|
189
204
|
const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
|
|
190
205
|
|
|
191
|
-
async function
|
|
192
|
-
const
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
internalLinks: links.filter(l => l.internal).length,
|
|
204
|
-
externalLinks: links.filter(l => !l.internal).length,
|
|
205
|
-
hasStructuredData: Object.keys(structured).length > 0,
|
|
206
|
-
structuredDataTypes: Object.keys(structured)
|
|
206
|
+
async function extractForAI(url) {
|
|
207
|
+
const data = await service.scrapeUrl(url);
|
|
208
|
+
|
|
209
|
+
// Extract clean content for AI processing
|
|
210
|
+
const cleanContent = {
|
|
211
|
+
title: data.title,
|
|
212
|
+
description: data.meta_description,
|
|
213
|
+
mainContent: data.content,
|
|
214
|
+
language: data.language,
|
|
215
|
+
author: data.author,
|
|
216
|
+
publishedDate: data.published_date,
|
|
217
|
+
modifiedDate: data.modified_date
|
|
207
218
|
};
|
|
208
219
|
|
|
209
|
-
|
|
220
|
+
console.log('Extracted content ready for AI processing:');
|
|
221
|
+
console.log(JSON.stringify(cleanContent, null, 2));
|
|
222
|
+
|
|
223
|
+
// Now you can pass this to SharpAPI AI endpoints:
|
|
224
|
+
// - Summarization: @sharpapi/sharpapi-node-summarize-text
|
|
225
|
+
// - Translation: @sharpapi/sharpapi-node-translate
|
|
226
|
+
// - Keywords: @sharpapi/sharpapi-node-generate-keywords
|
|
227
|
+
// - SEO Tags: @sharpapi/sharpapi-node-seo-tags
|
|
210
228
|
}
|
|
211
229
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
```
|
|
230
|
+
extractForAI('https://blog.example.com/article');
|
|
231
|
+
\`\`\`
|
|
215
232
|
|
|
216
|
-
###
|
|
233
|
+
### Competitor Analysis
|
|
217
234
|
|
|
218
|
-
|
|
235
|
+
\`\`\`javascript
|
|
219
236
|
const service = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
|
|
220
|
-
const fs = require('fs');
|
|
221
237
|
|
|
222
|
-
async function
|
|
223
|
-
const
|
|
224
|
-
fullPage: true,
|
|
225
|
-
width: 1920,
|
|
226
|
-
height: 1080
|
|
227
|
-
});
|
|
238
|
+
async function analyzeCompetitor(url) {
|
|
239
|
+
const data = await service.scrapeUrl(url);
|
|
228
240
|
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
241
|
+
console.log('=== Competitor Analysis ===');
|
|
242
|
+
console.log('Domain:', new URL(url).hostname);
|
|
243
|
+
console.log('Title Strategy:', data.title);
|
|
244
|
+
console.log('Description:', data.meta_description);
|
|
245
|
+
console.log('Keywords Focus:', data.meta_keywords);
|
|
232
246
|
|
|
233
|
-
console.log(
|
|
247
|
+
console.log('\\n=== Content Strategy ===');
|
|
248
|
+
console.log('Content Length:', data.content.length, 'characters');
|
|
249
|
+
console.log('Word Count (approx):', Math.round(data.content.split(' ').length));
|
|
250
|
+
|
|
251
|
+
console.log('\\n=== Link Building ===');
|
|
252
|
+
const externalLinks = data.links.filter(l => l.type === 'external');
|
|
253
|
+
console.log('External Links:', externalLinks.length);
|
|
254
|
+
externalLinks.slice(0, 10).forEach(link => {
|
|
255
|
+
console.log(\` - \${link.url}\`);
|
|
256
|
+
});
|
|
234
257
|
}
|
|
235
258
|
|
|
236
|
-
|
|
237
|
-
|
|
259
|
+
analyzeCompetitor('https://competitor-website.com');
|
|
260
|
+
\`\`\`
|
|
238
261
|
|
|
239
262
|
---
|
|
240
263
|
|
|
241
264
|
## Use Cases
|
|
242
265
|
|
|
243
|
-
- **
|
|
244
|
-
- **
|
|
245
|
-
- **
|
|
246
|
-
- **Lead Generation**: Extract
|
|
247
|
-
- **
|
|
248
|
-
- **
|
|
249
|
-
- **
|
|
250
|
-
- **
|
|
266
|
+
- **Content Aggregation**: Collect content from multiple sources
|
|
267
|
+
- **Price Monitoring**: Track competitor pricing and availability
|
|
268
|
+
- **Research**: Gather data for analysis and insights
|
|
269
|
+
- **Lead Generation**: Extract business information from websites
|
|
270
|
+
- **Market Intelligence**: Monitor industry trends and news
|
|
271
|
+
- **SEO Analysis**: Analyze competitor websites and content
|
|
272
|
+
- **Content Curation**: Extract articles for content platforms
|
|
273
|
+
- **Social Media Monitoring**: Track mentions and brand presence
|
|
274
|
+
- **Data Enrichment**: Enhance existing data with web-sourced information
|
|
275
|
+
- **Competitive Intelligence**: Analyze competitor strategies
|
|
251
276
|
|
|
252
277
|
---
|
|
253
278
|
|
|
254
|
-
##
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
279
|
+
## Response Format
|
|
280
|
+
|
|
281
|
+
The API returns a comprehensive JSON object with the following structure:
|
|
282
|
+
|
|
283
|
+
\`\`\`json
|
|
284
|
+
{
|
|
285
|
+
"url": "https://sharpapi.com/",
|
|
286
|
+
"title": "SharpAPI - AI-Powered Workflow Automation API",
|
|
287
|
+
"meta_description": "Automate workflows with AI-powered API...",
|
|
288
|
+
"meta_keywords": "AI API, automation, workflow",
|
|
289
|
+
"author": "SharpAPI Team",
|
|
290
|
+
"language": "en",
|
|
291
|
+
"charset": "UTF-8",
|
|
292
|
+
"canonical_url": "https://sharpapi.com/",
|
|
293
|
+
"viewport": "width=device-width, initial-scale=1",
|
|
294
|
+
|
|
295
|
+
"og_title": "SharpAPI - AI-Powered API",
|
|
296
|
+
"og_description": "Automate your workflows...",
|
|
297
|
+
"og_image": "https://sharpapi.com/og-image.jpg",
|
|
298
|
+
"og_type": "website",
|
|
299
|
+
"og_url": "https://sharpapi.com/",
|
|
300
|
+
|
|
301
|
+
"twitter_card": "summary_large_image",
|
|
302
|
+
"twitter_title": "SharpAPI",
|
|
303
|
+
"twitter_description": "AI-Powered API",
|
|
304
|
+
"twitter_image": "https://sharpapi.com/twitter-card.jpg",
|
|
305
|
+
|
|
306
|
+
"content": "Full page content as text...",
|
|
307
|
+
"text_content": "Clean text without HTML...",
|
|
308
|
+
|
|
309
|
+
"headings": [
|
|
310
|
+
{ "level": "h1", "text": "Main Heading" },
|
|
311
|
+
{ "level": "h2", "text": "Subheading" }
|
|
312
|
+
],
|
|
313
|
+
|
|
314
|
+
"links": [
|
|
315
|
+
{
|
|
316
|
+
"url": "https://sharpapi.com/about",
|
|
317
|
+
"text": "About Us",
|
|
318
|
+
"type": "internal",
|
|
319
|
+
"rel": null
|
|
320
|
+
},
|
|
321
|
+
{
|
|
322
|
+
"url": "https://example.com",
|
|
323
|
+
"text": "External Link",
|
|
324
|
+
"type": "external",
|
|
325
|
+
"rel": "nofollow"
|
|
326
|
+
}
|
|
327
|
+
],
|
|
328
|
+
|
|
329
|
+
"images": [
|
|
330
|
+
{
|
|
331
|
+
"src": "https://sharpapi.com/image.jpg",
|
|
332
|
+
"alt": "Image description"
|
|
333
|
+
}
|
|
334
|
+
],
|
|
335
|
+
|
|
336
|
+
"extracted_at": "2026-01-10T15:30:00Z",
|
|
337
|
+
"processing_time_ms": 1250
|
|
338
|
+
}
|
|
339
|
+
\`\`\`
|
|
264
340
|
|
|
265
341
|
---
|
|
266
342
|
|
|
267
|
-
##
|
|
343
|
+
## AI Integration
|
|
344
|
+
|
|
345
|
+
The extracted data can be seamlessly integrated with **SharpAPI's AI-powered endpoints** for further analysis:
|
|
346
|
+
|
|
347
|
+
### Text Processing
|
|
348
|
+
- **[@sharpapi/sharpapi-node-summarize-text](https://www.npmjs.com/package/@sharpapi/sharpapi-node-summarize-text)** - Summarize extracted content
|
|
349
|
+
- **[@sharpapi/sharpapi-node-paraphrase](https://www.npmjs.com/package/@sharpapi/sharpapi-node-paraphrase)** - Rewrite content
|
|
350
|
+
- **[@sharpapi/sharpapi-node-translate](https://www.npmjs.com/package/@sharpapi/sharpapi-node-translate)** - Translate to other languages
|
|
351
|
+
|
|
352
|
+
### SEO & Keywords
|
|
353
|
+
- **[@sharpapi/sharpapi-node-generate-keywords](https://www.npmjs.com/package/@sharpapi/sharpapi-node-generate-keywords)** - Extract keywords
|
|
354
|
+
- **[@sharpapi/sharpapi-node-seo-tags](https://www.npmjs.com/package/@sharpapi/sharpapi-node-seo-tags)** - Generate SEO tags
|
|
268
355
|
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
356
|
+
### Content Analysis
|
|
357
|
+
- **[@sharpapi/sharpapi-node-detect-spam](https://www.npmjs.com/package/@sharpapi/sharpapi-node-detect-spam)** - Detect spam content
|
|
358
|
+
- **[@sharpapi/sharpapi-node-product-review-sentiment](https://www.npmjs.com/package/@sharpapi/sharpapi-node-product-review-sentiment)** - Analyze sentiment
|
|
359
|
+
|
|
360
|
+
### Example Integration
|
|
361
|
+
|
|
362
|
+
\`\`\`javascript
|
|
363
|
+
const { SharpApiWebScrapingService } = require('@sharpapi/sharpapi-node-web-scraping');
|
|
364
|
+
const { SharpApiSummarizeService } = require('@sharpapi/sharpapi-node-summarize-text');
|
|
365
|
+
|
|
366
|
+
const scrapingService = new SharpApiWebScrapingService(process.env.SHARP_API_KEY);
|
|
367
|
+
const summarizeService = new SharpApiSummarizeService(process.env.SHARP_API_KEY);
|
|
368
|
+
|
|
369
|
+
async function scrapeAndSummarize(url) {
|
|
370
|
+
// 1. Scrape the webpage
|
|
371
|
+
const scraped = await scrapingService.scrapeUrl(url);
|
|
372
|
+
|
|
373
|
+
// 2. Summarize the content
|
|
374
|
+
const statusUrl = await summarizeService.summarize(scraped.content);
|
|
375
|
+
const summary = await summarizeService.fetchResults(statusUrl);
|
|
376
|
+
|
|
377
|
+
console.log('Original length:', scraped.content.length);
|
|
378
|
+
console.log('Summary:', summary.getResultJson());
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
scrapeAndSummarize('https://blog.example.com/long-article');
|
|
382
|
+
\`\`\`
|
|
274
383
|
|
|
275
384
|
---
|
|
276
385
|
|
|
277
386
|
## API Endpoint
|
|
278
387
|
|
|
279
|
-
**GET** `/utilities/scrape_url`
|
|
388
|
+
**GET** `/utilities/scrape_url?url={url}`
|
|
389
|
+
|
|
390
|
+
This endpoint is **synchronous**: the scraped data is returned directly in the `200 OK` response, with no polling required.
|
|
280
391
|
|
|
281
392
|
For detailed API specifications, refer to:
|
|
282
|
-
- [Postman Documentation](https://documenter.getpostman.com/view/31106842/
|
|
283
|
-
- [Product Page](https://sharpapi.com/en/catalog/utility/web-scraping)
|
|
393
|
+
- [Postman Documentation](https://documenter.getpostman.com/view/31106842/2s9Ye8faUp)
|
|
394
|
+
- [Product Page](https://sharpapi.com/en/catalog/utility/web-scraping-api)
|
|
284
395
|
|
|
285
396
|
---
|
|
286
397
|
|
|
287
398
|
## Related Packages
|
|
288
399
|
|
|
289
|
-
- [@sharpapi/sharpapi-node-detect-urls](https://www.npmjs.com/package/@sharpapi/sharpapi-node-detect-urls) -
|
|
400
|
+
- [@sharpapi/sharpapi-node-detect-urls](https://www.npmjs.com/package/@sharpapi/sharpapi-node-detect-urls) - Extract URLs from text
|
|
401
|
+
- [@sharpapi/sharpapi-node-detect-emails](https://www.npmjs.com/package/@sharpapi/sharpapi-node-detect-emails) - Extract emails from text
|
|
402
|
+
- [@sharpapi/sharpapi-node-summarize-text](https://www.npmjs.com/package/@sharpapi/sharpapi-node-summarize-text) - Summarize content
|
|
403
|
+
- [@sharpapi/sharpapi-node-seo-tags](https://www.npmjs.com/package/@sharpapi/sharpapi-node-seo-tags) - Generate SEO tags
|
|
290
404
|
- [@sharpapi/sharpapi-node-client](https://www.npmjs.com/package/@sharpapi/sharpapi-node-client) - Full SharpAPI SDK
|
|
291
405
|
|
|
292
406
|
---
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@sharpapi/sharpapi-node-web-scraping",
|
|
3
|
-
"version": "1.0.
|
|
3
|
+
"version": "1.0.2",
|
|
4
4
|
"description": "SharpAPI.com Node.js SDK for Web Scraping API",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"scripts": {
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
"author": "Dawid Makowski <contact@sharpapi.com>",
|
|
21
21
|
"license": "MIT",
|
|
22
22
|
"dependencies": {
|
|
23
|
-
"@sharpapi/sharpapi-node-core": "
|
|
23
|
+
"@sharpapi/sharpapi-node-core": "^1.0.0"
|
|
24
24
|
},
|
|
25
25
|
"devDependencies": {
|
|
26
26
|
"jest": "^29.7.0"
|
package/src/CLAUDE.md
ADDED
package/src/SharpApiWebScrapingService.js
CHANGED
|
@@ -2,111 +2,47 @@ const { SharpApiCoreService } = require('@sharpapi/sharpapi-node-core');
|
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* Service for accessing Web Scraping API using SharpAPI.com
|
|
5
|
+
*
|
|
6
|
+
* Fetches and extracts content from publicly accessible URLs, providing structured data
|
|
7
|
+
* including page metadata, content, links, and more in a machine-readable JSON format.
|
|
5
8
|
*/
|
|
6
9
|
class SharpApiWebScrapingService extends SharpApiCoreService {
|
|
7
10
|
/**
|
|
8
|
-
*
|
|
9
|
-
*
|
|
10
|
-
* @param {string}
|
|
11
|
-
* @returns {Promise<object>} - The scraped content
|
|
11
|
+
* Creates a new SharpApiWebScrapingService instance
|
|
12
|
+
* @param {string} apiKey - Your SharpAPI API key
|
|
13
|
+
* @param {string} [apiBaseUrl='https://sharpapi.com/api/v1'] - API base URL
|
|
12
14
|
*/
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
return response.data;
|
|
15
|
+
constructor(apiKey, apiBaseUrl = 'https://sharpapi.com/api/v1') {
|
|
16
|
+
super(apiKey, apiBaseUrl, '@sharpapi/sharpapi-node-web-scraping/1.0.2');
|
|
16
17
|
}
|
|
17
18
|
|
|
18
19
|
/**
|
|
19
|
-
* Scrape a webpage and extract its
|
|
20
|
-
*
|
|
21
|
-
* @param {string} url - The URL of the webpage to scrape
|
|
22
|
-
* @param {object} [options] - Additional options for scraping
|
|
23
|
-
* @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
|
|
24
|
-
* @param {number} [options.timeout=30000] - Timeout in milliseconds
|
|
25
|
-
* @param {string} [options.userAgent] - Custom User-Agent string
|
|
26
|
-
* @param {object} [options.headers] - Custom headers to send with the request
|
|
27
|
-
* @param {string} [options.proxy] - Proxy to use for the request
|
|
28
|
-
* @returns {Promise<object>} - The scraped HTML content and metadata
|
|
29
|
-
*/
|
|
30
|
-
async scrapeHtml(url, options = {}) {
|
|
31
|
-
const data = { url, ...options };
|
|
32
|
-
const response = await this.makeRequest('POST', '/utility/web-scraping/html', data);
|
|
33
|
-
return response.data;
|
|
34
|
-
}
|
|
35
|
-
|
|
36
|
-
/**
|
|
37
|
-
* Extract structured data from a webpage
|
|
38
|
-
*
|
|
39
|
-
* @param {string} url - The URL of the webpage to scrape
|
|
40
|
-
* @param {object} [options] - Additional options for scraping
|
|
41
|
-
* @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
|
|
42
|
-
* @param {number} [options.timeout=30000] - Timeout in milliseconds
|
|
43
|
-
* @param {string} [options.userAgent] - Custom User-Agent string
|
|
44
|
-
* @param {object} [options.headers] - Custom headers to send with the request
|
|
45
|
-
* @param {string} [options.proxy] - Proxy to use for the request
|
|
46
|
-
* @returns {Promise<object>} - The extracted structured data
|
|
47
|
-
*/
|
|
48
|
-
async extractStructuredData(url, options = {}) {
|
|
49
|
-
const data = { url, ...options };
|
|
50
|
-
const response = await this.makeRequest('POST', '/utility/web-scraping/structured-data', data);
|
|
51
|
-
return response.data;
|
|
52
|
-
}
|
|
53
|
-
|
|
54
|
-
/**
|
|
55
|
-
* Take a screenshot of a webpage
|
|
20
|
+
* Scrape a webpage URL and extract its content (synchronous endpoint)
|
|
56
21
|
*
|
|
57
|
-
*
|
|
58
|
-
*
|
|
59
|
-
*
|
|
60
|
-
*
|
|
61
|
-
*
|
|
62
|
-
*
|
|
63
|
-
*
|
|
64
|
-
*
|
|
65
|
-
* @param {object} [options.headers] - Custom headers to send with the request
|
|
66
|
-
* @param {string} [options.proxy] - Proxy to use for the request
|
|
67
|
-
* @returns {Promise<object>} - The screenshot data (base64 encoded)
|
|
68
|
-
*/
|
|
69
|
-
async takeScreenshot(url, options = {}) {
|
|
70
|
-
const data = { url, ...options };
|
|
71
|
-
const response = await this.makeRequest('POST', '/utility/web-scraping/screenshot', data);
|
|
72
|
-
return response.data;
|
|
73
|
-
}
|
|
74
|
-
|
|
75
|
-
/**
|
|
76
|
-
* Extract text content from a webpage
|
|
22
|
+
* Fetches and extracts structured data from any publicly accessible URL, including:
|
|
23
|
+
* - Page title and metadata (description, keywords, author)
|
|
24
|
+
* - Open Graph and Twitter card details
|
|
25
|
+
* - Headers and meta tags (content type, viewport, canonical URL, charset)
|
|
26
|
+
* - Structured content extraction (headings, paragraphs, key text elements)
|
|
27
|
+
* - Internal and external links for site structure analysis
|
|
28
|
+
* - Language detection for localization
|
|
29
|
+
* - Timestamped results for tracking
|
|
77
30
|
*
|
|
78
|
-
* @param {string} url - The URL of the webpage to scrape
|
|
79
|
-
* @
|
|
80
|
-
* @param {boolean} [options.javascript=false] - Whether to execute JavaScript on the page
|
|
81
|
-
* @param {number} [options.timeout=30000] - Timeout in milliseconds
|
|
82
|
-
* @param {string} [options.userAgent] - Custom User-Agent string
|
|
83
|
-
* @param {object} [options.headers] - Custom headers to send with the request
|
|
84
|
-
* @param {string} [options.proxy] - Proxy to use for the request
|
|
85
|
-
* @returns {Promise<object>} - The extracted text content
|
|
86
|
-
*/
|
|
87
|
-
async extractText(url, options = {}) {
|
|
88
|
-
const data = { url, ...options };
|
|
89
|
-
const response = await this.makeRequest('POST', '/utility/web-scraping/text', data);
|
|
90
|
-
return response.data;
|
|
91
|
-
}
|
|
92
|
-
|
|
93
|
-
/**
|
|
94
|
-
* Extract links from a webpage
|
|
31
|
+
* @param {string} url - The URL of the webpage to scrape (e.g., 'https://example.com' or 'example.com')
|
|
32
|
+
* @returns {Promise<object>} - The scraped content with metadata, structured data, and links
|
|
95
33
|
*
|
|
96
|
-
* @
|
|
97
|
-
*
|
|
98
|
-
*
|
|
99
|
-
*
|
|
100
|
-
*
|
|
101
|
-
*
|
|
102
|
-
*
|
|
103
|
-
* @returns {Promise<object>} - The extracted links
|
|
34
|
+
* @example
|
|
35
|
+
* // Scrape a webpage
|
|
36
|
+
* const data = await service.scrapeUrl('https://sharpapi.com');
|
|
37
|
+
* console.log(data.title); // Page title
|
|
38
|
+
* console.log(data.meta_description); // Meta description
|
|
39
|
+
* console.log(data.content); // Main content
|
|
40
|
+
* console.log(data.links); // Array of links found
|
|
104
41
|
*/
|
|
105
|
-
async
|
|
106
|
-
const
|
|
107
|
-
|
|
108
|
-
return response.data;
|
|
42
|
+
async scrapeUrl(url) {
|
|
43
|
+
const response = await this.makeRequest('GET', '/utilities/scrape_url', { url });
|
|
44
|
+
return response;
|
|
109
45
|
}
|
|
110
46
|
}
|
|
111
47
|
|
|
112
|
-
module.exports = { SharpApiWebScrapingService };
|
|
48
|
+
module.exports = { SharpApiWebScrapingService };
|