@monostate/node-scraper 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +70 -1
- package/index.d.ts +50 -0
- package/index.js +238 -0
- package/package.json +8 -2
package/README.md
CHANGED
|
@@ -19,7 +19,9 @@ yarn add @monostate/node-scraper
|
|
|
19
19
|
pnpm add @monostate/node-scraper
|
|
20
20
|
```
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
**🤖 New in v1.5.0**: AI-powered Q&A! Ask questions about any website using OpenRouter, OpenAI, or built-in AI. (Note: v1.4.0 was an internal release)
|
|
23
|
+
|
|
24
|
+
**🎉 Also in v1.3.0**: PDF parsing support added! Automatically extracts text, metadata, and page count from PDF documents.
|
|
23
25
|
|
|
24
26
|
**✨ Also in v1.2.0**: Lightpanda binary is now automatically downloaded and configured during installation! No manual setup required.
|
|
25
27
|
|
|
@@ -199,6 +201,73 @@ Clean up resources (close browser instances).
|
|
|
199
201
|
await scraper.cleanup();
|
|
200
202
|
```
|
|
201
203
|
|
|
204
|
+
### 🤖 AI-Powered Q&A
|
|
205
|
+
|
|
206
|
+
Ask questions about any website and get AI-generated answers:
|
|
207
|
+
|
|
208
|
+
```javascript
|
|
209
|
+
// Method 1: Using your own OpenRouter API key
|
|
210
|
+
const scraper = new BNCASmartScraper({
|
|
211
|
+
openRouterApiKey: 'your-openrouter-api-key'
|
|
212
|
+
});
|
|
213
|
+
const result = await scraper.askAI('https://example.com', 'What is this website about?');
|
|
214
|
+
|
|
215
|
+
// Method 2: Using OpenAI API (or compatible endpoints)
|
|
216
|
+
const scraper = new BNCASmartScraper({
|
|
217
|
+
openAIApiKey: 'your-openai-api-key',
|
|
218
|
+
// Optional: Use a compatible endpoint like Groq, Together AI, etc.
|
|
219
|
+
openAIBaseUrl: 'https://api.groq.com/openai'
|
|
220
|
+
});
|
|
221
|
+
const result = await scraper.askAI('https://example.com', 'What services do they offer?');
|
|
222
|
+
|
|
223
|
+
// Method 3: One-liner with OpenRouter
|
|
224
|
+
import { askWebsiteAI } from '@monostate/node-scraper';
|
|
225
|
+
const answer = await askWebsiteAI('https://example.com', 'What is the main topic?', {
|
|
226
|
+
openRouterApiKey: process.env.OPENROUTER_API_KEY
|
|
227
|
+
});
|
|
228
|
+
|
|
229
|
+
// Method 4: Using BNCA backend API (requires BNCA API key)
|
|
230
|
+
const scraper = new BNCASmartScraper({
|
|
231
|
+
apiKey: 'your-bnca-api-key'
|
|
232
|
+
});
|
|
233
|
+
const result = await scraper.askAI('https://example.com', 'What products are featured?');
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
**API Key Priority:**
|
|
237
|
+
1. OpenRouter API key (`openRouterApiKey`)
|
|
238
|
+
2. OpenAI API key (`openAIApiKey`)
|
|
239
|
+
3. BNCA backend API (`apiKey`)
|
|
240
|
+
4. Local fallback (pattern matching - no API key required)
|
|
241
|
+
|
|
242
|
+
**Configuration Options:**
|
|
243
|
+
```javascript
|
|
244
|
+
const result = await scraper.askAI(url, question, {
|
|
245
|
+
// OpenRouter specific
|
|
246
|
+
openRouterApiKey: 'sk-or-...',
|
|
247
|
+
model: 'meta-llama/llama-4-scout:free', // Default model
|
|
248
|
+
|
|
249
|
+
// OpenAI specific
|
|
250
|
+
openAIApiKey: 'sk-...',
|
|
251
|
+
openAIBaseUrl: 'https://api.openai.com', // Or compatible endpoint
|
|
252
|
+
model: 'gpt-3.5-turbo',
|
|
253
|
+
|
|
254
|
+
// Shared options
|
|
255
|
+
temperature: 0.3,
|
|
256
|
+
maxTokens: 500
|
|
257
|
+
});
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
**Response Format:**
|
|
261
|
+
```javascript
|
|
262
|
+
{
|
|
263
|
+
success: true,
|
|
264
|
+
answer: "This website is about...",
|
|
265
|
+
method: "direct-fetch", // Scraping method used
|
|
266
|
+
scrapeTime: 1234, // Time to scrape in ms
|
|
267
|
+
processing: "openrouter" // AI processing method used
|
|
268
|
+
}
|
|
269
|
+
```
|
|
270
|
+
|
|
202
271
|
### 📄 PDF Support
|
|
203
272
|
|
|
204
273
|
BNCA automatically detects and parses PDF documents:
|
package/index.d.ts
CHANGED
|
@@ -13,6 +13,24 @@ export interface ScrapingOptions {
|
|
|
13
13
|
lightpandaPath?: string;
|
|
14
14
|
/** Custom user agent string */
|
|
15
15
|
userAgent?: string;
|
|
16
|
+
/** BNCA API key for backend services */
|
|
17
|
+
apiKey?: string;
|
|
18
|
+
/** BNCA API URL (defaults to https://bnca-api.fly.dev) */
|
|
19
|
+
apiUrl?: string;
|
|
20
|
+
/** OpenRouter API key for AI processing */
|
|
21
|
+
openRouterApiKey?: string;
|
|
22
|
+
/** OpenAI API key for AI processing */
|
|
23
|
+
openAIApiKey?: string;
|
|
24
|
+
/** OpenAI base URL (for compatible endpoints) */
|
|
25
|
+
openAIBaseUrl?: string;
|
|
26
|
+
/** AI model to use */
|
|
27
|
+
model?: string;
|
|
28
|
+
/** AI temperature setting */
|
|
29
|
+
temperature?: number;
|
|
30
|
+
/** Maximum tokens for AI response */
|
|
31
|
+
maxTokens?: number;
|
|
32
|
+
/** HTTP referer for OpenRouter */
|
|
33
|
+
referer?: string;
|
|
16
34
|
}
|
|
17
35
|
|
|
18
36
|
export interface ScrapingResult {
|
|
@@ -146,6 +164,22 @@ export class BNCASmartScraper {
|
|
|
146
164
|
* @returns Promise resolving to screenshot result
|
|
147
165
|
*/
|
|
148
166
|
quickshot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
167
|
+
|
|
168
|
+
/**
|
|
169
|
+
* Ask AI a question about a URL
|
|
170
|
+
* @param url The URL to analyze
|
|
171
|
+
* @param question The question to answer about the page
|
|
172
|
+
* @param options Optional configuration overrides
|
|
173
|
+
* @returns Promise resolving to AI answer
|
|
174
|
+
*/
|
|
175
|
+
askAI(url: string, question: string, options?: ScrapingOptions): Promise<{
|
|
176
|
+
success: boolean;
|
|
177
|
+
answer?: string;
|
|
178
|
+
error?: string;
|
|
179
|
+
method?: string;
|
|
180
|
+
scrapeTime?: number;
|
|
181
|
+
processing?: 'openrouter' | 'openai' | 'backend' | 'local';
|
|
182
|
+
}>;
|
|
149
183
|
|
|
150
184
|
/**
|
|
151
185
|
* Get performance statistics for all methods
|
|
@@ -248,6 +282,22 @@ export function smartScreenshot(url: string, options?: ScrapingOptions): Promise
|
|
|
248
282
|
*/
|
|
249
283
|
export function quickShot(url: string, options?: ScrapingOptions): Promise<ScrapingResult>;
|
|
250
284
|
|
|
285
|
+
/**
|
|
286
|
+
* Convenience function for asking AI questions about a webpage
|
|
287
|
+
* @param url The URL to analyze
|
|
288
|
+
* @param question The question to answer
|
|
289
|
+
* @param options Optional configuration
|
|
290
|
+
* @returns Promise resolving to AI answer
|
|
291
|
+
*/
|
|
292
|
+
export function askWebsiteAI(url: string, question: string, options?: ScrapingOptions): Promise<{
|
|
293
|
+
success: boolean;
|
|
294
|
+
answer?: string;
|
|
295
|
+
error?: string;
|
|
296
|
+
method?: string;
|
|
297
|
+
scrapeTime?: number;
|
|
298
|
+
processing?: 'openrouter' | 'openai' | 'backend' | 'local';
|
|
299
|
+
}>;
|
|
300
|
+
|
|
251
301
|
/**
|
|
252
302
|
* Default export - same as BNCASmartScraper class
|
|
253
303
|
*/
|
package/index.js
CHANGED
|
@@ -48,6 +48,232 @@ export class BNCASmartScraper {
|
|
|
48
48
|
};
|
|
49
49
|
}
|
|
50
50
|
|
|
51
|
+
/**
|
|
52
|
+
* Ask AI a question about a URL
|
|
53
|
+
* Scrapes the URL and uses AI to answer the question
|
|
54
|
+
*
|
|
55
|
+
* @param {string} url - URL to analyze
|
|
56
|
+
* @param {string} question - Question to answer
|
|
57
|
+
* @param {object} options - Additional options
|
|
58
|
+
* @returns {Promise<object>} AI response with answer
|
|
59
|
+
*/
|
|
60
|
+
async askAI(url, question, options = {}) {
|
|
61
|
+
try {
|
|
62
|
+
// First scrape the content
|
|
63
|
+
const scrapeResult = await this.scrape(url, options);
|
|
64
|
+
|
|
65
|
+
if (!scrapeResult.success) {
|
|
66
|
+
return {
|
|
67
|
+
success: false,
|
|
68
|
+
error: `Failed to scrape URL: ${scrapeResult.error}`,
|
|
69
|
+
method: scrapeResult.method
|
|
70
|
+
};
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
// Check for OpenRouter/OpenAI API key
|
|
74
|
+
const openRouterKey = options.openRouterApiKey || this.options.openRouterApiKey || process.env.OPENROUTER_API_KEY;
|
|
75
|
+
const openAIKey = options.openAIApiKey || this.options.openAIApiKey || process.env.OPENAI_API_KEY;
|
|
76
|
+
|
|
77
|
+
// Priority: OpenRouter > OpenAI > Backend API > Local
|
|
78
|
+
if (openRouterKey) {
|
|
79
|
+
try {
|
|
80
|
+
const answer = await this.processWithOpenRouter(question, scrapeResult.content, openRouterKey, options);
|
|
81
|
+
return {
|
|
82
|
+
success: true,
|
|
83
|
+
answer,
|
|
84
|
+
method: scrapeResult.method,
|
|
85
|
+
scrapeTime: scrapeResult.stats.totalTime,
|
|
86
|
+
processing: 'openrouter'
|
|
87
|
+
};
|
|
88
|
+
} catch (error) {
|
|
89
|
+
this.log(' ⚠️ OpenRouter API call failed, falling back...');
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
if (openAIKey) {
|
|
94
|
+
try {
|
|
95
|
+
const answer = await this.processWithOpenAI(question, scrapeResult.content, openAIKey, options);
|
|
96
|
+
return {
|
|
97
|
+
success: true,
|
|
98
|
+
answer,
|
|
99
|
+
method: scrapeResult.method,
|
|
100
|
+
scrapeTime: scrapeResult.stats.totalTime,
|
|
101
|
+
processing: 'openai'
|
|
102
|
+
};
|
|
103
|
+
} catch (error) {
|
|
104
|
+
this.log(' ⚠️ OpenAI API call failed, falling back...');
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
// If BNCA API key is provided, use the backend API
|
|
109
|
+
if (this.options.apiKey) {
|
|
110
|
+
try {
|
|
111
|
+
const response = await fetch(`${this.options.apiUrl || 'https://bnca-api.fly.dev'}/aireply`, {
|
|
112
|
+
method: 'POST',
|
|
113
|
+
headers: {
|
|
114
|
+
'x-api-key': this.options.apiKey,
|
|
115
|
+
'Content-Type': 'application/json'
|
|
116
|
+
},
|
|
117
|
+
body: JSON.stringify({ url, question })
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
if (response.ok) {
|
|
121
|
+
const data = await response.json();
|
|
122
|
+
return {
|
|
123
|
+
success: true,
|
|
124
|
+
answer: data.answer,
|
|
125
|
+
method: scrapeResult.method,
|
|
126
|
+
scrapeTime: scrapeResult.stats.totalTime,
|
|
127
|
+
processing: 'backend'
|
|
128
|
+
};
|
|
129
|
+
}
|
|
130
|
+
} catch (error) {
|
|
131
|
+
this.log(' ⚠️ Backend API call failed, using local AI processing');
|
|
132
|
+
}
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
// Local AI processing fallback
|
|
136
|
+
const answer = this.processLocally(question, scrapeResult.content);
|
|
137
|
+
|
|
138
|
+
return {
|
|
139
|
+
success: true,
|
|
140
|
+
answer,
|
|
141
|
+
method: scrapeResult.method,
|
|
142
|
+
scrapeTime: scrapeResult.stats.totalTime,
|
|
143
|
+
processing: 'local'
|
|
144
|
+
};
|
|
145
|
+
|
|
146
|
+
} catch (error) {
|
|
147
|
+
return {
|
|
148
|
+
success: false,
|
|
149
|
+
error: error.message || 'AI processing failed'
|
|
150
|
+
};
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
/**
|
|
155
|
+
* Process with OpenRouter API
|
|
156
|
+
* @private
|
|
157
|
+
*/
|
|
158
|
+
async processWithOpenRouter(question, content, apiKey, options = {}) {
|
|
159
|
+
const parsedContent = typeof content === 'string' ? JSON.parse(content) : content;
|
|
160
|
+
|
|
161
|
+
const contentText = `
|
|
162
|
+
Title: ${parsedContent.title || 'Unknown'}
|
|
163
|
+
Content: ${parsedContent.content || parsedContent.bodyText || 'No content available'}
|
|
164
|
+
Meta Description: ${parsedContent.metaDescription || 'None'}
|
|
165
|
+
${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h => `- ${h.text || h}`).join('\n')}` : ''}
|
|
166
|
+
`.trim();
|
|
167
|
+
|
|
168
|
+
const response = await fetch('https://openrouter.ai/api/v1/chat/completions', {
|
|
169
|
+
method: 'POST',
|
|
170
|
+
headers: {
|
|
171
|
+
'Content-Type': 'application/json',
|
|
172
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
173
|
+
'HTTP-Referer': options.referer || 'https://github.com/monostate/node-scraper',
|
|
174
|
+
'X-Title': 'BNCA Node Scraper',
|
|
175
|
+
},
|
|
176
|
+
body: JSON.stringify({
|
|
177
|
+
model: options.model || 'meta-llama/llama-4-scout:free',
|
|
178
|
+
messages: [
|
|
179
|
+
{
|
|
180
|
+
role: 'system',
|
|
181
|
+
content: 'You are a helpful assistant that answers questions based on website content. Provide accurate, concise answers based only on the provided content.'
|
|
182
|
+
},
|
|
183
|
+
{
|
|
184
|
+
role: 'user',
|
|
185
|
+
content: `Based on the following website content, please answer this question: ${question}\n\nWebsite content:\n${contentText}`
|
|
186
|
+
}
|
|
187
|
+
],
|
|
188
|
+
temperature: options.temperature || 0.3,
|
|
189
|
+
max_tokens: options.maxTokens || 500,
|
|
190
|
+
}),
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
if (!response.ok) {
|
|
194
|
+
throw new Error(`OpenRouter API error: ${response.status}`);
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
const data = await response.json();
|
|
198
|
+
return data.choices[0]?.message?.content || 'No response from AI';
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Process with OpenAI API
|
|
203
|
+
* @private
|
|
204
|
+
*/
|
|
205
|
+
async processWithOpenAI(question, content, apiKey, options = {}) {
|
|
206
|
+
const parsedContent = typeof content === 'string' ? JSON.parse(content) : content;
|
|
207
|
+
|
|
208
|
+
const contentText = `
|
|
209
|
+
Title: ${parsedContent.title || 'Unknown'}
|
|
210
|
+
Content: ${parsedContent.content || parsedContent.bodyText || 'No content available'}
|
|
211
|
+
Meta Description: ${parsedContent.metaDescription || 'None'}
|
|
212
|
+
${parsedContent.headings?.length ? `\nHeadings:\n${parsedContent.headings.map(h => `- ${h.text || h}`).join('\n')}` : ''}
|
|
213
|
+
`.trim();
|
|
214
|
+
|
|
215
|
+
const baseUrl = options.openAIBaseUrl || 'https://api.openai.com';
|
|
216
|
+
const response = await fetch(`${baseUrl}/v1/chat/completions`, {
|
|
217
|
+
method: 'POST',
|
|
218
|
+
headers: {
|
|
219
|
+
'Content-Type': 'application/json',
|
|
220
|
+
'Authorization': `Bearer ${apiKey}`,
|
|
221
|
+
},
|
|
222
|
+
body: JSON.stringify({
|
|
223
|
+
model: options.model || 'gpt-3.5-turbo',
|
|
224
|
+
messages: [
|
|
225
|
+
{
|
|
226
|
+
role: 'system',
|
|
227
|
+
content: 'You are a helpful assistant that answers questions based on website content. Provide accurate, concise answers based only on the provided content.'
|
|
228
|
+
},
|
|
229
|
+
{
|
|
230
|
+
role: 'user',
|
|
231
|
+
content: `Based on the following website content, please answer this question: ${question}\n\nWebsite content:\n${contentText}`
|
|
232
|
+
}
|
|
233
|
+
],
|
|
234
|
+
temperature: options.temperature || 0.3,
|
|
235
|
+
max_tokens: options.maxTokens || 500,
|
|
236
|
+
}),
|
|
237
|
+
});
|
|
238
|
+
|
|
239
|
+
if (!response.ok) {
|
|
240
|
+
throw new Error(`OpenAI API error: ${response.status}`);
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
const data = await response.json();
|
|
244
|
+
return data.choices[0]?.message?.content || 'No response from AI';
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
/**
|
|
248
|
+
* Local AI processing (simple pattern matching)
|
|
249
|
+
* @private
|
|
250
|
+
*/
|
|
251
|
+
processLocally(question, content) {
|
|
252
|
+
const parsedContent = typeof content === 'string' ?
|
|
253
|
+
JSON.parse(content) : content;
|
|
254
|
+
|
|
255
|
+
const title = parsedContent.title || 'Unknown';
|
|
256
|
+
const text = parsedContent.content || parsedContent.bodyText || '';
|
|
257
|
+
const lowerQuestion = question.toLowerCase();
|
|
258
|
+
|
|
259
|
+
if (lowerQuestion.includes('title')) {
|
|
260
|
+
return `The page title is "${title}".`;
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
if (lowerQuestion.includes('about') || lowerQuestion.includes('what')) {
|
|
264
|
+
return `This page titled "${title}" contains: ${text.substring(0, 200)}...`;
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
if (lowerQuestion.includes('contact') || lowerQuestion.includes('email')) {
|
|
268
|
+
const emailMatch = text.match(/[\w.-]+@[\w.-]+\.\w+/);
|
|
269
|
+
return emailMatch ?
|
|
270
|
+
`Found contact: ${emailMatch[0]}` :
|
|
271
|
+
'No contact information found.';
|
|
272
|
+
}
|
|
273
|
+
|
|
274
|
+
return `Based on "${title}": ${text.substring(0, 150)}...`;
|
|
275
|
+
}
|
|
276
|
+
|
|
51
277
|
/**
|
|
52
278
|
* Main scraping method with intelligent fallback
|
|
53
279
|
*/
|
|
@@ -1165,4 +1391,16 @@ export async function quickShot(url, options = {}) {
|
|
|
1165
1391
|
}
|
|
1166
1392
|
}
|
|
1167
1393
|
|
|
1394
|
+
export async function askWebsiteAI(url, question, options = {}) {
|
|
1395
|
+
const scraper = new BNCASmartScraper(options);
|
|
1396
|
+
try {
|
|
1397
|
+
const result = await scraper.askAI(url, question, options);
|
|
1398
|
+
return result;
|
|
1399
|
+
} catch (error) {
|
|
1400
|
+
throw error;
|
|
1401
|
+
} finally {
|
|
1402
|
+
await scraper.cleanup();
|
|
1403
|
+
}
|
|
1404
|
+
}
|
|
1405
|
+
|
|
1168
1406
|
export default BNCASmartScraper;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@monostate/node-scraper",
|
|
3
|
-
"version": "1.3.0",
|
|
4
|
-
"description": "Intelligent web scraping with PDF support and multi-level fallback system - …",
|
|
3
|
+
"version": "1.5.0",
|
|
4
|
+
"description": "Intelligent web scraping with AI Q&A, PDF support and multi-level fallback system - 11x faster than traditional scrapers",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "index.js",
|
|
7
7
|
"types": "index.d.ts",
|
|
@@ -32,6 +32,12 @@
|
|
|
32
32
|
"data-extraction",
|
|
33
33
|
"automation",
|
|
34
34
|
"browser",
|
|
35
|
+
"ai-powered",
|
|
36
|
+
"question-answering",
|
|
37
|
+
"pdf-parsing",
|
|
38
|
+
"openrouter",
|
|
39
|
+
"openai",
|
|
40
|
+
"llm",
|
|
35
41
|
"nextjs",
|
|
36
42
|
"react",
|
|
37
43
|
"performance",
|