@ignidor/web-search-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +267 -0
- package/bin/web-search-mcp.js +13 -0
- package/dist/crawl4ai-client.d.ts +238 -0
- package/dist/crawl4ai-client.d.ts.map +1 -0
- package/dist/crawl4ai-client.js +608 -0
- package/dist/crawl4ai-client.js.map +1 -0
- package/dist/index.d.ts +39 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +561 -0
- package/dist/index.js.map +1 -0
- package/dist/playwright-crawler.d.ts +92 -0
- package/dist/playwright-crawler.d.ts.map +1 -0
- package/dist/playwright-crawler.js +454 -0
- package/dist/playwright-crawler.js.map +1 -0
- package/dist/ranking.d.ts +58 -0
- package/dist/ranking.d.ts.map +1 -0
- package/dist/ranking.js +218 -0
- package/dist/ranking.js.map +1 -0
- package/dist/search.d.ts +15 -0
- package/dist/search.d.ts.map +1 -0
- package/dist/search.js +187 -0
- package/dist/search.js.map +1 -0
- package/dist/types/index.d.ts +131 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/index.js +3 -0
- package/dist/types/index.js.map +1 -0
- package/dist/utils/concurrency.d.ts +24 -0
- package/dist/utils/concurrency.d.ts.map +1 -0
- package/dist/utils/concurrency.js +53 -0
- package/dist/utils/concurrency.js.map +1 -0
- package/dist/utils/validators.d.ts +21 -0
- package/dist/utils/validators.d.ts.map +1 -0
- package/dist/utils/validators.js +75 -0
- package/dist/utils/validators.js.map +1 -0
- package/package.json +77 -0
package/README.md
ADDED
|
@@ -0,0 +1,267 @@
|
|
|
1
|
+
# @ignidor/web-search-mcp
|
|
2
|
+
|
|
3
|
+
**Local, unlimited web-search MCP server with BM25 ranking and Playwright crawling.**
|
|
4
|
+
|
|
5
|
+
- 🔍 **No API keys** - Uses free DuckDuckGo HTML search
|
|
6
|
+
- 🚀 **No rate limits** - Unlimited searches, 24/7
|
|
7
|
+
- 🐳 **No Docker** - Direct Playwright integration (optional)
|
|
8
|
+
- 📊 **Smart ranking** - BM25 + hybrid scoring with freshness
|
|
9
|
+
- 📄 **Full extraction** - 1000+ words per page (not 200-word snippets)
|
|
10
|
+
- 💰 **100% Free** - Outperforms Brave Search, Tavily, and other commercial alternatives
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Features
|
|
15
|
+
|
|
16
|
+
| Tool | Description |
|
|
17
|
+
|------|-------------|
|
|
18
|
+
| `search` | Fast web search with BM25 ranking (DuckDuckGo) |
|
|
19
|
+
| `crawl_and_extract` | Extract full content from URLs using Playwright |
|
|
20
|
+
| `search_and_crawl` | Search + extract top results (one-stop research) |
|
|
21
|
+
| `capture_screenshot` | Screenshot any webpage (base64 PNG) |
|
|
22
|
+
| `generate_pdf` | Convert webpage to PDF (base64) |
|
|
23
|
+
| `extract_structured` | CSS selector-based data extraction |
|
|
24
|
+
| `execute_js` | Run custom JavaScript on webpages |
|
|
25
|
+
| `extract_regex` | Extract emails, phones, URLs, dates (21 patterns) |
|
|
26
|
+
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
### Installation (via npx)
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
npx @ignidor/web-search-mcp
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### Claude Desktop / Cursor / Windsurf Config
|
|
38
|
+
|
|
39
|
+
**For npx usage (recommended):**
|
|
40
|
+
|
|
41
|
+
```json
|
|
42
|
+
{
|
|
43
|
+
"mcpServers": {
|
|
44
|
+
"web-search": {
|
|
45
|
+
"command": "npx",
|
|
46
|
+
"args": ["-y", "@ignidor/web-search-mcp"]
|
|
47
|
+
}
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
**For local/SSH usage:**
|
|
53
|
+
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"mcpServers": {
|
|
57
|
+
"web-search": {
|
|
58
|
+
"command": "node",
|
|
59
|
+
"args": ["/path/to/dist/index.js"]
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
}
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
---
|
|
66
|
+
|
|
67
|
+
## Tool Examples
|
|
68
|
+
|
|
69
|
+
### 1. Search with BM25 Ranking
|
|
70
|
+
|
|
71
|
+
```javascript
|
|
72
|
+
// Search for anything - unlimited queries, no API key
|
|
73
|
+
{
|
|
74
|
+
"name": "search",
|
|
75
|
+
"arguments": {
|
|
76
|
+
"query": "Rust programming language tutorial",
|
|
77
|
+
"limit": 10,
|
|
78
|
+
"rankingMode": "hybrid" // 'bm25' or 'hybrid'
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### 2. Search + Extract Full Content
|
|
84
|
+
|
|
85
|
+
```javascript
|
|
86
|
+
// Best for deep research - gets full articles, not snippets
|
|
87
|
+
{
|
|
88
|
+
"name": "search_and_crawl",
|
|
89
|
+
"arguments": {
|
|
90
|
+
"query": "AWS DynamoDB batchWrite bug fix",
|
|
91
|
+
"extractTopN": 5,
|
|
92
|
+
"rerankAfterExtract": true
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
**Result:** 8,000+ words of detailed content including:
|
|
98
|
+
- Root cause analysis
|
|
99
|
+
- Step-by-step fixes
|
|
100
|
+
- Complete code examples
|
|
101
|
+
- Common pitfalls
|
|
102
|
+
|
|
103
|
+
### 3. Extract Structured Data
|
|
104
|
+
|
|
105
|
+
```javascript
|
|
106
|
+
// Scrape product listings, articles, etc.
|
|
107
|
+
{
|
|
108
|
+
"name": "extract_structured",
|
|
109
|
+
"arguments": {
|
|
110
|
+
"url": "https://example.com/products",
|
|
111
|
+
"baseSelector": ".product",
|
|
112
|
+
"fields": [
|
|
113
|
+
{ "name": "title", "selector": "h2", "type": "text" },
|
|
114
|
+
{ "name": "price", "selector": ".price", "type": "text" },
|
|
115
|
+
{ "name": "link", "selector": "a", "type": "attribute", "attribute": "href" }
|
|
116
|
+
]
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### 4. Execute JavaScript
|
|
122
|
+
|
|
123
|
+
```javascript
|
|
124
|
+
// Great for dynamic content, debugging
|
|
125
|
+
{
|
|
126
|
+
"name": "execute_js",
|
|
127
|
+
"arguments": {
|
|
128
|
+
"url": "https://example.com",
|
|
129
|
+
"scripts": [
|
|
130
|
+
"return document.title",
|
|
131
|
+
"return document.links.length",
|
|
132
|
+
"return document.URL"
|
|
133
|
+
]
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
### 5. Screenshot
|
|
139
|
+
|
|
140
|
+
```javascript
|
|
141
|
+
{
|
|
142
|
+
"name": "capture_screenshot",
|
|
143
|
+
"arguments": {
|
|
144
|
+
"url": "https://example.com",
|
|
145
|
+
"waitFor": 2 // seconds
|
|
146
|
+
}
|
|
147
|
+
}
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### 6. Regex Extraction
|
|
151
|
+
|
|
152
|
+
```javascript
|
|
153
|
+
// Extract emails, phones, URLs, etc.
|
|
154
|
+
{
|
|
155
|
+
"name": "extract_regex",
|
|
156
|
+
"arguments": {
|
|
157
|
+
"url": "https://example.com/contact",
|
|
158
|
+
"patterns": ["email", "phone_intl", "url"]
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
**21 built-in patterns:** `email`, `phone_intl`, `phone_us`, `url`, `ipv4`, `ipv6`, `uuid`, `currency`, `percentage`, `number`, `date_iso`, `date_us`, `time_24h`, `postal_us`, `postal_uk`, `hex_color`, `twitter_handle`, `hashtag`, `mac_addr`, `iban`, `credit_card` (plus the special value `all`)
|
|
164
|
+
|
|
165
|
+
---
|
|
166
|
+
|
|
167
|
+
## Playwright Setup (Optional but Recommended)
|
|
168
|
+
|
|
169
|
+
For full functionality (crawling, screenshots, PDFs, JS execution), install Playwright browsers:
|
|
170
|
+
|
|
171
|
+
```bash
|
|
172
|
+
npx playwright install chromium
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
**Without Playwright:** Only the `search` tool works (DuckDuckGo results only).
|
|
176
|
+
|
|
177
|
+
**With Playwright:** All tools work with full content extraction.
|
|
178
|
+
|
|
179
|
+
---
|
|
180
|
+
|
|
181
|
+
## Why This Over Brave Search?
|
|
182
|
+
|
|
183
|
+
| Feature | Brave Free | This MCP |
|
|
184
|
+
|---------|-----------|----------|
|
|
185
|
+
| Cost | Free tier only | **100% Free** |
|
|
186
|
+
| Rate Limits | 2,000 requests/month | **Unlimited** |
|
|
187
|
+
| Content Depth | ~200-word snippet | **1,000+ words** |
|
|
188
|
+
| Ranking | Black-box | **Transparent BM25** |
|
|
189
|
+
| Infrastructure | Cloud API | **Local control** |
|
|
190
|
+
| API Key | Required | **Not needed** |
|
|
191
|
+
|
|
192
|
+
---
|
|
193
|
+
|
|
194
|
+
## Architecture
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
198
|
+
│ Claude Desktop / Cursor │
|
|
199
|
+
└───────────────────────────────┬─────────────────────────────────┘
|
|
200
|
+
│ MCP (stdio)
|
|
201
|
+
▼
|
|
202
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
203
|
+
│ @ignidor/web-search-mcp │
|
|
204
|
+
│ ┌───────────────────────────────────────────────────────────┐ │
|
|
205
|
+
│ │ Tool Router │ │
|
|
206
|
+
│ │ • search → DuckDuckGo + BM25 ranking │ │
|
|
207
|
+
│ │ • crawl_and_extract → Playwright → Markdown │ │
|
|
208
|
+
│ │ • search_and_crawl → Combined (search + extract) │ │
|
|
209
|
+
│ │ • capture_screenshot → Playwright → base64 PNG │ │
|
|
210
|
+
│ │ • generate_pdf → Playwright → base64 PDF │ │
|
|
211
|
+
│ │ • extract_structured → Playwright → CSS extraction │ │
|
|
212
|
+
│ │ • execute_js → Playwright → JS results │ │
|
|
213
|
+
│ │ • extract_regex → Playwright → 21 patterns │ │
|
|
214
|
+
│ └───────────────────────────┬───────────────────────────────┘ │
|
|
215
|
+
│ │ │
|
|
216
|
+
│ ┌───────────────────────────▼───────────────────────────────┐ │
|
|
217
|
+
│ │ Ranking Engine (BM25 + Hybrid) │ │
|
|
218
|
+
│ │ • fast-bm25 package for scoring │ │
|
|
219
|
+
│ │ • Freshness scoring (exponential decay) │ │
|
|
220
|
+
│ │ • Domain authority heuristics │ │
|
|
221
|
+
│ └───────────────────────────────────────────────────────────┘ │
|
|
222
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
223
|
+
│
|
|
224
|
+
▼
|
|
225
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
226
|
+
│ Playwright (optional) │
|
|
227
|
+
│ • Chromium browser for dynamic content │
|
|
228
|
+
│ • Screenshot, PDF generation │
|
|
229
|
+
│ • JavaScript execution │
|
|
230
|
+
└─────────────────────────────────────────────────────────────────┘
|
|
231
|
+
```
|
|
232
|
+
|
|
233
|
+
---
|
|
234
|
+
|
|
235
|
+
## Development
|
|
236
|
+
|
|
237
|
+
```bash
|
|
238
|
+
# Clone repo
|
|
239
|
+
git clone https://github.com/JayaBigDataIsCool/ignidor-web-search-mcp.git
|
|
240
|
+
cd ignidor-web-search-mcp
|
|
241
|
+
|
|
242
|
+
# Install dependencies
|
|
243
|
+
npm install
|
|
244
|
+
|
|
245
|
+
# Install Playwright (optional but recommended)
|
|
246
|
+
npx playwright install chromium
|
|
247
|
+
|
|
248
|
+
# Build
|
|
249
|
+
npm run build
|
|
250
|
+
|
|
251
|
+
# Run locally
|
|
252
|
+
npm start
|
|
253
|
+
```
|
|
254
|
+
|
|
255
|
+
---
|
|
256
|
+
|
|
257
|
+
## License
|
|
258
|
+
|
|
259
|
+
MIT © Ignidor Team
|
|
260
|
+
|
|
261
|
+
---
|
|
262
|
+
|
|
263
|
+
## Links
|
|
264
|
+
|
|
265
|
+
- [GitHub](https://github.com/JayaBigDataIsCool/ignidor-web-search-mcp)
|
|
266
|
+
- [Issues](https://github.com/JayaBigDataIsCool/ignidor-web-search-mcp/issues)
|
|
267
|
+
- [NPM](https://www.npmjs.com/package/@ignidor/web-search-mcp)
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// CLI Entry Point for @ignidor/web-search-mcp Server
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
import { dirname, join } from 'path';
|
|
5
|
+
|
|
6
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
7
|
+
const __dirname = dirname(__filename);
|
|
8
|
+
|
|
9
|
+
// Dynamically import the compiled server entry point; if it fails to load, log the error and exit with code 1
|
|
10
|
+
import('../dist/index.js').catch(error => {
|
|
11
|
+
console.error('Failed to start MCP server:', error);
|
|
12
|
+
process.exit(1);
|
|
13
|
+
});
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
export interface Crawl4AIConfig {
|
|
2
|
+
extractDepth?: 'basic' | 'advanced';
|
|
3
|
+
includeImages?: boolean;
|
|
4
|
+
waitForSelector?: string;
|
|
5
|
+
executeJavascript?: boolean;
|
|
6
|
+
timeout?: number;
|
|
7
|
+
}
|
|
8
|
+
export interface Crawl4AIResponse {
|
|
9
|
+
success: boolean;
|
|
10
|
+
url?: string;
|
|
11
|
+
markdown?: string;
|
|
12
|
+
cleaned_html?: string;
|
|
13
|
+
title?: string;
|
|
14
|
+
links?: {
|
|
15
|
+
internal: number;
|
|
16
|
+
external: number;
|
|
17
|
+
};
|
|
18
|
+
media?: {
|
|
19
|
+
images: string[];
|
|
20
|
+
};
|
|
21
|
+
wordCount?: number;
|
|
22
|
+
error?: string;
|
|
23
|
+
}
|
|
24
|
+
export interface MarkdownFormatOptions {
|
|
25
|
+
format?: 'raw' | 'fit' | 'bm25' | 'llm';
|
|
26
|
+
query?: string;
|
|
27
|
+
bypassCache?: boolean;
|
|
28
|
+
}
|
|
29
|
+
export interface ScreenshotOptions {
|
|
30
|
+
waitFor?: number;
|
|
31
|
+
fullPath?: string;
|
|
32
|
+
}
|
|
33
|
+
export interface CSSExtractionSchema {
|
|
34
|
+
name: string;
|
|
35
|
+
baseSelector: string;
|
|
36
|
+
baseFields?: Array<{
|
|
37
|
+
name: string;
|
|
38
|
+
selector?: string;
|
|
39
|
+
type: string;
|
|
40
|
+
attribute?: string;
|
|
41
|
+
}>;
|
|
42
|
+
fields: Array<{
|
|
43
|
+
name: string;
|
|
44
|
+
selector: string;
|
|
45
|
+
type: 'text' | 'html' | 'attribute' | 'nested' | 'list' | 'nested_list';
|
|
46
|
+
attribute?: string;
|
|
47
|
+
fields?: any[];
|
|
48
|
+
}>;
|
|
49
|
+
}
|
|
50
|
+
export interface JSExecutionOptions {
|
|
51
|
+
scripts: string[];
|
|
52
|
+
}
|
|
53
|
+
export type DeepCrawlStrategyType = 'bfs' | 'dfs' | 'best_first';
|
|
54
|
+
export interface DeepCrawlOptions {
|
|
55
|
+
maxDepth?: number;
|
|
56
|
+
includeExternal?: boolean;
|
|
57
|
+
maxPages?: number;
|
|
58
|
+
scoreThreshold?: number;
|
|
59
|
+
stream?: boolean;
|
|
60
|
+
strategy?: DeepCrawlStrategyType;
|
|
61
|
+
filterChain?: FilterChain;
|
|
62
|
+
urlScorer?: UrlScorer;
|
|
63
|
+
}
|
|
64
|
+
export interface FilterChain {
|
|
65
|
+
filters: Array<{
|
|
66
|
+
type: 'url_pattern' | 'domain' | 'content_type' | 'seo' | 'content_relevance';
|
|
67
|
+
config?: Record<string, unknown>;
|
|
68
|
+
}>;
|
|
69
|
+
}
|
|
70
|
+
export interface UrlScorer {
|
|
71
|
+
type: 'keyword_relevance';
|
|
72
|
+
keywords: string[];
|
|
73
|
+
weight?: number;
|
|
74
|
+
}
|
|
75
|
+
export interface DeepCrawlResult {
|
|
76
|
+
url: string;
|
|
77
|
+
depth: number;
|
|
78
|
+
score?: number;
|
|
79
|
+
success: boolean;
|
|
80
|
+
markdown?: string;
|
|
81
|
+
title?: string;
|
|
82
|
+
error?: string;
|
|
83
|
+
}
|
|
84
|
+
export type AdaptiveStrategyType = 'statistical' | 'embedding';
|
|
85
|
+
export interface AdaptiveCrawlOptions {
|
|
86
|
+
strategy?: AdaptiveStrategyType;
|
|
87
|
+
confidenceThreshold?: number;
|
|
88
|
+
maxPages?: number;
|
|
89
|
+
topKLinks?: number;
|
|
90
|
+
minGainThreshold?: number;
|
|
91
|
+
saveState?: boolean;
|
|
92
|
+
statePath?: string;
|
|
93
|
+
embeddingModel?: string;
|
|
94
|
+
}
|
|
95
|
+
export interface AdaptiveCrawlResult {
|
|
96
|
+
success: boolean;
|
|
97
|
+
confidence: number;
|
|
98
|
+
pagesCrawled: number;
|
|
99
|
+
relevantContent?: Array<{
|
|
100
|
+
url: string;
|
|
101
|
+
score: number;
|
|
102
|
+
content: string;
|
|
103
|
+
}>;
|
|
104
|
+
metrics?: {
|
|
105
|
+
coverage: number;
|
|
106
|
+
consistency: number;
|
|
107
|
+
saturation: number;
|
|
108
|
+
};
|
|
109
|
+
error?: string;
|
|
110
|
+
}
|
|
111
|
+
export type RegexPatternType = 'email' | 'phone_intl' | 'phone_us' | 'url' | 'ipv4' | 'ipv6' | 'uuid' | 'currency' | 'percentage' | 'number' | 'date_iso' | 'date_us' | 'time_24h' | 'postal_us' | 'postal_uk' | 'hex_color' | 'twitter_handle' | 'hashtag' | 'mac_addr' | 'iban' | 'credit_card' | 'all';
|
|
112
|
+
export interface RegexExtractionOptions {
|
|
113
|
+
pattern?: RegexPatternType | RegexPatternType[];
|
|
114
|
+
custom?: Record<string, string>;
|
|
115
|
+
}
|
|
116
|
+
export interface ContentFilterOptions {
|
|
117
|
+
type?: 'pruning' | 'bm25';
|
|
118
|
+
threshold?: number;
|
|
119
|
+
thresholdType?: 'fixed' | 'dynamic';
|
|
120
|
+
minWordThreshold?: number;
|
|
121
|
+
query?: string;
|
|
122
|
+
}
|
|
123
|
+
export declare class Crawl4AIClient {
|
|
124
|
+
private baseUrl;
|
|
125
|
+
private timeout;
|
|
126
|
+
private available;
|
|
127
|
+
constructor(baseUrl?: string, timeout?: number);
|
|
128
|
+
/**
|
|
129
|
+
* Check if Crawl4AI service is available.
|
|
130
|
+
* Gracefully handles the case where the service is not running.
|
|
131
|
+
*/
|
|
132
|
+
checkHealth(): Promise<boolean>;
|
|
133
|
+
/**
|
|
134
|
+
* Extract full content from a URL using Crawl4AI /crawl endpoint.
|
|
135
|
+
* Returns null if service is unavailable.
|
|
136
|
+
*/
|
|
137
|
+
extractUrl(url: string, config?: Crawl4AIConfig): Promise<Crawl4AIResponse | null>;
|
|
138
|
+
/**
|
|
139
|
+
* Extract multiple URLs in parallel with concurrency control.
|
|
140
|
+
*/
|
|
141
|
+
extractMultipleUrls(urls: string[], config?: Crawl4AIConfig, concurrency?: number): Promise<Map<string, Crawl4AIResponse>>;
|
|
142
|
+
/**
|
|
143
|
+
* Generate markdown from a URL with formatting options.
|
|
144
|
+
* Formats: raw (raw content), fit (clean content), bm25 (BM25 ranked), llm (LLM optimized)
|
|
145
|
+
*/
|
|
146
|
+
generateMarkdown(url: string, options?: MarkdownFormatOptions): Promise<string | null>;
|
|
147
|
+
/**
|
|
148
|
+
* Capture a screenshot of a webpage.
|
|
149
|
+
* Returns the screenshot path, or base64 data if no path is provided.
|
|
150
|
+
*/
|
|
151
|
+
captureScreenshot(url: string, options?: ScreenshotOptions): Promise<{
|
|
152
|
+
success: boolean;
|
|
153
|
+
path?: string;
|
|
154
|
+
base64?: string;
|
|
155
|
+
error?: string;
|
|
156
|
+
}>;
|
|
157
|
+
/**
|
|
158
|
+
* Generate a PDF from a webpage.
|
|
159
|
+
*/
|
|
160
|
+
generatePDF(url: string, outputPath?: string): Promise<{
|
|
161
|
+
success: boolean;
|
|
162
|
+
path?: string;
|
|
163
|
+
base64?: string;
|
|
164
|
+
error?: string;
|
|
165
|
+
}>;
|
|
166
|
+
/**
|
|
167
|
+
* Execute custom JavaScript on a webpage.
|
|
168
|
+
* Returns array of results from each script.
|
|
169
|
+
*/
|
|
170
|
+
executeJavaScript(url: string, options: JSExecutionOptions): Promise<{
|
|
171
|
+
success: boolean;
|
|
172
|
+
results?: unknown[];
|
|
173
|
+
error?: string;
|
|
174
|
+
}>;
|
|
175
|
+
/**
|
|
176
|
+
* Extract structured data using CSS selectors.
|
|
177
|
+
*/
|
|
178
|
+
extractWithCSS(url: string, schema: CSSExtractionSchema): Promise<{
|
|
179
|
+
success: boolean;
|
|
180
|
+
data?: unknown;
|
|
181
|
+
error?: string;
|
|
182
|
+
}>;
|
|
183
|
+
/**
|
|
184
|
+
* Get preprocessed HTML optimized for schema extraction.
|
|
185
|
+
*/
|
|
186
|
+
getHTML(url: string): Promise<{
|
|
187
|
+
success: boolean;
|
|
188
|
+
html?: string;
|
|
189
|
+
error?: string;
|
|
190
|
+
}>;
|
|
191
|
+
/**
|
|
192
|
+
* Perform deep crawling with BFS, DFS, or BestFirst strategy.
|
|
193
|
+
* Returns a list of crawled pages with their content.
|
|
194
|
+
*/
|
|
195
|
+
deepCrawl(url: string, options?: DeepCrawlOptions): Promise<{
|
|
196
|
+
success: boolean;
|
|
197
|
+
results?: DeepCrawlResult[];
|
|
198
|
+
error?: string;
|
|
199
|
+
}>;
|
|
200
|
+
/**
|
|
201
|
+
* Perform adaptive crawling - intelligently explores until enough information is gathered.
|
|
202
|
+
* Uses statistical (free) or embedding (API) strategies.
|
|
203
|
+
*/
|
|
204
|
+
adaptiveCrawl(url: string, query: string, options?: AdaptiveCrawlOptions): Promise<AdaptiveCrawlResult>;
|
|
205
|
+
/**
|
|
206
|
+
* Extract data using regex patterns.
|
|
207
|
+
* Supports built-in patterns (email, phone, url, currency, etc.) and custom patterns.
|
|
208
|
+
*/
|
|
209
|
+
extractWithRegex(url: string, options: RegexExtractionOptions): Promise<{
|
|
210
|
+
success: boolean;
|
|
211
|
+
data?: unknown[];
|
|
212
|
+
error?: string;
|
|
213
|
+
}>;
|
|
214
|
+
/**
|
|
215
|
+
* Generate markdown with advanced content filtering (PruningFilter, BM25Filter).
|
|
216
|
+
* Filters out low-quality content based on thresholds and relevance.
|
|
217
|
+
*/
|
|
218
|
+
generateFilteredMarkdown(url: string, filterOptions?: ContentFilterOptions): Promise<{
|
|
219
|
+
success: boolean;
|
|
220
|
+
markdown?: string;
|
|
221
|
+
filteredMarkdown?: string;
|
|
222
|
+
error?: string;
|
|
223
|
+
}>;
|
|
224
|
+
/**
|
|
225
|
+
* Check if the service is currently available.
|
|
226
|
+
*/
|
|
227
|
+
isAvailable(): boolean;
|
|
228
|
+
/**
|
|
229
|
+
* Get API schema/documentation.
|
|
230
|
+
*/
|
|
231
|
+
getSchema(): Promise<unknown | null>;
|
|
232
|
+
/**
|
|
233
|
+
* Get Prometheus metrics.
|
|
234
|
+
*/
|
|
235
|
+
getMetrics(): Promise<string | null>;
|
|
236
|
+
}
|
|
237
|
+
export declare function getSharedClient(url?: string): Crawl4AIClient;
|
|
238
|
+
//# sourceMappingURL=crawl4ai-client.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"crawl4ai-client.d.ts","sourceRoot":"","sources":["../src/crawl4ai-client.ts"],"names":[],"mappings":"AAOA,MAAM,WAAW,cAAc;IAC7B,YAAY,CAAC,EAAE,OAAO,GAAG,UAAU,CAAC;IACpC,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,iBAAiB,CAAC,EAAE,OAAO,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,gBAAgB;IAC/B,OAAO,EAAE,OAAO,CAAC;IACjB,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,QAAQ,EAAE,MAAM,CAAA;KAAE,CAAC;IAC/C,KAAK,CAAC,EAAE;QAAE,MAAM,EAAE,MAAM,EAAE,CAAA;KAAE,CAAC;IAC7B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAaD,MAAM,WAAW,qBAAqB;IACpC,MAAM,CAAC,EAAE,KAAK,GAAG,KAAK,GAAG,MAAM,GAAG,KAAK,CAAC;IACxC,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,OAAO,CAAC;CACvB;AAED,MAAM,WAAW,iBAAiB;IAChC,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,MAAM,CAAC;IACb,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAC;QAAC,SAAS,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC1F,MAAM,EAAE,KAAK,CAAC;QACZ,IAAI,EAAE,MAAM,CAAC;QACb,QAAQ,EAAE,MAAM,CAAC;QACjB,IAAI,EAAE,MAAM,GAAG,MAAM,GAAG,WAAW,GAAG,QAAQ,GAAG,MAAM,GAAG,aAAa,CAAC;QACxE,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,MAAM,CAAC,EAAE,GAAG,EAAE,CAAC;KAChB,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,MAAM,EAAE,CAAC;CACnB;AAMD,MAAM,MAAM,qBAAqB,GAAG,KAAK,GAAG,KAAK,GAAG,YAAY,CAAC;AAEjE,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,MAAM,CAAC,EAAE,OAAO,CAAC;IACjB,QAAQ,CAAC,EAAE,qBAAqB,CAAC;IACjC,WAAW,CAAC,EAAE,WAAW,CAAC;IAC1B,SAAS,CAAC,EAAE,SAAS,CAAC;CACvB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,KAAK,CAAC;QACb,IAAI,EAAE,aAAa,GAAG,QAAQ,GAAG,cAAc,GAAG,KAAK,GAAG,mBAAmB,CAAC;QAC9E,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KAClC,CAAC,CAAC;CACJ;AAED,MAAM,W
AAW,SAAS;IACxB,IAAI,EAAE,mBAAmB,CAAC;IAC1B,QAAQ,EAAE,MAAM,EAAE,CAAC;IACnB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,OAAO,CAAC;IACjB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAMD,MAAM,MAAM,oBAAoB,GAAG,aAAa,GAAG,WAAW,CAAC;AAE/D,MAAM,WAAW,oBAAoB;IACnC,QAAQ,CAAC,EAAE,oBAAoB,CAAC;IAChC,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,SAAS,CAAC,EAAE,OAAO,CAAC;IACpB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,mBAAmB;IAClC,OAAO,EAAE,OAAO,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,eAAe,CAAC,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,CAAC;QAAC,OAAO,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IACzE,OAAO,CAAC,EAAE;QACR,QAAQ,EAAE,MAAM,CAAC;QACjB,WAAW,EAAE,MAAM,CAAC;QACpB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAMD,MAAM,MAAM,gBAAgB,GACxB,OAAO,GACP,YAAY,GACZ,UAAU,GACV,KAAK,GACL,MAAM,GACN,MAAM,GACN,MAAM,GACN,UAAU,GACV,YAAY,GACZ,QAAQ,GACR,UAAU,GACV,SAAS,GACT,UAAU,GACV,WAAW,GACX,WAAW,GACX,WAAW,GACX,gBAAgB,GAChB,SAAS,GACT,UAAU,GACV,MAAM,GACN,aAAa,GACb,KAAK,CAAC;AAEV,MAAM,WAAW,sBAAsB;IACrC,OAAO,CAAC,EAAE,gBAAgB,GAAG,gBAAgB,EAAE,CAAC;IAChD,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACjC;AAMD,MAAM,WAAW,oBAAoB;IACnC,IAAI,CAAC,EAAE,SAAS,GAAG,MAAM,CAAC;IAC1B,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,aAAa,CAAC,EAAE,OAAO,GAAG,SAAS,CAAC;IACpC,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAC1B,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AAMD,qBAAa,cAAc;IACzB,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,OAAO,CAAS;IACxB,OAAO,CAAC,SAAS,CAAkB;gBAEvB,OAAO,GAAE,MAA6B,EAAE,OAAO,GAAE,MAAwB;IAKrF;;;OAGG;IACG,WAAW,IAAI,OAAO,CAAC,OAAO,CAAC;IAqBrC;;;OAGG;IACG,UAAU,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,CAAC,EAAE,cAAc,GAAG,OAAO,CAAC,gBAAgB,GAAG,IAAI,CAAC;IAkDxF;;OAEG;IACG,mBAAmB,CACvB,IAAI,EAAE,MAAM,EAAE,EACd,MAAM,CAAC,EAAE,cAAc,EACvB,WAAW,GAAE,
MAAU,GACtB,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,gBAAgB,CAAC,CAAC;IAwBzC;;;OAGG;IACG,gBAAgB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,qBAA0B,GAAG,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;IA6BhG;;;OAGG;IACG,iBAAiB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,iBAAsB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,IAAI,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAiCpJ;;OAEG;IACG,WAAW,CAAC,GAAG,EAAE,MAAM,EAAE,UAAU,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,IAAI,CAAC,EAAE,MAAM,CAAC;QAAC,MAAM,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAgClI;;;OAGG;IACG,iBAAiB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,kBAAkB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,OAAO,CAAC,EAAE,OAAO,EAAE,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IA6BrI;;OAEG;IACG,cAAc,CAAC,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,mBAAmB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,IAAI,CAAC,EAAE,OAAO,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAgD7H;;OAEG;IACG,OAAO,CAAC,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,IAAI,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IA0BxF;;;OAGG;IACG,SAAS,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,GAAE,gBAAqB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,OAAO,CAAC,EAAE,eAAe,EAAE,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IA8ExI;;;OAGG;IACG,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,OAAO,GAAE,oBAAyB,GAAG,OAAO,CAAC,mBAAmB,CAAC;IAuDjH;;;OAGG;IACG,gBAAgB,CAAC,GAAG,EAAE,MAAM,EAAE,OAAO,EAAE,sBAAsB,GAAG,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,IAAI,CAAC,EAAE,OAAO,EAAE,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IA2FrI;;;OAGG;IACG,wBAAwB,CAC5B,GAAG,EAAE,MAAM,EACX,aAAa,GAAE,oBAAyB,GACvC,OAAO,CAAC;QAAE,OAAO,EAAE,OAAO,CAAC;QAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;QAAC,gBAAgB,CAAC,EAAE,MAAM,CAAC;QAAC,KAAK,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IAoE9F;;OAEG;IACH,WAAW,IAAI,OAAO;IAItB;;OAEG;IACG,SAAS,IAAI,OAAO,CAAC,OAAO,GAAG,IAAI,CAAC;IAS1C;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC;CAQ3C;AAQD,wBAAgB,eAAe,CAAC,GAAG,CAAC,EAAE,MAAM,GAAG,cAAc,CAK5D"}
|