@dex-ai/web-tools 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/package.json +49 -0
- package/src/cache.ts +55 -0
- package/src/index.ts +63 -0
- package/src/render.ts +95 -0
- package/src/types.ts +63 -0
- package/src/web-fetch.ts +276 -0
- package/src/web-search.ts +122 -0
package/README.md
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# @dex-ai/web-tools
|
|
2
|
+
|
|
3
|
+
Web fetch and search tools for Dex agents.
|
|
4
|
+
|
|
5
|
+
## Tools
|
|
6
|
+
|
|
7
|
+
| Tool | Description |
|
|
8
|
+
|------|-------------|
|
|
9
|
+
| `web_fetch` | Fetch any URL and return its content as clean markdown/text/JSON. Uses Readability + turndown for HTML→markdown. |
|
|
10
|
+
| `web_search` | Search the web. Default: DuckDuckGo HTML search (no API key). Configurable backend. |
|
|
11
|
+
| `web_snapshot` | *(optional)* Render a page with Playwright headless browser for JS-rendered content. |
|
|
12
|
+
|
|
13
|
+
## Usage
|
|
14
|
+
|
|
15
|
+
```typescript
|
|
16
|
+
import { createWebToolsExtension } from "@dex-ai/web-tools";
|
|
17
|
+
|
|
18
|
+
const agent = await Agent.create({
|
|
19
|
+
extensions: [
|
|
20
|
+
createWebToolsExtension({
|
|
21
|
+
// optional config
|
|
22
|
+
timeout: 15_000,
|
|
23
|
+
cacheTTL: 300_000,
|
|
24
|
+
rateLimitPerDomain: 10,
|
|
25
|
+
}),
|
|
26
|
+
// ... other extensions
|
|
27
|
+
],
|
|
28
|
+
});
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Configuration
|
|
32
|
+
|
|
33
|
+
| Option | Default | Description |
|
|
34
|
+
|--------|---------|-------------|
|
|
35
|
+
| `timeout` | `15_000` | Request timeout in ms |
|
|
36
|
+
| `maxResponseSize` | `1_048_576` | Max response body (bytes) |
|
|
37
|
+
| `cacheTTL` | `300_000` | Cache TTL in ms (5 min) |
|
|
38
|
+
| `rateLimitPerDomain` | `10` | Max requests per domain per window |
|
|
39
|
+
| `rateLimitWindow` | `60_000` | Rate limit window in ms |
|
|
40
|
+
| `searchEndpoint` | DuckDuckGo | Custom search URL |
|
|
41
|
+
| `searchApiKey` | — | API key for custom search |
|
|
42
|
+
| `allowedDomains` | `[]` | Allowlist (empty = all allowed) |
|
|
43
|
+
| `blockedDomains` | `[]` | Blocklist |
|
|
44
|
+
| `respectRobotsTxt` | `true` | Check robots.txt before fetching |
|
|
45
|
+
|
|
46
|
+
## Sub-path Exports
|
|
47
|
+
|
|
48
|
+
- `@dex-ai/web-tools` — core: `web_fetch` + `web_search`
|
|
49
|
+
- `@dex-ai/web-tools/with-snapshot` — adds `web_snapshot` (requires Playwright install)
|
package/package.json
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@dex-ai/web-tools",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "Web fetch and search tools for Dex agents — fetch URLs as clean markdown, search the web, and render JS pages with Playwright.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"exports": {
|
|
7
|
+
".": {
|
|
8
|
+
"types": "./src/index.ts",
|
|
9
|
+
"default": "./src/index.ts"
|
|
10
|
+
},
|
|
11
|
+
"./with-snapshot": {
|
|
12
|
+
"types": "./src/with-snapshot.ts",
|
|
13
|
+
"default": "./src/with-snapshot.ts"
|
|
14
|
+
}
|
|
15
|
+
},
|
|
16
|
+
"files": [
|
|
17
|
+
"src"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"typecheck": "tsc --noEmit",
|
|
21
|
+
"test": "bun test",
|
|
22
|
+
"changeset": "changeset",
|
|
23
|
+
"version": "changeset version",
|
|
24
|
+
"release": "changeset publish"
|
|
25
|
+
},
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"@dex-ai/sdk": "^0.1.2",
|
|
28
|
+
"zod": "^3.23.8",
|
|
29
|
+
"jsdom": "^25.0.0",
|
|
30
|
+
"@mozilla/readability": "^0.5.0",
|
|
31
|
+
"turndown": "^7.2.0"
|
|
32
|
+
},
|
|
33
|
+
"optionalDependencies": {
|
|
34
|
+
"playwright": "^1.48.0"
|
|
35
|
+
},
|
|
36
|
+
"devDependencies": {
|
|
37
|
+
"typescript": "^5.6.3",
|
|
38
|
+
"@types/bun": "latest",
|
|
39
|
+
"bun-types": "latest",
|
|
40
|
+
"@types/jsdom": "^21.1.7",
|
|
41
|
+
"@types/turndown": "^5.0.5",
|
|
42
|
+
"@changesets/cli": "^2.29.0"
|
|
43
|
+
},
|
|
44
|
+
"sideEffects": false,
|
|
45
|
+
"publishConfig": {
|
|
46
|
+
"access": "public",
|
|
47
|
+
"registry": "https://registry.npmjs.org/"
|
|
48
|
+
}
|
|
49
|
+
}
|
package/src/cache.ts
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* WebCache — disk-based request cache with TTL.
|
|
3
|
+
*
|
|
4
|
+
* Caches FetchResult objects keyed by SHA-256 hash of the URL.
|
|
5
|
+
* Handles malformed cache files gracefully by treating them as misses.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { createHash } from "node:crypto";
|
|
9
|
+
import { readFile, writeFile, mkdir } from "node:fs/promises";
|
|
10
|
+
import { join } from "node:path";
|
|
11
|
+
import type { FetchResult } from "./types";
|
|
12
|
+
|
|
13
|
+
/** On-disk envelope: the cached result plus the time it was stored. */
interface CacheEntry {
  /** `Date.now()` value (epoch milliseconds) at the moment the entry was written. */
  cachedAt: number;
  /** The cached fetch result. */
  result: FetchResult;
}

/**
 * Structural check for a parsed cache file.
 *
 * `JSON.parse` succeeds on many inputs that are not cache entries (`{}`, `[]`,
 * `"text"`, ...). Only objects with a finite numeric `cachedAt` and an object
 * `result` are accepted; everything else is treated as a cache miss.
 */
function isCacheEntry(value: unknown): value is CacheEntry {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as { cachedAt?: unknown; result?: unknown };
  return (
    typeof candidate.cachedAt === "number" &&
    Number.isFinite(candidate.cachedAt) &&
    typeof candidate.result === "object" &&
    candidate.result !== null
  );
}

/**
 * Disk-backed cache of `FetchResult` objects with a time-to-live.
 *
 * Each entry is stored as `<dir>/<key>.json`, where `<key>` is the first
 * 16 hex characters of the SHA-256 digest of the cache key string.
 */
export class WebCache {
  private readonly dir: string;
  private readonly ttl: number;

  /**
   * @param dir Directory that holds the cache files (created on first write).
   * @param ttl Maximum entry age in milliseconds.
   */
  constructor(dir: string, ttl: number) {
    this.dir = dir;
    this.ttl = ttl;
  }

  /** File-name stem for `url`: truncated SHA-256 hex digest. */
  private key(url: string): string {
    return createHash("sha256").update(url).digest("hex").slice(0, 16);
  }

  /** Absolute path of the cache file for `url`. */
  private path(url: string): string {
    return join(this.dir, `${this.key(url)}.json`);
  }

  /**
   * Retrieve a cached result.
   *
   * @returns The stored result, or `null` when the file is missing, unreadable,
   *   not valid JSON, not shaped like a cache entry, or older than the TTL.
   */
  async get(url: string): Promise<FetchResult | null> {
    try {
      const raw = await readFile(this.path(url), "utf-8");
      const entry: unknown = JSON.parse(raw);
      // Validate before use: a JSON-valid file of the wrong shape must be a
      // miss, not an `undefined` result leaking to the caller.
      if (!isCacheEntry(entry)) {
        return null;
      }
      if (Date.now() - entry.cachedAt > this.ttl) {
        return null;
      }
      return entry.result;
    } catch {
      // Missing file, permission problem or corrupt JSON: all are misses.
      return null;
    }
  }

  /**
   * Store a result in the cache, creating the cache directory if needed.
   * Rejects if the directory or file cannot be written.
   */
  async set(url: string, result: FetchResult): Promise<void> {
    await mkdir(this.dir, { recursive: true });
    const entry: CacheEntry = { cachedAt: Date.now(), result };
    await writeFile(this.path(url), JSON.stringify(entry), "utf-8");
  }
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @dex-ai/web-tools — Web fetch and search tools for Dex agents.
|
|
3
|
+
*
|
|
4
|
+
* Provides:
|
|
5
|
+
* - web_fetch: fetch URLs as clean markdown/text/JSON
|
|
6
|
+
* - web_search: search the web via configurable backend (default: DuckDuckGo)
|
|
7
|
+
*
|
|
8
|
+
* Optional (sub-path @dex-ai/web-tools/with-snapshot):
|
|
9
|
+
* - web_snapshot: render JS-rendered pages with Playwright
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { Extension, AnyTool } from "@dex-ai/sdk";
|
|
13
|
+
import type { WebToolsConfig } from "./types";
|
|
14
|
+
import { DEFAULTS } from "./types";
|
|
15
|
+
import { homedir } from "node:os";
|
|
16
|
+
import { join } from "node:path";
|
|
17
|
+
import { createWebFetchTool } from "./web-fetch";
|
|
18
|
+
import { createWebSearchTool } from "./web-search";
|
|
19
|
+
|
|
20
|
+
export type { WebToolsConfig, FetchResult, SearchResult } from "./types";
|
|
21
|
+
|
|
22
|
+
/**
 * Build the "web-tools" extension, which bundles the `web_fetch` and
 * `web_search` tools.
 *
 * Any option left unset in `config` is filled in from `DEFAULTS`; the cache
 * directory falls back to `~/.dex/cache/web`. Both tools receive the same
 * resolved configuration.
 *
 * @param config Optional overrides; an empty object is used when omitted.
 * @returns An extension object carrying the two tools.
 *
 * @example
 * ```typescript
 * import { createWebToolsExtension } from "@dex-ai/web-tools";
 *
 * const agent = await Agent.create({
 *   extensions: [
 *     createWebToolsExtension({ timeout: 10_000 }),
 *   ],
 * });
 * ```
 */
export function createWebToolsExtension(
  config: WebToolsConfig = {},
): Extension {
  // Start from the caller's options, then fill every gap with its default.
  const effective: WebToolsConfig = { ...config };
  effective.cacheDir =
    config.cacheDir ?? join(homedir(), ".dex", "cache", "web");
  effective.cacheTTL = config.cacheTTL ?? DEFAULTS.cacheTTL;
  effective.timeout = config.timeout ?? DEFAULTS.timeout;
  effective.maxResponseSize =
    config.maxResponseSize ?? DEFAULTS.maxResponseSize;
  effective.rateLimitPerDomain =
    config.rateLimitPerDomain ?? DEFAULTS.rateLimitPerDomain;
  effective.rateLimitWindow =
    config.rateLimitWindow ?? DEFAULTS.rateLimitWindow;
  effective.userAgent = config.userAgent ?? DEFAULTS.userAgent;
  effective.respectRobotsTxt =
    config.respectRobotsTxt ?? DEFAULTS.respectRobotsTxt;

  const fetchTool = createWebFetchTool(effective);
  const searchTool = createWebSearchTool(effective);
  const tools: AnyTool[] = [fetchTool, searchTool];

  return {
    name: "web-tools",
    description: "Web fetch and search tools for Dex agents.",
    tools,
  };
}
|
package/src/render.ts
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML→Markdown rendering utilities.
|
|
3
|
+
*
|
|
4
|
+
* - htmlToMarkdown: extract article content via Readability + convert to markdown via turndown
|
|
5
|
+
* - extractBySelector: extract text from a specific CSS selector
|
|
6
|
+
* - extractSearchResults: parse DuckDuckGo HTML search result page
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { JSDOM } from "jsdom";
|
|
10
|
+
import { Readability } from "@mozilla/readability";
|
|
11
|
+
import TurndownService from "turndown";
|
|
12
|
+
import type { SearchResult } from "./types";
|
|
13
|
+
|
|
14
|
+
/* ── Turndown singleton ──────────────────────────────────── */
|
|
15
|
+
|
|
16
|
+
/** Lazily created converter shared by every call in this module. */
let _turndown: TurndownService | null = null;

/**
 * Return the shared TurndownService, constructing it on first use with
 * ATX headings, fenced code blocks, inline links and "-" bullets.
 */
function getTurndown(): TurndownService {
  _turndown ??= new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    linkStyle: "inlined",
    bulletListMarker: "-",
  });
  return _turndown;
}
|
|
29
|
+
|
|
30
|
+
/* ── Public API ─────────────────────────────────────────── */
|
|
31
|
+
|
|
32
|
+
/**
 * Convert an HTML document to Markdown.
 *
 * The document is parsed with JSDOM (using `url` as the document URL) and run
 * through Readability. When Readability yields an article, its HTML content
 * and title are used; otherwise the whole input HTML is converted and the
 * title is `null`.
 *
 * @param html Raw HTML source.
 * @param url URL the HTML was loaded from.
 * @returns The article title (or `null`) and the Markdown content.
 */
export function htmlToMarkdown(
  html: string,
  url: string,
): { title: string | null; content: string } {
  const { document } = new JSDOM(html, { url }).window;
  const article = new Readability(document).parse();
  const converter = getTurndown();

  return {
    title: article?.title ?? null,
    content: converter.turndown(article?.content ?? html),
  };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Return the trimmed text content of the first element matching `selector`.
 *
 * @param html Raw HTML source to search.
 * @param selector CSS selector passed to `querySelector`.
 * @returns The element's trimmed text, or `null` when nothing matches or the
 *   element has no text content.
 */
export function extractBySelector(
  html: string,
  selector: string,
): string | null {
  const match = new JSDOM(html).window.document.querySelector(selector);
  return match ? (match.textContent?.trim() ?? null) : null;
}
|
|
65
|
+
|
|
66
|
+
/**
 * Resolve a DuckDuckGo result link to the URL it actually points at.
 *
 * DuckDuckGo's HTML results wrap each target in a redirect of the form
 * `//duckduckgo.com/l/?uddg=<percent-encoded target>&...`. When `href` is such
 * a redirect, the decoded `uddg` value is returned; any other value (including
 * one that cannot be parsed as a URL) is returned unchanged.
 */
function unwrapDuckDuckGoRedirect(href: string): string {
  try {
    // The base makes protocol-relative and root-relative hrefs parseable.
    const parsed = new URL(href, "https://duckduckgo.com");
    const onDuckDuckGo =
      parsed.hostname === "duckduckgo.com" ||
      parsed.hostname.endsWith(".duckduckgo.com");
    const isRedirectPath = parsed.pathname === "/l/" || parsed.pathname === "/l";
    if (onDuckDuckGo && isRedirectPath) {
      // searchParams.get() already percent-decodes the value.
      const target = parsed.searchParams.get("uddg");
      if (target) return target;
    }
  } catch {
    // Not parseable as a URL: fall through and keep the raw href.
  }
  return href;
}

/**
 * Parse a DuckDuckGo HTML search results page.
 *
 * For every `.result` element, the title and link are read from its
 * `.result__a` child and the snippet from `.result__snippet`. Redirect links
 * are unwrapped to the real target URL. Items without a link element, or with
 * an empty title or URL, are skipped.
 *
 * @param html Raw HTML of the results page.
 * @returns The results in document order.
 */
export function extractSearchResults(html: string): SearchResult[] {
  const dom = new JSDOM(html);
  const doc = dom.window.document;
  const results: SearchResult[] = [];

  const items = doc.querySelectorAll(".result");
  for (const item of items) {
    const linkEl = item.querySelector(".result__a");
    const snippetEl = item.querySelector(".result__snippet");

    if (!linkEl) continue;

    const title = linkEl.textContent?.trim() ?? "";
    const href = linkEl.getAttribute("href");
    const snippet = snippetEl?.textContent?.trim() ?? "";

    // DuckDuckGo wraps real URLs in redirect links; hand callers the target.
    const url = href ? unwrapDuckDuckGoRedirect(href) : "";

    if (title && url) {
      results.push({ title, url, snippet });
    }
  }

  return results;
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared types for @dex-ai/web-tools.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
/**
 * Options accepted by the web tools. Every field is optional; unset fields
 * fall back to `DEFAULTS` or to the fallbacks chosen by the tool factories.
 */
export interface WebToolsConfig {
  /** Value sent as the `User-Agent` request header. */
  userAgent?: string;

  /** Per-request timeout, in milliseconds. */
  timeout?: number;

  /** Upper bound on the response body size, in bytes. */
  maxResponseSize?: number;

  /** How long a cached result stays valid, in milliseconds. */
  cacheTTL?: number;

  /** Directory in which cached results are stored. */
  cacheDir?: string;

  /** Number of requests allowed per domain within one rate-limit window. */
  rateLimitPerDomain?: number;

  /** Length of the rate-limit window, in milliseconds. */
  rateLimitWindow?: number;

  /** Search endpoint URL; DuckDuckGo is used when this is not set. */
  searchEndpoint?: string;

  /** API key for the search endpoint, for endpoints that require one. */
  searchApiKey?: string;

  /** Domains that may be fetched; an empty list allows every domain. */
  allowedDomains?: string[];

  /** Domains that must never be fetched. */
  blockedDomains?: string[];

  /**
   * Whether robots.txt should be respected before fetching.
   * NOTE(review): no robots.txt lookup is visible in web-fetch.ts of this
   * package version — confirm where this flag is consumed.
   */
  respectRobotsTxt?: boolean;
}
|
|
31
|
+
|
|
32
|
+
/** Outcome of a `web_fetch` call, as returned to the agent and stored in the cache. */
export interface FetchResult {
  /** The URL that was requested. */
  url: string;
  /** Page title when one was extracted, otherwise `null`. */
  title: string | null;
  /** The processed body in the format named by `format`. */
  content: string;
  /** Representation of `content`. */
  format: "markdown" | "json" | "text";
  /** Short leading preview of the content, or `null`. */
  excerpt: string | null;
  /** Length reported for the result, as computed by the fetch tool. */
  size: number;
  /** `true` when the result was served from the cache. */
  cached: boolean;
  /** `true` when the response exceeded the configured size limit. */
  truncated: boolean;
}
|
|
42
|
+
|
|
43
|
+
/** One hit parsed from a search results page. */
export interface SearchResult {
  /** Link text of the result. */
  title: string;
  /** Address taken from the result link. */
  url: string;
  /** Descriptive snippet; an empty string when the page provided none. */
  snippet: string;
}
|
|
48
|
+
|
|
49
|
+
/** Optional refinements for a search request. */
export interface SearchOptions {
  /** Number of results wanted. */
  count?: number;
  /** Domain to which the search should be scoped. */
  site?: string;
}
|
|
53
|
+
|
|
54
|
+
/** Fallback values applied to any `WebToolsConfig` option the caller leaves unset. */
export const DEFAULTS = {
  /** `User-Agent` header value. */
  userAgent: "Dex/1.0 WebTools",
  /** Request timeout: 15 seconds, in milliseconds. */
  timeout: 15_000,
  /** Response size limit: 1 MiB, in bytes. */
  maxResponseSize: 1_048_576,
  /** Cache lifetime: 5 minutes, in milliseconds. */
  cacheTTL: 300_000,
  /** Placeholder only; the real directory (~/.dex/cache/web) is resolved at runtime. */
  cacheDir: "",
  /** Requests allowed per domain per window. */
  rateLimitPerDomain: 10,
  /** Rate-limit window: 60 seconds, in milliseconds. */
  rateLimitWindow: 60_000,
  /** robots.txt is respected unless the caller opts out. */
  respectRobotsTxt: true,
} as const;
|
package/src/web-fetch.ts
ADDED
|
@@ -0,0 +1,276 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* web_fetch tool — fetch a URL and return its content as clean markdown/text/JSON.
|
|
3
|
+
*
|
|
4
|
+
* Features:
|
|
5
|
+
* - HTML→markdown via Readability + turndown
|
|
6
|
+
* - JSON pretty-print passthrough
|
|
7
|
+
* - CSS selector extraction
|
|
8
|
+
* - Disk caching with TTL
|
|
9
|
+
* - Rate limiting per domain
|
|
10
|
+
* - SSRF protection (blocks private IPs)
|
|
11
|
+
* - Response size cap
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { homedir } from "node:os";
|
|
15
|
+
import { join } from "node:path";
|
|
16
|
+
import { z } from "zod";
|
|
17
|
+
import { Tool } from "@dex-ai/sdk";
|
|
18
|
+
import { WebCache } from "./cache";
|
|
19
|
+
import { htmlToMarkdown, extractBySelector } from "./render";
|
|
20
|
+
import type { WebToolsConfig, FetchResult } from "./types";
|
|
21
|
+
import { DEFAULTS } from "./types";
|
|
22
|
+
|
|
23
|
+
/* ── Rate limiting ──────────────────────────────────────── */
|
|
24
|
+
|
|
25
|
+
/** Per-domain request tally together with the time at which it resets. */
const domainCounters = new Map<string, { count: number; resetAt: number }>();

/**
 * Record one request for `domain` and report whether it is within the limit.
 *
 * A fresh tally is started when the domain has none yet or its window has
 * passed. The request is counted even when the limit is already exceeded.
 *
 * @param domain Host name the request is made to.
 * @param maxPerDomain Requests allowed within one window.
 * @param window Window length in milliseconds.
 * @returns `true` while the tally is at or below `maxPerDomain`.
 */
function checkRateLimit(
  domain: string,
  maxPerDomain: number,
  window: number,
): boolean {
  const now = Date.now();
  const existing = domainCounters.get(domain);
  const stillOpen = existing !== undefined && !(now > existing.resetAt);

  const bucket = stillOpen ? existing : { count: 0, resetAt: now + window };
  if (!stillOpen) {
    domainCounters.set(domain, bucket);
  }

  bucket.count += 1;
  return bucket.count <= maxPerDomain;
}
|
|
41
|
+
|
|
42
|
+
/* ── SSRF protection ───────────────────────────────────── */
|
|
43
|
+
|
|
44
|
+
/** Parse a dotted-decimal IPv4 literal into its four octets, or return null. */
function parseIPv4(host: string): number[] | null {
  const match = /^(\d{1,3})\.(\d{1,3})\.(\d{1,3})\.(\d{1,3})$/.exec(host);
  if (!match) return null;
  const octets = match.slice(1).map(Number);
  return octets.every((octet) => octet <= 255) ? octets : null;
}

/** True when the IPv4 address lies in a loopback, private, link-local or otherwise non-public range. */
function isPrivateIPv4(octets: readonly number[]): boolean {
  const [a = 0, b = 0] = octets;
  return (
    a === 0 || // 0.0.0.0/8 "this network"
    a === 10 || // 10.0.0.0/8 private
    a === 127 || // 127.0.0.0/8 loopback
    (a === 100 && b >= 64 && b <= 127) || // 100.64.0.0/10 shared address space
    (a === 169 && b === 254) || // 169.254.0.0/16 link-local (cloud metadata services)
    (a === 172 && b >= 16 && b <= 31) || // 172.16.0.0/12 private
    (a === 192 && b === 168) // 192.168.0.0/16 private
  );
}

/** True when the IPv6 literal (without brackets) is loopback, unspecified, unique-local, link-local, or maps to a private IPv4 address. */
function isPrivateIPv6(host: string): boolean {
  // Loopback (::1) and unspecified (::), compressed or fully written out.
  if (host === "::1" || host === "::" || /^(0{1,4}:){7}0{0,3}[01]$/.test(host)) {
    return true;
  }

  // IPv4-mapped addresses: ::ffff:a.b.c.d or the hex form ::ffff:hhhh:hhhh.
  if (host.startsWith("::ffff:")) {
    const tail = host.slice("::ffff:".length);
    const dotted = parseIPv4(tail);
    if (dotted) return isPrivateIPv4(dotted);
    const hex = /^([0-9a-f]{1,4}):([0-9a-f]{1,4})$/.exec(tail);
    if (hex) {
      const hi = parseInt(hex[1] ?? "0", 16);
      const lo = parseInt(hex[2] ?? "0", 16);
      return isPrivateIPv4([hi >> 8, hi & 0xff, lo >> 8, lo & 0xff]);
    }
  }

  const firstHextet = parseInt(host.split(":")[0] ?? "", 16);
  if (Number.isNaN(firstHextet)) return false;
  return (
    (firstHextet & 0xfe00) === 0xfc00 || // fc00::/7 unique local
    (firstHextet & 0xffc0) === 0xfe80 // fe80::/10 link-local
  );
}

/**
 * Decide whether a host name refers to a private or internal destination that
 * must not be fetched (SSRF protection).
 *
 * Blocked: `localhost`, names ending in `.localhost` or `.local`, IPv4
 * literals in loopback/private/link-local/shared ranges, and IPv6 literals
 * that are loopback, unspecified, unique-local, link-local, or IPv4-mapped to
 * a blocked IPv4 address. IPv6 literals may be given with or without the
 * square brackets that `URL.hostname` keeps, and a trailing dot is ignored.
 *
 * Only the literal text is inspected; no DNS resolution is performed, so a
 * public name that resolves to a private address is not detected here.
 *
 * @param hostname Host portion of the URL (typically `URL.hostname`).
 * @returns `true` when the host must be treated as private.
 */
function isPrivateHost(hostname: string): boolean {
  let host = hostname.trim().toLowerCase();
  if (host.startsWith("[") && host.endsWith("]")) {
    host = host.slice(1, -1);
  }
  // "localhost." and "localhost" name the same host.
  if (host.endsWith(".")) {
    host = host.slice(0, -1);
  }

  if (
    host === "localhost" ||
    host.endsWith(".localhost") ||
    host.endsWith(".local")
  ) {
    return true;
  }

  // Range checks apply to IP literals only, so that a public name such as
  // "10.example.com" is not mistaken for a private address.
  const ipv4 = parseIPv4(host);
  if (ipv4) {
    return isPrivateIPv4(ipv4);
  }
  if (host.includes(":")) {
    return isPrivateIPv6(host);
  }
  return false;
}
|
|
57
|
+
|
|
58
|
+
/* ── Tool factory ───────────────────────────────────────── */
|
|
59
|
+
|
|
60
|
+
/**
 * Build the `web_fetch` tool.
 *
 * The tool validates the target (scheme, SSRF, allow/block lists), serves a
 * cached result when one exists, applies the per-domain rate limit, fetches
 * the URL and converts the body according to the requested format:
 *
 * - `json`: the body must parse as JSON; it is returned as received.
 * - `markdown`: HTML is converted to Markdown, or, with `selector`, the text
 *   of the first matching element is returned.
 * - `text`: the body is returned as received.
 * - `auto`: one of the above, chosen from the response Content-Type.
 *
 * Redirects are followed manually (at most five) so that every hop passes the
 * same target validation as the original URL. Bodies longer than
 * `maxResponseSize` characters are cut to that length and flagged `truncated`.
 *
 * @param config Tool options; unset values fall back to `DEFAULTS`.
 * @returns The tool definition produced by `Tool.define`.
 */
export function createWebFetchTool(
  config: WebToolsConfig,
): ReturnType<typeof Tool.define> {
  const timeout = config.timeout ?? DEFAULTS.timeout;
  const maxSize = config.maxResponseSize ?? DEFAULTS.maxResponseSize;
  const cacheDir = config.cacheDir ?? join(homedir(), ".dex", "cache", "web");
  const cacheTTL = config.cacheTTL ?? DEFAULTS.cacheTTL;
  const cache = new WebCache(cacheDir, cacheTTL);
  const rateLimitPerDomain =
    config.rateLimitPerDomain ?? DEFAULTS.rateLimitPerDomain;
  const rateLimitWindow = config.rateLimitWindow ?? DEFAULTS.rateLimitWindow;
  const userAgent = config.userAgent ?? DEFAULTS.userAgent;
  const allowedDomains = config.allowedDomains ?? [];
  const blockedDomains = config.blockedDomains ?? [];
  /** Redirect hops followed before giving up. */
  const maxRedirects = 5;

  /** Wrap a message in the tool's error result shape. */
  const errorText = (value: string) => ({
    type: "error-text" as const,
    value,
  });

  /**
   * Explain why `target` must not be fetched, or return null when it may be.
   * Used for the requested URL and again for every redirect hop.
   */
  const rejectionReason = (target: URL): string | null => {
    const domain = target.hostname;

    if (target.protocol !== "http:" && target.protocol !== "https:") {
      return (
        `Unsupported URL scheme "${target.protocol}" in ${target.href}. ` +
        "Only http and https URLs can be fetched."
      );
    }

    if (isPrivateHost(domain)) {
      return (
        "SSRF blocked: cannot fetch from private/internal addresses " +
        `(${domain}). Only public URLs are allowed.`
      );
    }

    if (allowedDomains.length > 0 && !allowedDomains.includes(domain)) {
      return (
        `Domain "${domain}" is not in the allowed domains list. ` +
        "Configure allowedDomains in WebToolsConfig to enable it."
      );
    }

    if (blockedDomains.includes(domain)) {
      return (
        `Domain "${domain}" is blocked. Remove it from blockedDomains ` +
        "in WebToolsConfig to allow requests."
      );
    }

    return null;
  };

  return Tool.define({
    name: "web_fetch",
    displayName: "Fetch",
    description:
      "Fetch a URL and return its content as clean markdown/text/JSON. " +
      "Use for reading docs, web pages, APIs, and JSON endpoints. " +
      "Large pages are auto-indexed into the session knowledge base for later search.",
    access: "read" as const,
    parameters: z.object({
      url: z.string().url().describe("The URL to fetch (must be http/https)."),
      format: z
        .enum(["auto", "markdown", "json", "text"])
        .optional()
        .default("auto")
        .describe(
          "'auto' selects based on Content-Type. 'markdown' forces HTML→markdown. " +
            "'json' pretty-prints JSON. 'text' returns raw text.",
        ),
      selector: z
        .string()
        .optional()
        .describe(
          "CSS selector to extract only a portion of the page (e.g., '#readme', 'article').",
        ),
    }),
    async execute(input) {
      // ── Target validation ──────────────────────────────
      const parsedUrl = new URL(input.url);
      const domain = parsedUrl.hostname;

      const rejected = rejectionReason(parsedUrl);
      if (rejected) {
        return errorText(rejected);
      }

      // ── Cache check ────────────────────────────────────
      // The result depends on format and selector as well as the URL, so
      // they are part of the key. The plain URL is kept as the key for the
      // default case so entries written by earlier versions remain usable.
      const cacheKey =
        input.format === "auto" && !input.selector
          ? input.url
          : [
              input.url,
              `format=${input.format}`,
              `selector=${input.selector ?? ""}`,
            ].join("\n");

      const cached = await cache.get(cacheKey);
      if (cached) {
        return { type: "json" as const, value: { ...cached, cached: true } };
      }

      // ── Rate limit ─────────────────────────────────────
      // Checked after the cache so that cache hits, which make no request,
      // do not consume the domain's budget.
      if (!checkRateLimit(domain, rateLimitPerDomain, rateLimitWindow)) {
        return errorText(
          `Rate limit exceeded for ${domain}: ` +
            `${rateLimitPerDomain} requests per ${rateLimitWindow / 1000}s. ` +
            "Wait before retrying or increase rateLimitPerDomain.",
        );
      }

      // ── Fetch ──────────────────────────────────────────
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), timeout);

      // Redirects are handled by the loop below, never by fetch itself.
      const fetchOnce = (target: URL) =>
        fetch(target.href, {
          signal: controller.signal,
          headers: { "User-Agent": userAgent },
          redirect: "manual",
        });

      try {
        let currentUrl = parsedUrl;
        let response = await fetchOnce(currentUrl);
        let hops = 0;

        // Follow redirects by hand: an automatic redirect would let a public
        // URL bounce the request to a private or blocked host unchecked.
        while (response.status >= 300 && response.status < 400) {
          const location = response.headers.get("location");
          if (!location) break; // e.g. 304; reported by the !ok branch below

          hops += 1;
          if (hops > maxRedirects) {
            return errorText(
              `Too many redirects (more than ${maxRedirects}) for ${input.url}`,
            );
          }

          const next = new URL(location, currentUrl);
          const refused = rejectionReason(next);
          if (refused) {
            return errorText(
              `Redirect from ${input.url} to ${next.href} was refused. ${refused}`,
            );
          }

          // Release the redirect response before issuing the next request.
          await response.body?.cancel();
          currentUrl = next;
          response = await fetchOnce(currentUrl);
        }

        if (!response.ok) {
          return errorText(
            `HTTP ${response.status} ${response.statusText} for ${input.url}`,
          );
        }

        const contentType =
          response.headers.get("content-type") ?? "text/plain";
        const body = await response.text();
        const format =
          input.format === "auto" ? detectFormat(contentType) : input.format;

        // Apply the size cap once so that `truncated` always matches what is
        // actually returned.
        const truncated = body.length > maxSize;
        const capped = truncated ? body.slice(0, maxSize) : body;

        // ── Process ──────────────────────────────────────
        let result: FetchResult;

        if (format === "json") {
          // Validate the complete body; a capped copy would never parse.
          try {
            JSON.parse(body);
          } catch {
            return errorText(
              `Response from ${input.url} is not valid JSON ` +
                `(Content-Type: ${contentType}). Try format="text".`,
            );
          }
          result = {
            url: input.url,
            title: null,
            content: capped,
            format: "json",
            excerpt: capped.slice(0, 200),
            size: body.length, // length of the full response body
            cached: false,
            truncated,
          };
        } else if (format === "markdown") {
          // Only the resolved format decides this branch: an explicit
          // format="text" on an HTML page must return the raw text.
          if (input.selector) {
            const extracted = extractBySelector(capped, input.selector);
            if (!extracted) {
              return errorText(
                `CSS selector "${input.selector}" matched no elements ` +
                  `on ${input.url}.`,
              );
            }
            result = {
              url: input.url,
              title: null,
              content: extracted,
              format: "markdown",
              excerpt: extracted.slice(0, 200),
              size: extracted.length,
              cached: false,
              truncated,
            };
          } else {
            // The final URL (after redirects) is the document's real address.
            const { title, content } = htmlToMarkdown(capped, currentUrl.href);
            result = {
              url: input.url,
              title,
              content,
              format: "markdown",
              excerpt: content.slice(0, 200),
              size: content.length,
              cached: false,
              truncated,
            };
          }
        } else {
          result = {
            url: input.url,
            title: null,
            content: capped,
            format: "text",
            excerpt: capped.slice(0, 200),
            size: body.length, // length of the full response body
            cached: false,
            truncated,
          };
        }

        // ── Cache ────────────────────────────────────────
        // Best effort: an unwritable cache directory must not turn a
        // successful fetch into an error.
        try {
          await cache.set(cacheKey, result);
        } catch {
          // Ignored on purpose; the next call simply fetches again.
        }

        return { type: "json" as const, value: result };
      } catch (err) {
        const message =
          err instanceof Error
            ? err.name === "AbortError"
              ? `Request timed out after ${timeout}ms for ${input.url}`
              : err.message
            : String(err);
        return errorText(`Failed to fetch ${input.url}: ${message}`);
      } finally {
        clearTimeout(timer);
      }
    },
  });
}
|
|
268
|
+
|
|
269
|
+
/* ── Helpers ────────────────────────────────────────────── */
|
|
270
|
+
|
|
271
|
+
/**
 * Choose an output format from a Content-Type header value.
 *
 * The match is case-insensitive and by substring: anything mentioning "json"
 * is JSON, anything mentioning "html" becomes markdown, and everything else is
 * plain text. "json" is tested first.
 */
function detectFormat(contentType: string): "markdown" | "json" | "text" {
  const normalized = contentType.toLowerCase();
  const rules = [
    ["json", "json"],
    ["html", "markdown"],
  ] as const;

  for (const [needle, format] of rules) {
    if (normalized.includes(needle)) {
      return format;
    }
  }
  return "text";
}
|
package/src/web-search.ts
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* web_search tool — search the web and return results.
|
|
3
|
+
*
|
|
4
|
+
* Default backend: DuckDuckGo HTML search (https://html.duckduckgo.com/html).
|
|
5
|
+
* No API key needed. Configurable to use any search endpoint.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { z } from "zod";
|
|
9
|
+
import { Tool } from "@dex-ai/sdk";
|
|
10
|
+
import { extractSearchResults } from "./render";
|
|
11
|
+
import type { WebToolsConfig, SearchResult } from "./types";
|
|
12
|
+
import { DEFAULTS } from "./types";
|
|
13
|
+
|
|
14
|
+
/* ── Tool factory ───────────────────────────────────────── */
|
|
15
|
+
|
|
16
|
+
/**
 * Build the `web_search` tool.
 *
 * The tool POSTs the query as a form field `q` to the configured search
 * endpoint (DuckDuckGo's HTML endpoint by default), parses the returned page
 * with `extractSearchResults`, and replies with a numbered markdown list of
 * at most `count` results. When an API key is configured it is sent as a
 * Bearer `Authorization` header.
 *
 * @param config Tool options; unset values fall back to `DEFAULTS`.
 * @returns The tool definition produced by `Tool.define`.
 */
export function createWebSearchTool(
  config: WebToolsConfig,
): ReturnType<typeof Tool.define> {
  const { searchApiKey } = config;
  const agentHeader = config.userAgent ?? DEFAULTS.userAgent;
  const endpoint = config.searchEndpoint ?? "https://html.duckduckgo.com/html";
  const timeout = config.timeout ?? DEFAULTS.timeout;

  return Tool.define({
    name: "web_search",
    displayName: "Web Search",
    description:
      "Search the web and return matching results with title, URL, and snippet. " +
      "Uses DuckDuckGo by default (no API key needed). " +
      "Use 'site:domain.com' in your query to scope results to a specific domain.",
    access: "read" as const,
    parameters: z.object({
      query: z.string().min(1).describe("The search query."),
      count: z
        .number()
        .min(1)
        .max(20)
        .optional()
        .default(5)
        .describe("Number of results to return (1-20)."),
      site: z
        .string()
        .optional()
        .describe("Scope search to a specific domain (e.g., 'docs.rs')."),
    }),
    async execute(input) {
      // Prefix the query with a site: operator when a domain was given.
      let scopedQuery = input.query;
      if (input.site) {
        scopedQuery = `site:${input.site} ${input.query}`;
      }

      // Some search APIs expect the key in an Authorization header.
      const requestHeaders: Record<string, string> = {
        "User-Agent": agentHeader,
        "Content-Type": "application/x-www-form-urlencoded",
        ...(searchApiKey ? { Authorization: `Bearer ${searchApiKey}` } : {}),
      };

      const abort = new AbortController();
      const deadline = setTimeout(() => abort.abort(), timeout);

      try {
        const response = await fetch(endpoint, {
          method: "POST",
          headers: requestHeaders,
          body: new URLSearchParams({ q: scopedQuery }),
          signal: abort.signal,
        });

        if (!response.ok) {
          return {
            type: "error-text" as const,
            value:
              `Search API returned HTTP ${response.status} ${response.statusText}. ` +
              "Check your searchEndpoint configuration.",
          };
        }

        const page = await response.text();
        const hits = extractSearchResults(page).slice(0, input.count);

        if (hits.length === 0) {
          return {
            type: "text" as const,
            value:
              `No search results found for "${input.query}"${input.site ? ` on ${input.site}` : ""}. ` +
              "Try rephrasing your query or using different terms.",
          };
        }

        // One numbered markdown link per hit, snippet on the following line.
        const entries: string[] = [];
        hits.forEach((hit: SearchResult, index: number) => {
          entries.push(
            `${index + 1}. [${hit.title}](${hit.url})\n ${hit.snippet}`,
          );
        });
        const formatted = entries.join("\n\n");

        return {
          type: "text" as const,
          value: `Search results for "${input.query}"${input.site ? ` (site:${input.site})` : ""}:\n\n${formatted}`,
        };
      } catch (err) {
        let message: string;
        if (!(err instanceof Error)) {
          message = String(err);
        } else if (err.name === "AbortError") {
          message = `Search request timed out after ${timeout}ms`;
        } else {
          message = err.message;
        }
        return {
          type: "error-text" as const,
          value: `Search failed: ${message}`,
        };
      } finally {
        clearTimeout(deadline);
      }
    },
  });
}
|