@oyasmi/pipiclaw 0.5.7 → 0.5.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +52 -3
- package/dist/agent/prompt-builder.js +6 -0
- package/dist/index.d.ts +2 -1
- package/dist/index.js +2 -1
- package/dist/paths.d.ts +1 -0
- package/dist/paths.js +1 -0
- package/dist/runtime/bootstrap.d.ts +1 -1
- package/dist/runtime/bootstrap.js +25 -13
- package/dist/runtime/dingtalk.js +0 -3
- package/dist/sandbox.js +63 -5
- package/dist/security/config.js +19 -0
- package/dist/security/network.d.ts +28 -0
- package/dist/security/network.js +246 -0
- package/dist/security/types.d.ts +16 -1
- package/dist/shared/shell-escape.d.ts +7 -0
- package/dist/shared/shell-escape.js +11 -0
- package/dist/subagents/discovery.d.ts +1 -1
- package/dist/subagents/discovery.js +1 -1
- package/dist/subagents/tool.d.ts +2 -0
- package/dist/subagents/tool.js +24 -2
- package/dist/tools/config.d.ts +30 -0
- package/dist/tools/config.js +114 -0
- package/dist/tools/edit.js +2 -2
- package/dist/tools/index.js +22 -0
- package/dist/tools/read.js +6 -6
- package/dist/tools/web-fetch.d.ts +17 -0
- package/dist/tools/web-fetch.js +29 -0
- package/dist/tools/web-search.d.ts +16 -0
- package/dist/tools/web-search.js +29 -0
- package/dist/tools/write-content.js +5 -4
- package/dist/web/client.d.ts +40 -0
- package/dist/web/client.js +181 -0
- package/dist/web/config.d.ts +18 -0
- package/dist/web/config.js +34 -0
- package/dist/web/extract.d.ts +7 -0
- package/dist/web/extract.js +122 -0
- package/dist/web/fetch.d.ts +22 -0
- package/dist/web/fetch.js +148 -0
- package/dist/web/format.d.ts +21 -0
- package/dist/web/format.js +38 -0
- package/dist/web/search-providers.d.ts +15 -0
- package/dist/web/search-providers.js +196 -0
- package/dist/web/search.d.ts +19 -0
- package/dist/web/search.js +52 -0
- package/package.json +9 -2
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { Type } from "@sinclair/typebox";
|
|
2
|
+
import { resolveWebFetchRequest } from "../web/config.js";
|
|
3
|
+
import { runWebFetch } from "../web/fetch.js";
|
|
4
|
+
/** TypeBox schema describing the arguments accepted by the `web_fetch` tool. */
const webFetchSchema = Type.Object({
    // Human-readable note; the description states it is shown to the user.
    label: Type.String({
        description: "Brief description of what you're fetching and why (shown to user)",
    }),
    url: Type.String({
        description: "HTTP or HTTPS URL to fetch",
    }),
    // Optional: either "markdown" or "text".
    extractMode: Type.Optional(
        Type.Union(
            [Type.Literal("markdown"), Type.Literal("text")],
            { description: "Preferred text extraction format for HTML pages" },
        ),
    ),
    maxChars: Type.Optional(
        Type.Number({ description: "Maximum extracted text characters to return" }),
    ),
});
|
|
12
|
+
/**
 * Builds the `web_fetch` agent tool.
 *
 * @param options Provides `webConfig`, `securityConfig`, `workspaceDir` and `channelId`;
 *                they are read from `options` each time the tool executes.
 * @returns A tool definition whose `execute` resolves the request against the
 *          fetch configuration and returns the result of `runWebFetch`.
 */
export function createWebFetchTool(options) {
    const execute = async (_toolCallId, { url, extractMode, maxChars }, signal) => {
        // Fill in defaults / normalise arguments before touching the network.
        const resolvedRequest = resolveWebFetchRequest(options.webConfig.fetch, url, extractMode, maxChars);
        const executionContext = {
            webConfig: options.webConfig,
            securityConfig: options.securityConfig,
            workspaceDir: options.workspaceDir,
            channelId: options.channelId,
        };
        return await runWebFetch(executionContext, resolvedRequest, signal);
    };
    return {
        name: "web_fetch",
        label: "web_fetch",
        description: "Fetch a public URL and extract readable content. Returns text for HTML/JSON/text pages and image content blocks for images.",
        parameters: webFetchSchema,
        execute,
    };
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { AgentTool } from "@mariozechner/pi-agent-core";
|
|
2
|
+
import type { SecurityConfig } from "../security/types.js";
|
|
3
|
+
import type { PipiclawWebToolsConfig } from "./config.js";
|
|
4
|
+
/** Schema type of the arguments accepted by the `web_search` tool. */
declare const webSearchSchema: import("@sinclair/typebox").TObject<{
    label: import("@sinclair/typebox").TString;
    query: import("@sinclair/typebox").TString;
    count: import("@sinclair/typebox").TOptional<import("@sinclair/typebox").TNumber>;
}>;

/** Options required to construct the `web_search` tool. */
export interface WebSearchToolOptions {
    /** Web tools configuration. */
    webConfig: PipiclawWebToolsConfig;
    /** Security configuration passed along with each search. */
    securityConfig: SecurityConfig;
    /** Workspace directory passed along with each search. */
    workspaceDir: string;
    /** Optional channel identifier passed along with each search. */
    channelId?: string;
}

/** Creates the `web_search` agent tool from the given options. */
export declare function createWebSearchTool(options: WebSearchToolOptions): AgentTool<typeof webSearchSchema>;
export {};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import { Type } from "@sinclair/typebox";
|
|
2
|
+
import { resolveWebSearchRequest } from "../web/config.js";
|
|
3
|
+
import { runWebSearch } from "../web/search.js";
|
|
4
|
+
/** TypeBox schema describing the arguments accepted by the `web_search` tool. */
const webSearchSchema = Type.Object({
    // Human-readable note; the description states it is shown to the user.
    label: Type.String({
        description: "Brief description of what you're searching for and why (shown to user)",
    }),
    query: Type.String({
        description: "Search query",
    }),
    count: Type.Optional(
        Type.Number({ description: "Maximum number of results to return (1-10)" }),
    ),
});
|
|
9
|
+
/**
 * Builds the `web_search` agent tool.
 *
 * @param options Provides `webConfig`, `securityConfig`, `workspaceDir` and `channelId`;
 *                they are read from `options` each time the tool executes.
 * @returns A tool definition whose `execute` runs the search and wraps the
 *          resulting text in a single text content block.
 */
export function createWebSearchTool(options) {
    const execute = async (_toolCallId, { query, count }, signal) => {
        // Normalise query/count against the search configuration first.
        const resolved = resolveWebSearchRequest(options.webConfig.search, query, count);
        const executionContext = {
            webConfig: options.webConfig,
            securityConfig: options.securityConfig,
            workspaceDir: options.workspaceDir,
            channelId: options.channelId,
        };
        const outcome = await runWebSearch(executionContext, resolved.query, resolved.count, signal);
        const textBlock = { type: "text", text: outcome.content };
        return {
            content: [textBlock],
            details: outcome.details,
        };
    };
    return {
        name: "web_search",
        label: "web_search",
        description: "Search the public web and return titles, URLs, and snippets from the configured provider.",
        parameters: webSearchSchema,
        execute,
    };
}
|
|
@@ -1,9 +1,10 @@
|
|
|
1
|
+
import { dirname } from "node:path";
|
|
1
2
|
import { DEFAULT_SECURITY_CONFIG } from "../security/config.js";
|
|
2
3
|
import { logSecurityEvent } from "../security/logger.js";
|
|
3
4
|
import { guardPath } from "../security/path-guard.js";
|
|
4
|
-
import {
|
|
5
|
+
import { shellEscapePath } from "../shared/shell-escape.js";
|
|
5
6
|
function getDir(path) {
|
|
6
|
-
return
|
|
7
|
+
return dirname(path);
|
|
7
8
|
}
|
|
8
9
|
function ensureSuccess(result, path) {
|
|
9
10
|
if (result.code !== 0) {
|
|
@@ -41,8 +42,8 @@ export async function writeContent(executor, path, content, signal, options) {
|
|
|
41
42
|
throw new Error(lines.join("\n"));
|
|
42
43
|
}
|
|
43
44
|
}
|
|
44
|
-
const dirPrefix = createParentDir ? `mkdir -p ${
|
|
45
|
-
const result = await executor.exec(`${dirPrefix}cat > ${
|
|
45
|
+
const dirPrefix = createParentDir ? `mkdir -p ${shellEscapePath(getDir(path))} && ` : "";
|
|
46
|
+
const result = await executor.exec(`${dirPrefix}cat > ${shellEscapePath(path)}`, {
|
|
46
47
|
signal,
|
|
47
48
|
stdin: content,
|
|
48
49
|
});
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { Buffer } from "node:buffer";
|
|
2
|
+
import type { SecurityConfig } from "../security/types.js";
|
|
3
|
+
import type { PipiclawWebToolsConfig } from "../tools/config.js";
|
|
4
|
+
/** User-Agent string used by the web HTTP client. */
export declare const WEB_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Pipiclaw/0.5";

/** Configuration and identifying context shared by all web requests. */
export interface WebRuntimeContext {
    webConfig: PipiclawWebToolsConfig;
    securityConfig: SecurityConfig;
    workspaceDir: string;
    channelId?: string;
}

/** Result of a completed HTTP exchange. */
export interface WebHttpResponse {
    /** HTTP status code of the final response. */
    status: number;
    /** URL of the final response. */
    finalUrl: string;
    /** Response headers as a flat string map. */
    headers: Record<string, string>;
    /** Raw response body. */
    body: Buffer;
}

/** Parameters for a single request issued through {@link WebHttpClient}. */
export interface WebHttpRequestOptions {
    method?: "GET" | "POST";
    url: string;
    headers?: Record<string, string>;
    /** Query parameters for the request. */
    params?: Record<string, string | number | boolean | undefined>;
    /** Request body payload. */
    data?: unknown;
    timeoutMs: number;
    signal?: AbortSignal;
    /** Optional redirect limit for this request. */
    maxRedirects?: number;
}

/** HTTP client used by the web tools. */
export declare class WebHttpClient {
    private readonly context;
    constructor(context: WebRuntimeContext);
    /** Performs the request and resolves with the raw response. */
    request(options: WebHttpRequestOptions): Promise<WebHttpResponse>;
    /** Performs the request and resolves with the response plus its body as `T`. */
    requestJson<T>(options: WebHttpRequestOptions): Promise<{
        response: WebHttpResponse;
        data: T;
    }>;
    /** Performs the request and resolves with the response plus its body as text. */
    requestText(options: WebHttpRequestOptions): Promise<{
        response: WebHttpResponse;
        text: string;
    }>;
}

/** Convenience factory for {@link WebHttpClient}. */
export declare function createWebHttpClient(context: WebRuntimeContext): WebHttpClient;
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
import { Buffer } from "node:buffer";
|
|
2
|
+
import axios from "axios";
|
|
3
|
+
import { HttpProxyAgent } from "http-proxy-agent";
|
|
4
|
+
import { HttpsProxyAgent } from "https-proxy-agent";
|
|
5
|
+
import { getProxyForUrl } from "proxy-from-env";
|
|
6
|
+
import { SocksProxyAgent } from "socks-proxy-agent";
|
|
7
|
+
import { logSecurityEvent } from "../security/logger.js";
|
|
8
|
+
import { NetworkGuardError, validateNetworkTarget, validateRedirectTarget } from "../security/network.js";
|
|
9
|
+
/** Default `User-Agent` header value for outbound web requests. */
export const WEB_USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_7_2) AppleWebKit/537.36 (KHTML, like Gecko) Pipiclaw/0.5";

/** Proxy agents keyed by `<request protocol>|<proxy URL>` so they are reused across requests. */
const agentCache = new Map();
|
|
11
|
+
/**
 * Flattens a header bag into a plain object with lower-cased names and string values.
 *
 * Arrays are joined with ", ", other non-nullish values are stringified, and
 * `null`/`undefined` entries are dropped. Non-object input yields `{}`.
 */
function normalizeHeaders(headers) {
    const normalized = {};
    if (!headers || typeof headers !== "object") {
        return normalized;
    }
    for (const name of Object.keys(headers)) {
        const raw = headers[name];
        if (raw === undefined || raw === null) {
            continue;
        }
        const lowered = name.toLowerCase();
        if (Array.isArray(raw)) {
            normalized[lowered] = raw.join(", ");
        }
        else {
            normalized[lowered] = typeof raw === "string" ? raw : String(raw);
        }
    }
    return normalized;
}
|
|
29
|
+
/**
 * Returns `url` with `params` applied as query-string entries.
 *
 * Existing entries with the same name are overwritten; `undefined` values are
 * skipped. When `params` is falsy the URL is returned untouched (and unparsed).
 */
function buildUrlWithParams(url, params) {
    if (!params) {
        return url;
    }
    const target = new URL(url);
    Object.entries(params)
        .filter(([, value]) => value !== undefined)
        .forEach(([name, value]) => {
            target.searchParams.set(name, String(value));
        });
    return target.href;
}
|
|
42
|
+
/**
 * Picks (and caches) the proxy agent to use for `requestUrl`.
 *
 * The explicit proxy wins when it is non-blank; otherwise the proxy is looked
 * up through `getProxyForUrl`. Returns `undefined` when no proxy applies.
 */
function getProxyAgent(requestUrl, explicitProxy) {
    const proxyUrl = explicitProxy?.trim() || getProxyForUrl(requestUrl);
    if (!proxyUrl) {
        return undefined;
    }
    const requestProtocol = new URL(requestUrl).protocol;
    const proxyProtocol = new URL(proxyUrl).protocol;
    const cacheKey = `${requestProtocol}|${proxyUrl}`;
    const existing = agentCache.get(cacheKey);
    if (existing) {
        return existing;
    }
    // SOCKS proxies use one agent type for every request; HTTP(S) proxies need
    // an agent matching the protocol of the request being tunnelled.
    const created = proxyProtocol.startsWith("socks")
        ? new SocksProxyAgent(proxyUrl)
        : requestProtocol === "https:"
            ? new HttpsProxyAgent(proxyUrl)
            : new HttpProxyAgent(proxyUrl);
    agentCache.set(cacheKey, created);
    return created;
}
|
|
67
|
+
/**
 * Records a "network" security event for a request that was rejected,
 * copying the diagnostic fields carried by `error`.
 */
function logBlockedRequest(context, error) {
    const { workspaceDir, securityConfig, channelId } = context;
    const { url, stage, resolvedHost, resolvedAddress, category } = error;
    const event = {
        type: "network",
        tool: "web",
        channelId,
        url,
        stage,
        resolvedHost,
        resolvedAddress,
        category,
        reason: error.message,
    };
    logSecurityEvent(workspaceDir, securityConfig, event);
}
|
|
80
|
+
/** Decodes a byte buffer as UTF-8; with `fatal: false`, invalid sequences become U+FFFD instead of throwing. */
function decodeBody(body) {
    const utf8Decoder = new TextDecoder("utf-8", { fatal: false });
    return utf8Decoder.decode(body);
}
|
|
83
|
+
/** True for the HTTP status codes this client follows as redirects (301, 302, 303, 307, 308). */
function isRedirectStatus(status) {
    switch (status) {
        case 301:
        case 302:
        case 303:
        case 307:
        case 308:
            return true;
        default:
            return false;
    }
}
|
|
86
|
+
/** Header names that must not be forwarded to a different origin when following a redirect. */
const CROSS_ORIGIN_SENSITIVE_HEADERS = ["authorization", "cookie", "proxy-authorization"];
/** Header names describing a request body; dropped once a redirect turns the request into a body-less GET. */
const REQUEST_BODY_HEADERS = ["content-type", "content-length"];

/** Returns a copy of `headers` without the entries whose name (case-insensitive) is listed in `names`. */
function omitHeaders(headers, names) {
    return Object.fromEntries(Object.entries(headers).filter(([name]) => !names.includes(name.toLowerCase())));
}

/**
 * HTTP client for the web tools.
 *
 * Redirects are followed manually (axios is called with `maxRedirects: 0`) so
 * that every hop is validated by the network guard before it is requested.
 */
export class WebHttpClient {
    /**
     * @param context Web/security configuration plus workspace and channel
     *                identifiers used for validation, proxying and logging.
     */
    constructor(context) {
        this.context = context;
    }
    /**
     * Performs a request, following up to `maxRedirects` redirects.
     *
     * Any HTTP status is returned to the caller (`validateStatus` always passes);
     * only guard rejections, transport errors and redirect overflow throw.
     *
     * @param options Request description; `options.maxRedirects` overrides the
     *                limit from the security configuration.
     * @returns Status, final URL, lower-cased headers and the body as a Buffer.
     * @throws NetworkGuardError when a target is rejected (also logged as a security event).
     * @throws Error when the redirect limit is exceeded.
     */
    async request(options) {
        const maxRedirects = options.maxRedirects ?? this.context.securityConfig.networkGuard.maxRedirects;
        let currentUrl = buildUrlWithParams(options.url, options.params);
        let method = options.method ?? "GET";
        let data = options.data;
        // Caller headers override the defaults; the set may shrink while following redirects.
        let requestHeaders = {
            "User-Agent": WEB_USER_AGENT,
            Accept: "*/*",
            ...options.headers,
        };
        for (let redirectCount = 0; redirectCount <= maxRedirects; redirectCount++) {
            try {
                if (redirectCount === 0) {
                    await validateNetworkTarget(currentUrl, { config: this.context.securityConfig });
                }
                else {
                    await validateRedirectTarget(currentUrl, { config: this.context.securityConfig });
                }
            }
            catch (error) {
                if (error instanceof NetworkGuardError) {
                    logBlockedRequest(this.context, error);
                }
                throw error;
            }
            const agent = getProxyAgent(currentUrl, this.context.webConfig.proxy);
            const response = await axios.request({
                method,
                url: currentUrl,
                data,
                headers: requestHeaders,
                responseType: "arraybuffer",
                validateStatus: () => true,
                timeout: options.timeoutMs,
                signal: options.signal,
                maxRedirects: 0,
                proxy: false,
                httpAgent: agent,
                httpsAgent: agent,
            });
            const headers = normalizeHeaders(response.headers);
            const body = Buffer.isBuffer(response.data) ? response.data : Buffer.from(response.data);
            if (isRedirectStatus(response.status) && headers.location) {
                if (redirectCount === maxRedirects) {
                    throw new Error(`Too many redirects while fetching ${options.url}`);
                }
                const nextUrl = new URL(headers.location, currentUrl);
                if (nextUrl.origin !== new URL(currentUrl).origin) {
                    // Never hand credentials meant for the original origin to
                    // whatever host the redirect points at.
                    requestHeaders = omitHeaders(requestHeaders, CROSS_ORIGIN_SENSITIVE_HEADERS);
                }
                currentUrl = nextUrl.toString();
                if (response.status === 303 ||
                    ((response.status === 301 || response.status === 302) && method === "POST")) {
                    method = "GET";
                    data = undefined;
                    // The body is gone, so headers describing it no longer apply.
                    requestHeaders = omitHeaders(requestHeaders, REQUEST_BODY_HEADERS);
                }
                continue;
            }
            return {
                status: response.status,
                finalUrl: currentUrl,
                headers,
                body,
            };
        }
        throw new Error(`Too many redirects while fetching ${options.url}`);
    }
    /**
     * Performs a request with `Accept: application/json` (caller headers may
     * override it) and parses the UTF-8 decoded body as JSON.
     *
     * @throws Error when the body is not valid JSON.
     */
    async requestJson(options) {
        const response = await this.request({
            ...options,
            headers: {
                Accept: "application/json",
                ...options.headers,
            },
        });
        const text = decodeBody(response.body);
        try {
            return {
                response,
                data: JSON.parse(text),
            };
        }
        catch (error) {
            throw new Error(`Expected JSON response from ${options.url}, got invalid JSON: ${error instanceof Error ? error.message : String(error)}`);
        }
    }
    /** Performs a request and returns the response together with its body decoded as UTF-8 text. */
    async requestText(options) {
        const response = await this.request(options);
        return {
            response,
            text: decodeBody(response.body),
        };
    }
}
|
|
179
|
+
/** Factory returning a new `WebHttpClient` bound to `context`. */
export function createWebHttpClient(context) {
    const client = new WebHttpClient(context);
    return client;
}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import type { PipiclawWebFetchConfig, PipiclawWebSearchConfig, PipiclawWebToolsConfig } from "../tools/config.js";
|
|
2
|
+
/** Search request after normalisation against the search configuration. */
export interface ResolvedWebSearchRequest {
    query: string;
    count: number;
    timeoutMs: number;
}

/** Fetch request after normalisation against the fetch configuration. */
export interface ResolvedWebFetchRequest {
    url: string;
    extractMode: "markdown" | "text";
    maxChars: number;
    timeoutMs: number;
    maxImageBytes: number;
    preferJina: boolean;
    enableJinaFallback: boolean;
}

/** Resolves the arguments of a search call into a {@link ResolvedWebSearchRequest}. */
export declare function resolveWebSearchRequest(config: PipiclawWebSearchConfig, query: string, count?: number): ResolvedWebSearchRequest;

/** Resolves the arguments of a fetch call into a {@link ResolvedWebFetchRequest}. */
export declare function resolveWebFetchRequest(config: PipiclawWebFetchConfig, url: string, extractMode?: "markdown" | "text", maxChars?: number): ResolvedWebFetchRequest;

/** Reports whether the web tools are enabled in `config`. */
export declare function isWebToolsEnabled(config: PipiclawWebToolsConfig): boolean;
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
 * Normalises the arguments of a search call.
 *
 * The query is trimmed; `count` is passed through `clamp` with the range 1–10
 * and `config.maxResults` as fallback; the timeout comes from the config.
 */
export function resolveWebSearchRequest(config, query, count) {
    const trimmedQuery = query.trim();
    const resultCount = clamp(count, config.maxResults, 1, 10);
    return {
        query: trimmedQuery,
        count: resultCount,
        timeoutMs: config.timeoutMs,
    };
}
|
|
8
|
+
/**
 * Normalises the arguments of a fetch call.
 *
 * The URL is trimmed; an `extractMode` other than "text"/"markdown" falls back
 * to `config.defaultExtractMode`; `maxChars` is passed through `clamp` with a
 * minimum of 100 and `config.maxChars` as fallback. The remaining fields are
 * copied from the config.
 */
export function resolveWebFetchRequest(config, url, extractMode, maxChars) {
    const trimmedUrl = url.trim();
    const isExplicitMode = extractMode === "text" || extractMode === "markdown";
    const resolvedMode = isExplicitMode ? extractMode : config.defaultExtractMode;
    const resolvedMaxChars = clamp(maxChars, config.maxChars, 100);
    return {
        url: trimmedUrl,
        extractMode: resolvedMode,
        maxChars: resolvedMaxChars,
        timeoutMs: config.timeoutMs,
        maxImageBytes: config.maxImageBytes,
        preferJina: config.preferJina,
        enableJinaFallback: config.enableJinaFallback,
    };
}
|
|
19
|
+
/** Web tools count as enabled unless `config.enable` is exactly `false`. */
export function isWebToolsEnabled(config) {
    const explicitlyDisabled = config.enable === false;
    return !explicitlyDisabled;
}
|
|
22
|
+
/**
 * Normalises an optional numeric argument into the range `[minimum, maximum]`.
 *
 * @param value    Caller-supplied value; may be missing or not a finite number.
 * @param fallback Returned when `value` is not a finite number.
 * @param minimum  Lower bound (inclusive).
 * @param maximum  Optional upper bound (inclusive).
 * @returns `fallback` for missing/non-finite input, otherwise the floored value
 *          limited to the bounds.
 */
function clamp(value, fallback, minimum, maximum) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
        return fallback;
    }
    const normalized = Math.floor(value);
    // Out-of-range requests are pulled to the nearest bound rather than
    // discarded: asking for 20 results with a cap of 10 should yield 10, not
    // whatever the configured default happens to be.
    if (normalized < minimum) {
        return minimum;
    }
    if (maximum !== undefined && normalized > maximum) {
        return maximum;
    }
    return normalized;
}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/** Converts an HTML string to plain text. */
export declare function htmlToText(html: string): string;

/** Converts an HTML string to Markdown. */
export declare function htmlToMarkdown(html: string): string;

/**
 * Extracts the readable content of an HTML page in the requested format.
 *
 * @returns The page title, the extracted content and the name of the extractor that produced it.
 */
export declare function extractReadableContent(html: string, url: string, extractMode: "markdown" | "text"): {
    title: string;
    content: string;
    extractor: string;
};
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
import { Readability } from "@mozilla/readability";
|
|
2
|
+
import { JSDOM, VirtualConsole } from "jsdom";
|
|
3
|
+
// DOM `nodeType` values compared against in renderNode.
const ELEMENT_NODE = 1; // Node.ELEMENT_NODE
const TEXT_NODE = 3; // Node.TEXT_NODE
|
|
5
|
+
/** Creates a jsdom VirtualConsole whose "jsdomError" events are consumed and discarded. */
function createSilentVirtualConsole() {
    const silentConsole = new VirtualConsole();
    const discardJsdomError = (_error) => {
        // Deliberately empty: parser noise (e.g. malformed inline CSS) does not
        // prevent Readability / text extraction from working, and forwarding
        // jsdom's internal parse warnings would only clutter the Pipiclaw
        // runtime logs.
    };
    silentConsole.on("jsdomError", discardJsdomError);
    return silentConsole;
}
|
|
14
|
+
/**
 * Parses `html` into a JSDOM instance with a silent virtual console.
 * The `url` option is only passed to jsdom when a truthy URL is supplied.
 */
function createDom(html, url) {
    const options = { virtualConsole: createSilentVirtualConsole() };
    if (url) {
        options.url = url;
    }
    return new JSDOM(html, options);
}
|
|
21
|
+
/**
 * Tidies whitespace: CRLF becomes LF, trailing spaces/tabs before a newline are
 * removed, runs of three or more newlines collapse to two, and the result is trimmed.
 */
function normalizeWhitespace(value) {
    const unixNewlines = value.replace(/\r\n/g, "\n");
    const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, "\n");
    const collapsed = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
    return collapsed.trim();
}
|
|
28
|
+
/** Prefixes every Markdown-significant character in `value` with a backslash. */
function escapeMarkdown(value) {
    return value.replace(/[\\`*_{}[\]()#+.!>-]/g, (special) => "\\" + special);
}
|
|
31
|
+
/**
 * Recursively renders a DOM node as Markdown.
 *
 * Text nodes are Markdown-escaped; non-element, non-text nodes render as "".
 * `<script>` and `<style>` elements are skipped entirely so their source never
 * shows up as page content. Unknown elements render their children, with
 * div/section/article followed by a blank line.
 *
 * @param node A DOM node (anything exposing `nodeType`, `textContent`, and for
 *             elements `tagName`, `childNodes`, `children`, `getAttribute`).
 * @returns The Markdown fragment for the node.
 */
function renderNode(node) {
    if (node.nodeType === TEXT_NODE) {
        return escapeMarkdown(node.textContent ?? "");
    }
    if (node.nodeType !== ELEMENT_NODE) {
        return "";
    }
    const element = node;
    const tag = element.tagName.toLowerCase();
    if (tag === "script" || tag === "style") {
        // Code and CSS are not readable content.
        return "";
    }
    const children = Array.from(element.childNodes ?? [])
        .map((child) => renderNode(child))
        .join("")
        .trim();
    const heading = /^h([1-6])$/.exec(tag);
    if (heading) {
        return `${"#".repeat(Number(heading[1]))} ${children}\n\n`;
    }
    switch (tag) {
        case "p":
            return `${children}\n\n`;
        case "br":
            return "\n";
        case "strong":
        case "b":
            return `**${children}**`;
        case "em":
        case "i":
            return `*${children}*`;
        case "code":
            // Backslash escapes are literal inside a code span, so use the raw
            // text instead of the escaped rendering of the children.
            return `\`${(element.textContent ?? "").trim()}\``;
        case "pre":
            return `\`\`\`\n${element.textContent?.trim() ?? ""}\n\`\`\`\n\n`;
        case "blockquote":
            return `${children
                .split("\n")
                .map((line) => (line.trim() ? `> ${line}` : ">"))
                .join("\n")}\n\n`;
        case "ul":
            return `${Array.from(element.children ?? [])
                .map((child) => `- ${renderNode(child).trim()}`)
                .join("\n")}\n\n`;
        case "ol":
            return `${Array.from(element.children ?? [])
                .map((child, index) => `${index + 1}. ${renderNode(child).trim()}`)
                .join("\n")}\n\n`;
        case "li":
            return children;
        case "a": {
            const href = element.getAttribute("href");
            return href ? `[${children || href}](${href})` : children;
        }
        default:
            return children ? `${children}${["div", "section", "article"].includes(tag) ? "\n\n" : ""}` : "";
    }
}
|
|
95
|
+
/**
 * Converts an HTML string to whitespace-normalised plain text.
 *
 * `<script>` and `<style>` elements are removed first; `textContent` would
 * otherwise include their source code in the extracted text.
 *
 * @param html HTML markup to convert.
 * @returns The text content of the document body.
 */
export function htmlToText(html) {
    const { document } = createDom(html).window;
    const body = document.body;
    for (const element of Array.from(body.querySelectorAll("script, style"))) {
        element.remove();
    }
    return normalizeWhitespace(body.textContent ?? "");
}
|
|
99
|
+
/** Converts an HTML string to Markdown by rendering each child node of the body and normalising whitespace. */
export function htmlToMarkdown(html) {
    const { body } = createDom(html).window.document;
    let markdown = "";
    for (const child of Array.from(body.childNodes)) {
        markdown += renderNode(child);
    }
    return normalizeWhitespace(markdown);
}
|
|
104
|
+
/**
 * Extracts the main content of a page.
 *
 * Readability is tried first; when it yields no article, the whole document is
 * converted instead. The `extractor` field records which path and format were
 * used ("readability-*" or "html-*", suffixed with "text" or "markdown").
 */
export function extractReadableContent(html, url, extractMode) {
    const dom = createDom(html, url);
    const wantsText = extractMode === "text";
    const convert = wantsText ? htmlToText : htmlToMarkdown;
    const formatSuffix = wantsText ? "text" : "markdown";
    const article = new Readability(dom.window.document).parse();
    if (article) {
        return {
            title: article.title?.trim() ?? "",
            content: convert(article.content ?? ""),
            extractor: `readability-${formatSuffix}`,
        };
    }
    // No article detected: fall back to converting the full page.
    const fallbackContent = convert(html);
    return {
        title: dom.window.document.title?.trim() ?? "",
        content: fallbackContent,
        extractor: `html-${formatSuffix}`,
    };
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import type { ImageContent, TextContent } from "@mariozechner/pi-ai";
|
|
2
|
+
import type { SecurityConfig } from "../security/types.js";
|
|
3
|
+
import type { PipiclawWebToolsConfig } from "../tools/config.js";
|
|
4
|
+
import { type FormattedFetchDetails } from "./format.js";
|
|
5
|
+
/** Configuration and identifying context needed to execute a web fetch. */
export interface WebFetchExecutionContext {
    webConfig: PipiclawWebToolsConfig;
    securityConfig: SecurityConfig;
    workspaceDir: string;
    channelId?: string;
}

/** Result of a web fetch: content blocks plus metadata about the fetch. */
export interface WebFetchOutput {
    content: (TextContent | ImageContent)[];
    details: FormattedFetchDetails;
}

/**
 * Fetches `request.url` and returns its content.
 *
 * @param context Execution context for the fetch.
 * @param request Resolved fetch parameters.
 * @param signal  Optional abort signal.
 */
export declare function runWebFetch(context: WebFetchExecutionContext, request: {
    url: string;
    extractMode: "markdown" | "text";
    maxChars: number;
    maxImageBytes: number;
    preferJina: boolean;
    enableJinaFallback: boolean;
}, signal?: AbortSignal): Promise<WebFetchOutput>;
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import { createWebHttpClient } from "./client.js";
|
|
2
|
+
import { extractReadableContent } from "./extract.js";
|
|
3
|
+
import { buildFetchedImageContent, buildFetchedTextContent } from "./format.js";
|
|
4
|
+
/**
 * Truncates `text` to at most `maxChars` UTF-16 code units.
 *
 * The cut never lands between the two halves of a surrogate pair: if the last
 * kept unit would be a high surrogate it is dropped as well, so the result can
 * be one unit shorter than `maxChars` but never ends in a broken character.
 *
 * @returns The (possibly shortened) text and whether truncation happened.
 */
function trimToMaxChars(text, maxChars) {
    if (text.length <= maxChars) {
        return { text, truncated: false };
    }
    let end = maxChars;
    if (end > 0) {
        const lastKept = text.charCodeAt(end - 1);
        // 0xD800–0xDBFF are high surrogates, i.e. the first half of a pair.
        if (lastKept >= 0xd800 && lastKept <= 0xdbff) {
            end -= 1;
        }
    }
    return {
        text: text.slice(0, end),
        truncated: true,
    };
}
|
|
13
|
+
/** Decodes bytes as UTF-8; with `fatal: false`, malformed sequences decode to U+FFFD rather than throwing. */
function decodeUtf8(body) {
    const utf8Decoder = new TextDecoder("utf-8", { fatal: false });
    return utf8Decoder.decode(body);
}
|
|
16
|
+
/**
 * Decides whether a response should be treated as HTML.
 *
 * A content type containing "text/html" is sufficient; otherwise the first
 * 256 bytes are sniffed for a leading `<!doctype` or `<html` (case-insensitive,
 * ignoring leading whitespace).
 */
function isHtmlContent(contentType, body) {
    if (contentType.includes("text/html")) {
        return true;
    }
    const sniffLength = Math.min(body.length, 256);
    const prefix = decodeUtf8(body.subarray(0, sniffLength));
    const head = prefix.trimStart().toLowerCase();
    return ["<!doctype", "<html"].some((marker) => head.startsWith(marker));
}
|
|
25
|
+
/**
 * Fetches `url` through the Jina reader endpoint (`https://r.jina.ai/<url>`).
 *
 * The search API key is sent as a bearer token only when the configured search
 * provider is "jina". Returns `null` when the response is not 2xx or carries no
 * content; request and JSON-parsing errors propagate to the caller.
 *
 * @returns Text content (title as `# ` heading when present, truncated to
 *          `maxChars`) plus fetch details, or `null`.
 */
async function tryFetchViaJina(context, url, maxChars, signal) {
    const { webConfig, securityConfig, workspaceDir, channelId } = context;
    const client = createWebHttpClient({ webConfig, securityConfig, workspaceDir, channelId });
    const headers = { Accept: "application/json" };
    const searchConfig = webConfig.search;
    if (searchConfig.apiKey && searchConfig.provider === "jina") {
        headers.Authorization = `Bearer ${searchConfig.apiKey}`;
    }
    const { response, data } = await client.requestJson({
        url: `https://r.jina.ai/${url}`,
        headers,
        timeoutMs: webConfig.fetch.timeoutMs,
        signal,
    });
    const succeeded = response.status >= 200 && response.status < 300;
    if (!succeeded || !data.data?.content) {
        return null;
    }
    const payload = data.data;
    const title = payload.title?.trim();
    const markdown = title ? `# ${title}\n\n${payload.content}` : payload.content;
    const { text, truncated } = trimToMaxChars(markdown, maxChars);
    return {
        content: buildFetchedTextContent(text),
        details: {
            url,
            finalUrl: payload.url?.trim() || response.finalUrl,
            status: response.status,
            extractor: "jina",
            truncated,
            length: text.length,
            untrusted: true,
            contentType: "text/markdown",
        },
    };
}
|
|
62
|
+
/**
 * Fetches `url` directly and converts the response into tool content.
 *
 * - `image/*` responses become an image content block (rejected when larger
 *   than `maxImageBytes`).
 * - `application/json` responses are pretty-printed; a body that fails to parse
 *   is returned as plain text instead of failing the whole fetch.
 * - HTML (by content type or sniffing) goes through readable-content extraction.
 * - Anything else is decoded as UTF-8 text.
 *
 * Text results are trimmed and truncated to `maxChars`.
 *
 * @throws Error on non-2xx status or an oversized image.
 */
async function fetchDirect(context, url, extractMode, maxChars, maxImageBytes, signal) {
    const client = createWebHttpClient({
        webConfig: context.webConfig,
        securityConfig: context.securityConfig,
        workspaceDir: context.workspaceDir,
        channelId: context.channelId,
    });
    const response = await client.request({
        url,
        timeoutMs: context.webConfig.fetch.timeoutMs,
        signal,
    });
    if (response.status < 200 || response.status >= 300) {
        throw new Error(`Failed to fetch ${url}: HTTP ${response.status}`);
    }
    const contentType = response.headers["content-type"]?.toLowerCase() ?? "application/octet-stream";
    if (contentType.startsWith("image/")) {
        if (response.body.length > maxImageBytes) {
            throw new Error(`Fetched image exceeds maxImageBytes (${maxImageBytes} bytes)`);
        }
        // Strip parameters and surrounding whitespace ("image/png ; q=1" -> "image/png").
        const mimeType = contentType.split(";")[0].trim();
        return {
            content: buildFetchedImageContent(response.body.toString("base64"), mimeType, response.finalUrl),
            details: {
                url,
                finalUrl: response.finalUrl,
                status: response.status,
                extractor: "direct-image",
                truncated: false,
                length: response.body.length,
                untrusted: true,
                contentType,
            },
        };
    }
    let text = "";
    let extractor = "raw";
    if (contentType.includes("application/json")) {
        const raw = decodeUtf8(response.body);
        try {
            text = JSON.stringify(JSON.parse(raw), null, 2);
            extractor = "json";
        }
        catch {
            // Mislabelled or malformed JSON: the raw text is still more useful
            // to the caller than a bare SyntaxError.
            text = raw;
            extractor = "text";
        }
    }
    else if (isHtmlContent(contentType, response.body)) {
        const html = decodeUtf8(response.body);
        const article = extractReadableContent(html, response.finalUrl, extractMode);
        text = article.title ? `# ${article.title}\n\n${article.content}` : article.content;
        extractor = article.extractor;
    }
    else {
        text = decodeUtf8(response.body);
        extractor = "text";
    }
    const trimmed = trimToMaxChars(text.trim(), maxChars);
    return {
        content: buildFetchedTextContent(trimmed.text),
        details: {
            url,
            finalUrl: response.finalUrl,
            status: response.status,
            extractor,
            truncated: trimmed.truncated,
            length: trimmed.text.length,
            untrusted: true,
            contentType,
        },
    };
}
|
|
128
|
+
/**
 * Fetches `request.url`, choosing between the Jina reader and a direct fetch.
 *
 * - With `preferJina`, Jina is tried first; if it yields nothing or fails, the
 *   direct fetch is attempted.
 * - If the direct fetch throws and `enableJinaFallback` is set, Jina is tried
 *   as a fallback; when that also fails, the direct fetch's error is rethrown.
 * - Once `signal` is aborted, errors propagate immediately without further attempts.
 *
 * @param context Execution context (configuration, workspace, channel).
 * @param request Resolved fetch parameters.
 * @param signal  Optional abort signal.
 * @returns The fetched content and its details.
 */
export async function runWebFetch(context, request, signal) {
    if (request.preferJina) {
        try {
            const jinaResult = await tryFetchViaJina(context, request.url, request.maxChars, signal);
            if (jinaResult) {
                return jinaResult;
            }
        }
        catch (error) {
            if (signal?.aborted) {
                throw error;
            }
            // Jina is only a preference: a failure there must not prevent the
            // direct fetch from being attempted.
        }
    }
    try {
        return await fetchDirect(context, request.url, request.extractMode, request.maxChars, request.maxImageBytes, signal);
    }
    catch (error) {
        if (!request.enableJinaFallback || signal?.aborted) {
            throw error;
        }
        let jinaResult = null;
        try {
            jinaResult = await tryFetchViaJina(context, request.url, request.maxChars, signal);
        }
        catch {
            // The fallback failed as well; report the direct fetch's error
            // below, since that is the request the caller actually asked for.
        }
        if (jinaResult) {
            return jinaResult;
        }
        throw error;
    }
}
|