mcp-vision-web-bridge 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +18 -0
- package/CHANGELOG.md +14 -0
- package/LICENSE +21 -0
- package/README.md +201 -0
- package/SECURITY_REVIEW.md +37 -0
- package/package.json +32 -0
- package/scripts/check-secrets.mjs +73 -0
- package/src/model-client.mjs +757 -0
- package/src/server.mjs +201 -0
- package/src/web-reader.mjs +371 -0
package/src/server.mjs
ADDED
|
@@ -0,0 +1,201 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { readFile } from 'node:fs/promises';
|
|
3
|
+
|
|
4
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
5
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
6
|
+
import { z } from 'zod';
|
|
7
|
+
|
|
8
|
+
import { askModel, askModelWithImage } from './model-client.mjs';
|
|
9
|
+
import { buildWebContext, extractUrls } from './web-reader.mjs';
|
|
10
|
+
|
|
11
|
+
// Model name used when a tool call does not specify one; the MODEL_NAME
// environment variable takes precedence over the placeholder.
const DEFAULT_MODEL = process.env.MODEL_NAME || 'replace-with-your-vision-model';

// Load package.json (one directory above this module) so the server can report
// the version declared there.
const packageJsonUrl = new URL('../package.json', import.meta.url);
const packageJsonText = await readFile(packageJsonUrl, 'utf8');
const packageInfo = JSON.parse(packageJsonText);

const server = new McpServer({ name: 'mcp-vision-web-bridge', version: packageInfo.version });
|
|
18
|
+
|
|
19
|
+
// Tool: hand one image to the model via askModelWithImage and return the
// model's text prefixed with a line naming where the image came from.
server.registerTool(
  'read_image_with_model',
  {
    title: 'Read Image With Model',
    description:
      'Read an image from the latest Claude upload, an image URL, base64, an allowed local path, or clipboard, then send it to the configured vision-capable model.',
    inputSchema: {
      prompt: z.string().default('Please describe this image. If it contains text, extract the text.'),
      image_path: z.string().optional().describe('Local image path. Disabled unless ALLOW_LOCAL_IMAGE_PATHS=true.'),
      image_url: z.string().optional().describe('HTTP/HTTPS image URL or data URL.'),
      image_base64: z.string().optional().describe('Raw base64 or data URL.'),
      mime_type: z.string().default('image/png').describe('MIME type for raw base64 input.'),
      use_latest_upload: z.boolean().default(true).describe('When no image source is provided, read the newest recent Claude upload.'),
      use_clipboard: z.boolean().default(false).describe('Read macOS clipboard image. Disabled unless ALLOW_CLIPBOARD_IMAGES=true.'),
      max_age_minutes: z.number().int().min(1).max(1440).default(240),
      model: z.string().default(DEFAULT_MODEL),
      max_tokens: z.number().int().min(64).max(32768).default(8192),
      temperature: z.number().min(0).max(2).optional()
    }
  },
  async (args) => {
    const {
      prompt,
      image_path: imagePath,
      image_url: imageUrl,
      image_base64: imageBase64,
      mime_type: mimeType,
      use_latest_upload: useLatestUpload,
      use_clipboard: useClipboard,
      max_age_minutes: maxAgeMinutes,
      model,
      max_tokens: maxTokens,
      temperature
    } = args;

    // Any explicitly requested source (the clipboard flag counts as one)
    // switches off the "latest upload" fallback.
    const hasExplicitImage = [imagePath, imageUrl, imageBase64, useClipboard].some(Boolean);

    const result = await askModelWithImage({
      prompt,
      imagePath,
      imageUrl,
      imageBase64,
      mimeType,
      fromClipboard: useClipboard,
      latestClaudeUpload: !hasExplicitImage && useLatestUpload,
      maxAgeMinutes,
      model,
      maxTokens,
      temperature
    });

    const sourceLine = `Image source: ${formatImageSource(result.source)}`;
    return {
      content: [{ type: 'text', text: [sourceLine, '', result.text].join('\n') }]
    };
  }
);
|
|
77
|
+
|
|
78
|
+
// Prompt templates: each one produces a single user message telling the client
// to call `read_image_with_model` with a fixed choice of image source, followed
// by the caller's analysis prompt (or a default one when none is given).
const imagePromptSpecs = [
  {
    name: 'img',
    title: 'Read latest uploaded image',
    description: 'Ask the vision MCP tool to read the latest image uploaded to the client.',
    directive: 'Use the MCP tool `read_image_with_model` with `use_latest_upload=true` and `use_clipboard=false`.'
  },
  {
    name: 'clipboard-image',
    title: 'Read clipboard image',
    description: 'Ask the vision MCP tool to read the current clipboard image.',
    directive: 'Use the MCP tool `read_image_with_model` with `use_clipboard=true` and `use_latest_upload=false`.'
  }
];

for (const { name, title, description, directive } of imagePromptSpecs) {
  server.registerPrompt(
    name,
    {
      title,
      description,
      argsSchema: {
        prompt: z.string().optional().describe('What to analyze in the image')
      }
    },
    async ({ prompt }) => {
      const promptLine = `Prompt: ${prompt || 'Please describe this image. If it contains text, extract the text.'}`;
      return {
        messages: [
          {
            role: 'user',
            content: { type: 'text', text: [directive, promptLine].join('\n') }
          }
        ]
      };
    }
  );
}
|
|
127
|
+
|
|
128
|
+
// Tool: fetch the pages linked in the user's text via buildWebContext, then ask
// the model to act on the instruction with the fetched text appended.
server.registerTool(
  'read_links_with_model',
  {
    title: 'Read Links With Model',
    description:
      'Extract web links from user input, fetch readable page content locally, then ask the configured model to summarize, translate, or answer questions.',
    inputSchema: {
      input: z.string().min(1).describe('User request containing one or more URLs.'),
      instruction: z.string().default('Please read these links, summarize the main points, and answer the user request.'),
      max_urls: z.number().int().min(1).max(8).default(5),
      max_chars_per_url: z.number().int().min(1000).max(60000).default(24000),
      model: z.string().default(DEFAULT_MODEL),
      max_tokens: z.number().int().min(64).max(32768).default(8192),
      temperature: z.number().min(0).max(2).optional()
    }
  },
  async (args) => {
    const { input, instruction, model, temperature } = args;

    // Fail fast when the request contains nothing to fetch.
    if (extractUrls(input).length === 0) {
      throw new Error('No URLs found in input');
    }

    const webContext = await buildWebContext(input, {
      maxUrls: args.max_urls,
      maxCharsPerUrl: args.max_chars_per_url
    });

    const promptLines = [
      instruction,
      '',
      'User request:',
      input,
      '',
      'Fetched web content:',
      webContext.text
    ];
    const answer = await askModel({
      prompt: promptLines.join('\n'),
      model,
      maxTokens: args.max_tokens,
      temperature
    });

    return { content: [{ type: 'text', text: answer }] };
  }
);
|
|
188
|
+
|
|
189
|
+
// Attach the server to a stdio transport once every tool and prompt is registered.
await server.connect(new StdioServerTransport());
|
|
191
|
+
|
|
192
|
+
/**
 * Turn an image-source descriptor into a short human-readable label.
 *
 * @param {{ type?: string } | null | undefined} source - Descriptor whose `type` names the source.
 * @returns {string} A friendly label for known types, the raw `type` for
 *   unrecognised ones, or 'unknown' when no type is present.
 */
function formatImageSource(source) {
  const kind = source?.type;
  if (!kind) {
    return 'unknown';
  }

  const labels = new Map([
    ['latest_upload', 'latest uploaded image'],
    ['clipboard', 'clipboard image'],
    ['local_path', 'local image path'],
    ['image_url', 'image URL'],
    ['data_url', 'data URL'],
    ['base64', 'base64 image']
  ]);

  return labels.has(kind) ? labels.get(kind) : kind;
}
|
|
@@ -0,0 +1,371 @@
|
|
|
1
|
+
import { execFile as execFileCallback } from 'node:child_process';
|
|
2
|
+
import { isIP } from 'node:net';
|
|
3
|
+
import { promisify } from 'node:util';
|
|
4
|
+
|
|
5
|
+
const execFile = promisify(execFileCallback);
|
|
6
|
+
|
|
7
|
+
// User-Agent string sent with every page request made by this module.
const DEFAULT_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36';

// Defaults used when a caller does not override the corresponding option.
const DEFAULT_MAX_CHARS_PER_URL = 24000;
const DEFAULT_MAX_URLS = 5;
const DEFAULT_TIMEOUT_MS = 20000;

// Named HTML entities understood by decodeHtmlEntities, keyed by lower-case name.
const ENTITY_MAP = new Map(
  Object.entries({
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    nbsp: ' '
  })
);
|
|
22
|
+
|
|
23
|
+
/**
 * Collect the distinct http(s) URLs that appear in free-form text, in order of
 * first appearance.
 *
 * @param {unknown} text - Text to scan; anything that is not a non-empty string yields [].
 * @returns {string[]} De-duplicated URLs with trailing punctuation removed by trimUrl.
 */
export function extractUrls(text) {
  if (!text || typeof text !== 'string') {
    return [];
  }

  // A URL run ends at whitespace, quote/angle-bracket characters, an ASCII comma,
  // or full-width CJK punctuation (，。！？、；：). ASCII `?`, `:`, `;` and `!`
  // are legal inside URLs (query strings, ports, parameters), so they must not end
  // the match; a trailing one is stripped by trimUrl instead.
  const urlPattern = /https?:\/\/[^\s<>"'`,\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A]+/g;

  // A Set keeps insertion order, so the result preserves first-seen ordering.
  const unique = new Set();
  for (const match of text.match(urlPattern) ?? []) {
    const url = trimUrl(match);
    if (url) {
      unique.add(url);
    }
  }

  return [...unique];
}
|
|
41
|
+
|
|
42
|
+
/**
 * Reduce an HTML document to plain text: one non-empty line per block-level
 * element, preceded by a "Title: ..." line when a <title> is present.
 *
 * @param {unknown} html - Markup to convert; non-strings and '' yield ''.
 * @param {{ maxChars?: number }} [options] - `maxChars` is the limit passed to
 *   truncate for the body text (the title line is not counted).
 * @returns {string} Title line and body joined by a blank line; empty parts are omitted.
 */
export function htmlToReadableText(html, { maxChars = DEFAULT_MAX_CHARS_PER_URL } = {}) {
  if (typeof html !== 'string' || html === '') {
    return '';
  }

  const title = decodeHtmlEntities(extractTitle(html));

  // Applied in order: drop comments and non-content elements, turn block-level
  // tags into line breaks, then blank out every remaining tag.
  const rewrites = [
    [/<!--[\s\S]*?-->/g, ' '],
    [/<script\b[\s\S]*?<\/script>/gi, ' '],
    [/<style\b[\s\S]*?<\/style>/gi, ' '],
    [/<noscript\b[\s\S]*?<\/noscript>/gi, ' '],
    [/<svg\b[\s\S]*?<\/svg>/gi, ' '],
    [/<canvas\b[\s\S]*?<\/canvas>/gi, ' '],
    [
      /<\/?(?:article|aside|blockquote|br|dd|div|dl|dt|figcaption|figure|footer|form|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>/gi,
      '\n'
    ],
    [/<[^>]+>/g, ' ']
  ];

  let stripped = html;
  for (const [pattern, replacement] of rewrites) {
    stripped = stripped.replace(pattern, replacement);
  }

  // Collapse horizontal whitespace per line and discard lines left empty.
  const keptLines = [];
  for (const rawLine of decodeHtmlEntities(stripped).split(/\n+/)) {
    const line = rawLine.replace(/[ \t\f\v]+/g, ' ').trim();
    if (line) {
      keptLines.push(line);
    }
  }

  const body = truncate(keptLines.join('\n'), maxChars);

  const sections = [];
  if (title) {
    sections.push(`Title: ${title}`);
  }
  if (body) {
    sections.push(body);
  }
  return sections.join('\n\n');
}
|
|
67
|
+
|
|
68
|
+
/**
 * Fetch one URL as readable text, optionally retrying through the Jina reader
 * proxy when the direct fetch throws or its result looks unusable.
 *
 * @param {string} url - Page to read; checked with validateHttpUrl before any request.
 * @param {object} [options]
 * @param {Function} [options.fetchImpl] - fetch-compatible function (defaults to global fetch).
 * @param {number} [options.maxChars] - Limit forwarded to fetchReadableUrlOnce.
 * @param {number} [options.timeoutMs] - Timeout forwarded to fetchReadableUrlOnce.
 * @param {boolean} [options.useJinaFallback] - Defaults to USE_JINA_READER === 'true'.
 * @param {boolean} [options.allowPrivateNetworkUrls] - Defaults to ALLOW_PRIVATE_NETWORK_URLS === 'true'.
 * @returns {Promise<object>} The direct result, or the proxy result relabelled with
 *   the caller's `url`, `fetchedUrl` set to the proxy URL, and `reader: 'jina'`.
 * @throws Whatever validateHttpUrl throws; the direct-fetch error when the fallback
 *   is disabled; the proxy error when the direct fetch threw and the proxy also fails.
 */
export async function fetchReadableUrl(
  url,
  {
    fetchImpl = fetch,
    maxChars = DEFAULT_MAX_CHARS_PER_URL,
    timeoutMs = DEFAULT_TIMEOUT_MS,
    useJinaFallback = process.env.USE_JINA_READER === 'true',
    allowPrivateNetworkUrls = process.env.ALLOW_PRIVATE_NETWORK_URLS === 'true'
  } = {}
) {
  validateHttpUrl(url, { allowPrivateNetworkUrls });

  // Read through the proxy and present the result under the caller's URL.
  const readViaJina = async () => {
    const proxied = await fetchReadableUrlOnce(toJinaReaderUrl(url), { fetchImpl, maxChars, timeoutMs });
    return { ...proxied, url, fetchedUrl: proxied.url, reader: 'jina' };
  };

  let primary;
  try {
    primary = await fetchReadableUrlOnce(url, { fetchImpl, maxChars, timeoutMs });
  } catch (error) {
    if (useJinaFallback) {
      // The direct fetch threw: the proxy result is returned whatever its status.
      return await readViaJina();
    }
    throw error;
  }

  const needsFallback = shouldUseJinaFallback(primary);
  if (!(needsFallback && useJinaFallback)) {
    return primary;
  }

  try {
    const viaJina = await readViaJina();
    if (viaJina.ok) {
      return viaJina;
    }
  } catch {
    // Keep the primary result when the optional reader is unavailable.
  }

  return primary;
}
|
|
135
|
+
|
|
136
|
+
/**
 * Fetch the first `maxUrls` URLs found in `input`, one after another, and
 * assemble their text into a single context string.
 *
 * A URL whose fetch throws is kept in the result as a page with `status: 0`,
 * `ok: false` and the error message as its text.
 *
 * @param {string} input - Free-form text passed to extractUrls.
 * @param {object} [options]
 * @param {Function} [options.fetchImpl] - fetch-compatible function (defaults to global fetch).
 * @param {number} [options.maxUrls] - Maximum number of URLs to fetch.
 * @param {number} [options.maxCharsPerUrl] - Per-page limit, forwarded as `maxChars`.
 * @param {number} [options.timeoutMs] - Per-request timeout.
 * @param {boolean} [options.useJinaFallback] - Defaults to USE_JINA_READER === 'true'.
 * @returns {Promise<{ urls: string[], pages: object[], text: string }>}
 */
export async function buildWebContext(
  input,
  {
    fetchImpl = fetch,
    maxUrls = DEFAULT_MAX_URLS,
    maxCharsPerUrl = DEFAULT_MAX_CHARS_PER_URL,
    timeoutMs = DEFAULT_TIMEOUT_MS,
    useJinaFallback = process.env.USE_JINA_READER === 'true'
  } = {}
) {
  // Fetch one page, converting a thrown error into a failure record.
  const readPage = async (url) => {
    try {
      return await fetchReadableUrl(url, {
        fetchImpl,
        maxChars: maxCharsPerUrl,
        timeoutMs,
        useJinaFallback
      });
    } catch (error) {
      return { url, status: 0, ok: false, text: `读取失败:${error.message}` };
    }
  };

  const urls = extractUrls(input).slice(0, maxUrls);

  const pages = [];
  for (const url of urls) {
    pages.push(await readPage(url));
  }

  // One section per page: source line, status line, then the page text.
  const sections = [];
  pages.forEach((page, index) => {
    const statusLine = page.status ? `Status: ${page.status}` : 'Status: failed';
    sections.push([`Source ${index + 1}: ${page.url}`, statusLine, page.text].join('\n'));
  });

  return { urls, pages, text: sections.join('\n\n---\n\n') };
}
|
|
179
|
+
|
|
180
|
+
/**
 * Perform a single GET of `url` and normalise the response body to text.
 *
 * The request is aborted after `timeoutMs`. When the request itself throws and
 * `fetchImpl` is the global fetch, the URL is retried through curl; errors from
 * an injected `fetchImpl` are rethrown unchanged.
 *
 * @param {string} url - Address to request.
 * @param {{ fetchImpl: Function, maxChars: number, timeoutMs: number }} options
 * @returns {Promise<{ url: string, fetchedUrl: string, status: number, ok: boolean, contentType: string, text: string }>}
 */
async function fetchReadableUrlOnce(url, { fetchImpl, maxChars, timeoutMs }) {
  const aborter = new AbortController();
  const timeoutHandle = setTimeout(() => aborter.abort(), timeoutMs);

  const requestInit = {
    redirect: 'follow',
    signal: aborter.signal,
    headers: {
      'user-agent': DEFAULT_USER_AGENT,
      accept: 'text/html,application/xhtml+xml,application/xml,text/plain,application/json,*/*;q=0.8',
      'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }
  };

  try {
    let response;

    try {
      response = await fetchImpl(url, requestInit);
    } catch (error) {
      if (fetchImpl === fetch) {
        return fetchReadableUrlWithCurl(url, { maxChars, timeoutMs });
      }
      throw error;
    }

    const contentType = response.headers.get('content-type') || '';
    // The abort timer is still armed while the body is read.
    const rawBody = await response.text();

    return {
      url,
      fetchedUrl: response.url || url,
      status: response.status,
      ok: response.ok,
      contentType,
      text: normalizeResponseText(rawBody, contentType, maxChars)
    };
  } finally {
    clearTimeout(timeoutHandle);
  }
}
|
|
224
|
+
|
|
225
|
+
/**
 * Fetch `url` by spawning the `curl` executable and normalise the body to text.
 *
 * curl is asked to append a marker followed by the status code, content type
 * and effective URL (one per line) after the body, so a single stdout capture
 * carries both body and metadata.
 *
 * @param {string} url - Address to request.
 * @param {{ maxChars: number, timeoutMs: number }} options - `timeoutMs` is rounded
 *   up to whole seconds (minimum 1) for curl's `--max-time`.
 * @returns {Promise<{ url: string, fetchedUrl: string, status: number, ok: boolean, contentType: string, text: string }>}
 * @throws {Error} When curl exits unsuccessfully, exceeds the process timeout or
 *   output buffer, or its output lacks the metadata marker.
 */
async function fetchReadableUrlWithCurl(url, { maxChars, timeoutMs }) {
  const marker = '\n__MCP_WEB_READER_META__\n';
  const timeoutSeconds = Math.max(1, Math.ceil(timeoutMs / 1000));

  const curlArgs = [
    '-sS',
    '-L',
    // The URL is untrusted input: stop curl from expanding `{}` / `[]` as glob
    // patterns, which could turn one URL into many requests.
    '--globoff',
    // Allow only http/https, both for the initial request and for redirects.
    '--proto',
    '=http,https',
    '--proto-redir',
    '=http,https',
    '--max-time',
    String(timeoutSeconds),
    '-A',
    DEFAULT_USER_AGENT,
    '-H',
    'Accept: text/html,application/xhtml+xml,application/xml,text/plain,application/json,*/*;q=0.8',
    '-H',
    'Accept-Language: zh-CN,zh;q=0.9,en;q=0.8',
    '-w',
    `${marker}%{http_code}\n%{content_type}\n%{url_effective}`,
    // Pass the URL through an explicit option so it is never parsed as a flag.
    '--url',
    url
  ];
  // NOTE(review): redirect targets followed by `-L` are not re-checked against
  // the private-network rule applied to the original URL — confirm whether
  // that is acceptable for this code path.

  const { stdout } = await execFile('curl', curlArgs, {
    encoding: 'utf8',
    maxBuffer: 16 * 1024 * 1024,
    // Give curl's own --max-time a chance to fire before the process is killed.
    timeout: timeoutMs + 5000
  });

  // Use the last marker occurrence in case the body itself contains the marker text.
  const markerIndex = stdout.lastIndexOf(marker);
  if (markerIndex === -1) {
    throw new Error('curl did not return response metadata');
  }

  const body = stdout.slice(0, markerIndex);
  const [statusLine = '0', contentType = '', effectiveUrl = url] = stdout
    .slice(markerIndex + marker.length)
    .split('\n');
  const status = Number.parseInt(statusLine, 10) || 0;

  return {
    url,
    fetchedUrl: effectiveUrl,
    status,
    ok: status >= 200 && status < 300,
    contentType,
    text: normalizeResponseText(body, contentType, maxChars)
  };
}
|
|
272
|
+
|
|
273
|
+
/**
 * Convert a response body to display text based on its declared content type,
 * falling back to HTML tag sniffing.
 *
 * - Declared JSON that parses is pretty-printed. This takes priority over tag
 *   sniffing so JSON strings containing markup are not flattened as HTML.
 * - Declared `text/html`, or any remaining body that contains something
 *   tag-like, goes through htmlToReadableText.
 * - Declared JSON that fails to parse (and has no tags) is truncated as-is.
 * - Anything else has CRLF normalised and horizontal whitespace collapsed.
 *
 * @param {string} body - Raw response text.
 * @param {string | null | undefined} contentType - Content-Type header value; matched case-insensitively.
 * @param {number} maxChars - Limit passed to truncate / htmlToReadableText.
 * @returns {string}
 */
function normalizeResponseText(body, contentType, maxChars) {
  // A missing header must not throw, and media types are case-insensitive.
  const mediaType = String(contentType ?? '').toLowerCase();
  const declaredHtml = mediaType.includes('text/html');
  const declaredJson = mediaType.includes('application/json');

  if (declaredJson && !declaredHtml) {
    try {
      return truncate(JSON.stringify(JSON.parse(body), null, 2), maxChars);
    } catch {
      // Not valid JSON despite the header; fall through to the generic handling.
    }
  }

  if (declaredHtml || /<\/?[a-z][\s\S]*>/i.test(body)) {
    return htmlToReadableText(body, { maxChars });
  }

  if (declaredJson) {
    return truncate(body, maxChars);
  }

  const plain = body.replace(/\r\n/g, '\n').replace(/[ \t\f\v]+/g, ' ').trim();
  return truncate(plain, maxChars);
}
|
|
290
|
+
|
|
291
|
+
/**
 * Decide whether a fetched page is poor enough to retry through the reader proxy.
 *
 * @param {{ reader?: string, ok: boolean, contentType?: string, text: string }} page
 * @returns {boolean} false for results that already came from the proxy; true for
 *   non-OK responses, PDFs, and short (< 1000 chars) texts that contain a
 *   bot-wall / script-required phrase; false otherwise.
 */
function shouldUseJinaFallback(page) {
  const { reader, ok, contentType } = page;

  if (reader === 'jina') {
    return false;
  }

  const isPdf = /application\/pdf/i.test(contentType);
  if (!ok || isPdf) {
    return true;
  }

  // Only short pages are inspected for block-page phrases.
  const blockPagePhrases =
    /<script|enable javascript|captcha|access denied|cloudflare|something went wrong|try again|privacy related extensions|don.t miss what.s happening/i;
  const isShort = page.text.length < 1000;
  return isShort && blockPagePhrases.test(page.text);
}
|
|
305
|
+
|
|
306
|
+
/**
 * Build the reader-proxy address for a page by prefixing it with the
 * r.jina.ai origin. The target URL is appended verbatim (not encoded).
 *
 * @param {string} url - Page address to proxy.
 * @returns {string}
 */
function toJinaReaderUrl(url) {
  const readerOrigin = 'https://r.jina.ai/';
  return readerOrigin.concat(url);
}
|
|
309
|
+
|
|
310
|
+
/**
 * Return the text of the first <title> element, with any nested tags replaced
 * by spaces and surrounding whitespace trimmed.
 *
 * @param {string} html - Markup to search.
 * @returns {string} The title text, or '' when there is no <title> element.
 */
function extractTitle(html) {
  const found = html.match(/<title\b[^>]*>([\s\S]*?)<\/title>/i);
  if (!found) {
    return '';
  }

  const [, inner] = found;
  return inner.replace(/<[^>]+>/g, ' ').trim();
}
|
|
313
|
+
|
|
314
|
+
/**
 * Decode numeric character references and the named entities listed in
 * ENTITY_MAP. Anything that cannot be decoded is left in the text unchanged.
 *
 * @param {string} text - Text that may contain `&name;`, `&#NNN;` or `&#xHHH;`.
 * @returns {string}
 */
function decodeHtmlEntities(text) {
  return text.replace(/&(#x?[0-9a-f]+|[a-z]+);/gi, (entity, raw) => {
    const lower = raw.toLowerCase();

    if (lower.startsWith('#')) {
      const isHex = lower.startsWith('#x');
      const codePoint = Number.parseInt(lower.slice(isHex ? 2 : 1), isHex ? 16 : 10);

      // String.fromCodePoint throws RangeError for NaN (e.g. "&#abc;", which
      // the pattern accepts as decimal) and for values above U+10FFFF. Pages
      // are untrusted, so keep such references as literal text instead of
      // aborting the whole conversion.
      const isValid = Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
      return isValid ? String.fromCodePoint(codePoint) : entity;
    }

    return ENTITY_MAP.get(lower) ?? entity;
  });
}
|
|
326
|
+
|
|
327
|
+
/**
 * Limit `text` to `maxChars` UTF-16 code units, appending a "[Truncated]"
 * marker when anything was cut.
 *
 * @param {string} text - Text to limit.
 * @param {number} maxChars - Maximum number of code units to keep.
 * @returns {string} `text` unchanged when it fits; otherwise the kept prefix
 *   followed by a blank line and "[Truncated]".
 */
function truncate(text, maxChars) {
  if (text.length <= maxChars) {
    return text;
  }

  // If the cut would fall between the two halves of a surrogate pair, drop the
  // dangling high surrogate too, so the output never ends in half a character.
  let end = maxChars;
  const lastKept = text.charCodeAt(end - 1);
  const firstDropped = text.charCodeAt(end);
  const splitsPair =
    lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
  if (splitsPair) {
    end -= 1;
  }

  return `${text.slice(0, end)}\n\n[Truncated]`;
}
|
|
334
|
+
|
|
335
|
+
/**
 * Reject URLs this module must not fetch.
 *
 * @param {string} url - Candidate address; parsed with the URL constructor.
 * @param {{ allowPrivateNetworkUrls: boolean }} options - When truthy, the
 *   private-host check is skipped.
 * @returns {void}
 * @throws {TypeError} When `url` cannot be parsed (raised by `new URL`).
 * @throws {Error} When the scheme is not http/https, or the host is considered
 *   private by isPrivateHostname and private hosts are not allowed.
 */
function validateHttpUrl(url, { allowPrivateNetworkUrls }) {
  const { protocol, hostname } = new URL(url);

  const isHttpScheme = protocol === 'http:' || protocol === 'https:';
  if (!isHttpScheme) {
    throw new Error('Only http and https URLs are supported');
  }

  if (allowPrivateNetworkUrls) {
    return;
  }

  if (isPrivateHostname(hostname)) {
    throw new Error('Private network URLs are disabled by default');
  }
}
|
|
345
|
+
|
|
346
|
+
/**
 * Report whether a URL hostname names the local machine or a private /
 * link-local network address.
 *
 * Only literal hostnames are inspected; ordinary DNS names other than
 * `localhost` / `*.localhost` are never resolved and always return false.
 *
 * @param {string} hostname - Host part of a URL; IPv6 literals may be bracketed
 *   and a trailing root dot is ignored.
 * @returns {boolean}
 */
function isPrivateHostname(hostname) {
  const normalized = hostname
    .toLowerCase()
    .replace(/^\[|\]$/g, '')
    // "localhost." (fully-qualified form) names the same host as "localhost".
    .replace(/\.$/, '');

  if (normalized === 'localhost' || normalized.endsWith('.localhost')) {
    return true;
  }

  switch (isIP(normalized)) {
    case 4:
      return isPrivateIpv4(normalized);
    case 6:
      return isPrivateIpv6(normalized);
    default:
      return false;
  }
}

// True for 0.0.0.0/8, 10/8, 127/8, 169.254/16, 172.16/12 and 192.168/16.
function isPrivateIpv4(address) {
  const [a, b] = address.split('.').map((part) => Number.parseInt(part, 10));
  return (
    a === 0 ||
    a === 10 ||
    a === 127 ||
    (a === 169 && b === 254) ||
    (a === 172 && b >= 16 && b <= 31) ||
    (a === 192 && b === 168)
  );
}

// True for ::, ::1, fc00::/7, fe80::/10 and IPv4-mapped forms of private IPv4 addresses.
function isPrivateIpv6(address) {
  if (address === '::1' || address === '::') {
    return true;
  }

  // IPv4-mapped addresses reach the embedded IPv4 host. The tail is either
  // dotted ("::ffff:127.0.0.1") or, as the URL parser serialises it, two hex
  // groups ("::ffff:7f00:1").
  const mapped = /^::ffff:(.+)$/.exec(address);
  if (mapped) {
    const tail = mapped[1];
    if (isIP(tail) === 4) {
      return isPrivateIpv4(tail);
    }
    const groups = tail.split(':');
    if (groups.length === 2) {
      const [high, low] = groups.map((group) => Number.parseInt(group, 16));
      return isPrivateIpv4([high >> 8, high & 0xff, low >> 8, low & 0xff].join('.'));
    }
  }

  // Compare the first 16-bit group numerically so the whole fe80::/10 range is
  // covered and short groups such as "fc::" (0x00fc) are not mistaken for fc00::/7.
  const firstGroup = Number.parseInt(address.split(':')[0] || '0', 16);
  return (firstGroup & 0xfe00) === 0xfc00 || (firstGroup & 0xffc0) === 0xfe80;
}
|
|
368
|
+
|
|
369
|
+
/**
 * Strip sentence punctuation and closing brackets that trail a URL captured
 * from running text.
 *
 * Removes any run of ASCII `, . ! ? ; : ) ] } >` and of full-width CJK
 * punctuation `，。！？、；：` at the end of the string.
 *
 * @param {string} value - Candidate URL as matched in text.
 * @returns {string} The value without trailing punctuation.
 */
function trimUrl(value) {
  // Full-width characters are written as escapes so the pattern cannot be
  // silently folded to its ASCII look-alikes by text normalisation.
  return value.replace(/[\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A,.!?;:)\]}>]+$/, '');
}
|