@endday/search-mcp 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +4724 -0
- package/dist/search-mcp.js +4715 -0
- package/package.json +14 -14
- package/data/blocklist.generated.js +0 -2
- package/envs.js +0 -129
- package/index.js +0 -6
- package/mcp/search-mcp.js +0 -8
- package/src/content/extract.impl.js +0 -228
- package/src/content/extract.js +0 -1
- package/src/content/fetch.impl.js +0 -400
- package/src/content/fetch.js +0 -1
- package/src/core/crypto.js +0 -7
- package/src/core/errors.impl.js +0 -52
- package/src/core/errors.js +0 -1
- package/src/core/html.impl.js +0 -69
- package/src/core/html.js +0 -1
- package/src/mcp/config.js +0 -75
- package/src/mcp/format.js +0 -44
- package/src/mcp/index.js +0 -10
- package/src/mcp/local/content.js +0 -26
- package/src/mcp/local/search.js +0 -233
- package/src/mcp/schemas.js +0 -132
- package/src/mcp/server.js +0 -97
- package/src/mcp/tools/content.js +0 -31
- package/src/mcp/tools/jinaContent.js +0 -38
- package/src/mcp/tools/newsSearch.js +0 -22
- package/src/mcp/tools/webSearch.js +0 -57
- package/src/platform/auth.impl.js +0 -166
- package/src/platform/auth.js +0 -1
- package/src/platform/cache.impl.js +0 -166
- package/src/platform/cache.js +0 -1
- package/src/platform/health.impl.js +0 -133
- package/src/platform/health.js +0 -1
- package/src/platform/http.impl.js +0 -108
- package/src/platform/http.js +0 -1
- package/src/platform/logger.impl.js +0 -51
- package/src/platform/logger.js +0 -1
- package/src/platform/metrics.impl.js +0 -43
- package/src/platform/metrics.js +0 -1
- package/src/platform/nodeHttpClient.js +0 -104
- package/src/platform/rateLimit.impl.js +0 -141
- package/src/platform/rateLimit.js +0 -1
- package/src/platform/requestContext.impl.js +0 -10
- package/src/platform/requestContext.js +0 -1
- package/src/platform/session.impl.js +0 -198
- package/src/platform/session.js +0 -1
- package/src/platform/stateKv.impl.js +0 -18
- package/src/platform/stateKv.js +0 -1
- package/src/platform/tasks.impl.js +0 -17
- package/src/platform/tasks.js +0 -1
- package/src/routes/requestParams.impl.js +0 -12
- package/src/routes/requestParams.js +0 -1
- package/src/search/engineRegistry.impl.js +0 -117
- package/src/search/engineRegistry.js +0 -1
- package/src/search/engineRequest.impl.js +0 -377
- package/src/search/engineRequest.js +0 -1
- package/src/search/engineUtils.impl.js +0 -227
- package/src/search/engineUtils.js +0 -1
- package/src/search/engines/baidu.impl.js +0 -145
- package/src/search/engines/baidu.js +0 -2
- package/src/search/engines/bing.impl.js +0 -509
- package/src/search/engines/bing.js +0 -2
- package/src/search/engines/brave.impl.js +0 -223
- package/src/search/engines/brave.js +0 -2
- package/src/search/engines/duckduckgo.impl.js +0 -164
- package/src/search/engines/duckduckgo.js +0 -2
- package/src/search/engines/mojeek.impl.js +0 -115
- package/src/search/engines/mojeek.js +0 -2
- package/src/search/engines/qwant.impl.js +0 -188
- package/src/search/engines/qwant.js +0 -2
- package/src/search/engines/startpage.impl.js +0 -237
- package/src/search/engines/startpage.js +0 -2
- package/src/search/engines/toutiao.impl.js +0 -265
- package/src/search/engines/toutiao.js +0 -2
- package/src/search/engines/yahoo.impl.js +0 -379
- package/src/search/engines/yahoo.js +0 -2
- package/src/search/gateway.impl.js +0 -423
- package/src/search/gateway.js +0 -1
- package/src/search/ranking.impl.js +0 -381
- package/src/search/ranking.js +0 -1
- package/src/search/requestPolicy.impl.js +0 -137
- package/src/search/requestPolicy.js +0 -1
- package/src/search/upstreamSession.impl.js +0 -148
- package/src/search/upstreamSession.js +0 -1
- /package/{index.d.ts → dist/index.d.ts} +0 -0
|
@@ -1,237 +0,0 @@
|
|
|
1
|
-
import { ApiError } from "../../core/errors.js";
|
|
2
|
-
import {
|
|
3
|
-
fetchSearchText,
|
|
4
|
-
isChallengeResponse,
|
|
5
|
-
throwBlockedUpstreamError,
|
|
6
|
-
} from "../engineRequest.js";
|
|
7
|
-
import {
|
|
8
|
-
mapLanguage,
|
|
9
|
-
resolvePageNumber,
|
|
10
|
-
} from "../engineUtils.js";
|
|
11
|
-
import { cleanText, extractBalancedSegment } from "../../core/html.js";
|
|
12
|
-
import { normalizeResults } from "../ranking.js";
|
|
13
|
-
|
|
14
|
-
/**
 * Lookup table from request language codes to the language names used in the
 * Startpage form fields and preferences cookie (consumed through
 * `mapLanguage` in `searchStartpage`).
 */
const STARTPAGE_LANGUAGE = {
  en: "english",
  zh: "chinese_simplified",
  "zh-cn": "chinese_simplified",
  "zh-tw": "chinese_traditional",
};

// Patterns handed to `isChallengeResponse` to recognise a Startpage captcha page.
const STARTPAGE_CHALLENGE_PATTERNS = [
  /\/sp\/captcha\b/i,
  /name=["']captcha["']/i,
];
// Lifetime of a cached `sc` token: 15 minutes, expressed in milliseconds.
const STARTPAGE_SC_TTL_MS = 15 * 60 * 1000;
// Module-level cache for the `sc` token scraped from the Startpage home page.
// An empty `value` or an `expiresAt` in the past means no usable token is cached.
let cachedStartpageSc = {
  value: "",
  expiresAt: 0,
};
|
|
30
|
-
|
|
31
|
-
/**
 * Decide whether a response body looks like a Startpage anti-bot challenge.
 *
 * A body counts as a challenge when `isChallengeResponse` matches it against
 * STARTPAGE_CHALLENGE_PATTERNS, or when it contains a "verify you are human" /
 * "unusual traffic" phrase together with a `<form` tag.
 *
 * @param {*} source - Response body; falsy values are treated as "".
 * @returns {*} Truthy when the body looks like a challenge page.
 */
function isStartpageChallengeResponse(source) {
  const body = String(source || "");

  const patternVerdict = isChallengeResponse(body, STARTPAGE_CHALLENGE_PATTERNS);
  if (patternVerdict) {
    return patternVerdict;
  }

  const mentionsHumanCheck =
    /verify you are human/i.test(body) || /unusual traffic/i.test(body);

  // The phrases alone may appear in ordinary pages; require a form as well.
  return mentionsHumanCheck && /<form\b/i.test(body);
}
|
|
40
|
-
|
|
41
|
-
/**
 * Report a Startpage challenge by delegating to `throwBlockedUpstreamError`.
 *
 * @param {string} surface - Label of the surface that was blocked (this file
 *   passes "html").
 */
function throwStartpageChallengeError(surface) {
  const blockDetails = { engine: "Startpage", surface };
  throwBlockedUpstreamError(blockDetails);
}
|
|
47
|
-
|
|
48
|
-
/**
 * Pull the hidden `sc` form token out of Startpage HTML.
 *
 * Looks for an `<input>` whose `name` is "sc" and returns its `value`,
 * accepting either attribute order.
 *
 * @param {*} html - Page markup. Non-string input (including null/undefined)
 *   is coerced instead of throwing, so "no token" is always reported as "".
 * @returns {string} The trimmed token, or "" when none is found.
 */
function extractStartpageScToken(html) {
  const markup = typeof html === "string" ? html : String(html ?? "");

  const match =
    // name="sc" appears before value="..."
    markup.match(/<input\b[^>]*name=["']sc["'][^>]*value=["']([^"']+)["'][^>]*>/i) ||
    // value="..." appears before name="sc"
    markup.match(/<input\b[^>]*value=["']([^"']+)["'][^>]*name=["']sc["'][^>]*>/i);

  return match?.[1]?.trim() || "";
}
|
|
55
|
-
|
|
56
|
-
/**
 * Serialise the Startpage `preferences` cookie value.
 *
 * Each setting is written as `<key>EEE<value>`, settings are joined with
 * `N1N`, and the whole string is URI-component encoded.
 *
 * @param {string} languageValue - Startpage language name; when truthy it is
 *   written to `lang_homepage`, `language` and `lui`.
 * @returns {string} Encoded cookie value.
 */
function buildStartpagePreferences(languageValue) {
  // A Map keeps insertion order, which fixes the order of the serialised pairs.
  const settings = new Map([
    ["disable_family_filter", "1"],
    ["enable_post_method", "1"],
    ["instant_answers", "0"],
    ["num_of_results", "10"],
  ]);

  if (languageValue) {
    for (const key of ["lang_homepage", "language", "lui"]) {
      settings.set(key, languageValue);
    }
  }

  const serialized = Array.from(settings, ([key, value]) => `${key}EEE${value}`).join("N1N");
  return encodeURIComponent(serialized);
}
|
|
76
|
-
|
|
77
|
-
/**
 * Obtain the Startpage `sc` token, reusing the module-level cache while it is
 * still fresh and otherwise scraping it from the Startpage home page.
 *
 * This is best-effort: any failure while fetching or parsing the home page
 * yields "" instead of an exception, and the caller then omits the token.
 *
 * @param {object} options
 * @param {*} options.signal - Forwarded to `fetchSearchText`.
 * @param {*} options.language - Forwarded to `fetchSearchText`.
 * @param {*} options.runtimeContext - Forwarded to `fetchSearchText`.
 * @returns {Promise<string>} The token, or "" when none could be obtained.
 */
async function fetchStartpageScToken({ signal, language, runtimeContext }) {
  const { value: cachedToken, expiresAt } = cachedStartpageSc;
  if (cachedToken && expiresAt > Date.now()) {
    return cachedToken;
  }

  let freshToken;
  try {
    const homepage = await fetchSearchText("https://www.startpage.com/", {
      engine: "startpage",
      engineLabel: "Startpage",
      signal,
      language,
      referrer: "https://www.startpage.com/",
      runtimeContext,
      blockedStatuses: [403, 429],
      isBlocked: isStartpageChallengeResponse,
      blockedSurface: "home",
    });
    freshToken = extractStartpageScToken(homepage);
  } catch (_) {
    // Deliberately swallowed: the search request is still attempted without a token.
    return "";
  }

  // Only a non-empty token is worth caching.
  if (freshToken) {
    cachedStartpageSc = {
      value: freshToken,
      expiresAt: Date.now() + STARTPAGE_SC_TTL_MS,
    };
  }

  return freshToken;
}
|
|
108
|
-
|
|
109
|
-
/**
 * Drop the cached `sc` token so the next call to `fetchStartpageScToken`
 * fetches a fresh one.
 */
export function resetStartpageRequestState() {
  const emptyCache = { value: "", expiresAt: 0 };
  cachedStartpageSc = emptyCache;
}
|
|
115
|
-
|
|
116
|
-
/**
 * Locate and parse the JSON array of web results embedded in Startpage HTML.
 *
 * The earliest occurrence of a known `"display_type"` marker is found first;
 * the `"results":` key that follows it is then located, and the bracketed
 * segment starting at the next `[` is parsed as JSON.
 *
 * @param {*} html - Page markup; non-string input is coerced to a string.
 * @returns {*} The parsed value (an array when the payload is well formed), or
 *   null when the payload cannot be located or is not valid JSON. Returning
 *   null on malformed JSON lets the caller raise its own parse error instead
 *   of leaking a raw SyntaxError.
 */
function extractStartpageResultArray(html) {
  const text = typeof html === "string" ? html : String(html ?? "");

  const markerOffsets = [
    '"display_type":"web-google"',
    '"display_type":"web-results"',
    '"display_type":"web"',
  ]
    .map((marker) => text.indexOf(marker))
    .filter((offset) => offset >= 0);

  if (markerOffsets.length === 0) {
    return null;
  }

  // Start from whichever marker appears first in the document.
  const markerIndex = Math.min(...markerOffsets);

  const resultsIndex = text.indexOf('"results":', markerIndex);
  if (resultsIndex === -1) {
    return null;
  }

  const arrayStart = text.indexOf("[", resultsIndex);
  if (arrayStart === -1) {
    return null;
  }

  const segment = extractBalancedSegment(text, arrayStart);

  try {
    return JSON.parse(segment);
  } catch (error) {
    // A truncated or unbalanced payload is "not found", not a crash.
    if (error instanceof SyntaxError) {
      return null;
    }
    throw error;
  }
}
|
|
143
|
-
|
|
144
|
-
/**
 * Convert Startpage search HTML into normalised results.
 *
 * @param {*} html - Body of the Startpage search response.
 * @returns {*} Whatever `normalizeResults` returns for the extracted
 *   `{ title, url, description }` entries.
 * @throws {ApiError} 502 `UPSTREAM_PARSE_ERROR` when no result array is found.
 *   A detected challenge page is reported through `throwStartpageChallengeError`.
 */
export function parseStartpageResults(html) {
  if (isStartpageChallengeResponse(html)) {
    throwStartpageChallengeError("html");
  }

  const payload = extractStartpageResultArray(html);
  if (!Array.isArray(payload)) {
    throw new ApiError({
      status: 502,
      code: "UPSTREAM_PARSE_ERROR",
      category: "upstream",
      message: "Startpage parser could not find result payload",
    });
  }

  const candidates = [];
  for (const entry of payload) {
    // Entries without both a click URL and a title are not usable results.
    if (!(entry?.clickUrl && entry?.title)) {
      continue;
    }
    candidates.push({
      title: cleanText(entry.title),
      url: entry.clickUrl,
      description: cleanText(entry.description || ""),
    });
  }

  return normalizeResults(candidates);
}
|
|
170
|
-
|
|
171
|
-
/**
 * Run a Startpage web search and return the parsed results.
 *
 * @param {object} params
 * @param {string} params.query - Search terms.
 * @param {*} params.language - Language code, mapped through STARTPAGE_LANGUAGE.
 * @param {*} params.time_range - Must be falsy; time filtering is rejected.
 * @param {*} params.pageno - Page number, interpreted by `resolvePageNumber`.
 * @param {*} params.signal - Forwarded to the fetch helpers.
 * @param {*} params.runtimeContext - Forwarded to the fetch helpers.
 * @returns {Promise<*>} The result of `parseStartpageResults`.
 * @throws {ApiError} 400 `UNSUPPORTED_PARAMETER` when `time_range` is set.
 */
async function searchStartpage(params) {
  const { query, language, time_range, pageno, signal, runtimeContext } = params;

  if (time_range) {
    throw new ApiError({
      status: 400,
      code: "UNSUPPORTED_PARAMETER",
      category: "validation",
      message: "Startpage time_range filtering is not supported",
    });
  }

  const page = resolvePageNumber(pageno);
  const languageValue = mapLanguage(language, STARTPAGE_LANGUAGE, "");
  const sc = await fetchStartpageScToken({ signal, language, runtimeContext });

  // Optional fields are added in a fixed order so the form layout is stable.
  const form = {
    query,
    cat: "web",
    segment: "startpage.udog",
  };
  if (page > 0) {
    form.page = String(page + 1);
  }
  if (languageValue) {
    form.language = languageValue;
    form.lui = languageValue;
  }
  if (sc) {
    form.sc = sc;
  }

  const cookies = {
    preferences: buildStartpagePreferences(languageValue),
  };

  const html = await fetchSearchText("https://www.startpage.com/sp/search", {
    engine: "startpage",
    engineLabel: "Startpage",
    signal,
    language,
    method: "POST",
    form,
    cookies,
    referrer: "https://www.startpage.com/",
    origin: "https://www.startpage.com",
    runtimeContext,
    blockedStatuses: [403, 429],
    isBlocked: isStartpageChallengeResponse,
    blockedSurface: "html",
  });

  return parseStartpageResults(html);
}
|
|
218
|
-
|
|
219
|
-
/**
 * Adapter descriptor that exposes the Startpage engine.
 * NOTE(review): the meaning of `priority`, `tier` and `requestPolicy` is
 * defined by the consumer of this object, which is not visible here — confirm
 * against the engine registry.
 */
export const startpageAdapter = {
  name: "startpage",
  label: "Startpage",
  priority: 100,
  tier: "primary",
  requestPolicy: {
    retryAttempts: 0,
    minRequestIntervalMs: 200,
  },
  // Mirrors searchStartpage: language and pageno are used, time_range is rejected.
  supports: {
    language: true,
    time_range: false,
    pageno: true,
  },
  isAvailable: () => true,
  search: searchStartpage,
};

export default searchStartpage;
|
|
@@ -1,265 +0,0 @@
|
|
|
1
|
-
import { ApiError } from "../../core/errors.js";
|
|
2
|
-
import {
|
|
3
|
-
fetchSearchText,
|
|
4
|
-
isChallengeResponse,
|
|
5
|
-
throwBlockedUpstreamError,
|
|
6
|
-
} from "../engineRequest.js";
|
|
7
|
-
import { cleanText, parseHtml } from "../../core/html.js";
|
|
8
|
-
import { normalizeResults } from "../ranking.js";
|
|
9
|
-
|
|
10
|
-
// Patterns handed to `isChallengeResponse` to recognise a Toutiao verification page.
// 安全验证 translates to "security verification"; 验证 to "verification".
const TOUTIAO_CHALLENGE_PATTERNS = [
  /安全验证/i,
  /captcha.*验证/i,
];
|
|
14
|
-
|
|
15
|
-
/**
 * Decide whether a response body looks like a Toutiao anti-bot challenge.
 *
 * A body counts as a challenge when any of the following holds:
 * `isChallengeResponse` matches TOUTIAO_CHALLENGE_PATTERNS; the body is shorter
 * than 1000 characters and contains a `<form` tag; it contains
 * `"challenge_code": 1366`; or it contains `"template_key": "71-undefined"`.
 *
 * @param {*} source - Response body; falsy values are treated as "".
 * @returns {*} Truthy when the body looks like a challenge page.
 */
function isToutiaoChallengeResponse(source) {
  const body = String(source || "");

  const patternVerdict = isChallengeResponse(body, TOUTIAO_CHALLENGE_PATTERNS);
  if (patternVerdict) {
    return patternVerdict;
  }

  if (body.length < 1000 && /<form\b/i.test(body)) {
    return true;
  }

  if (/"challenge_code"\s*:\s*1366/.test(body)) {
    return true;
  }

  return /"template_key"\s*:\s*"71-undefined"/.test(body);
}
|
|
24
|
-
|
|
25
|
-
/**
 * Report a Toutiao challenge on the "html" surface by delegating to
 * `throwBlockedUpstreamError`.
 */
function throwToutiaoChallengeError() {
  const blockDetails = { engine: "Toutiao", surface: "html" };
  throwBlockedUpstreamError(blockDetails);
}
|
|
31
|
-
|
|
32
|
-
/**
 * Decode a Toutiao search-jump redirect URL.
 *
 * Links in the SSR HTML look like:
 *   https://sou.toutiao.com/search/jump?url=https%3A%2F%2Fexample.com%2Farticle
 * The embedded target is taken from the `url` query parameter; nested jump
 * links are unwrapped recursively.
 *
 * `URLSearchParams.get` already percent-decodes the value once. A second
 * decode is applied only when the value is not yet an absolute URL (i.e. the
 * link was double-encoded), so escapes that belong to the target itself
 * (`%20`, `%25`, ...) are preserved and a literal `%` cannot make decoding
 * fail.
 *
 * @param {string} href - Link taken from the result card.
 * @returns {string} The target URL, or `href` unchanged when it is not a
 *   parseable absolute URL or carries no `url` parameter.
 */
function decodeToutiaoJumpUrl(href) {
  let target;
  try {
    target = new URL(href).searchParams.get("url");
  } catch (_) {
    // Relative or malformed links cannot be jump URLs; hand them back as-is.
    return href;
  }

  if (!target) {
    return href;
  }

  const isAbsoluteUrl = /^[a-z][a-z0-9+.-]*:\/\//i.test(target);
  if (!isAbsoluteUrl) {
    try {
      target = decodeURIComponent(target);
    } catch (_) {
      // Not valid percent-encoding: keep the single-decoded value.
    }
  }

  if (target.includes("search/jump?url=")) {
    return decodeToutiaoJumpUrl(target);
  }

  return target;
}
|
|
52
|
-
|
|
53
|
-
/**
 * Pick a description string for one search result card.
 *
 * Expected Toutiao SSR layout of a card:
 *   div.result-content
 *     script/style               (hydration code, not description text)
 *     div
 *       div.cs-view.cs-view-block.cs-card
 *         div.cs-view.cs-view-block.cs-card-header
 *           a[href]              (title link)
 *         div.cs-view.cs-view-block.cs-card-content
 *           (description text, possibly inside nested divs)
 *
 * Three strategies are tried in order; the first hit wins and is capped at
 * 300 characters:
 *   1. a div inside `.cs-card-content` with 41-499 characters of clean text;
 *   2. the full `.cs-card-content` text with the first link's text removed;
 *   3. any div in the card with 31-499 characters of clean text.
 *
 * @param {*} card - Parsed node for the card (needs `querySelector` and
 *   `querySelectorAll`).
 * @returns {string} The description, or "" when nothing suitable is found.
 */
function extractCardDescription(card) {
  const readCleanText = (node) => cleanText(node.textContent || "").trim();
  // Markers of hydration/script residue that leaked into text content.
  const hasScriptResidue = (text) =>
    text.includes("druid") || text.includes("PerfTag") || text.includes("script");

  const contentNode = card.querySelector(".cs-card-content");
  if (contentNode) {
    for (const block of contentNode.querySelectorAll("div")) {
      const candidate = readCleanText(block);
      if (candidate.length > 40 && candidate.length < 500 && !hasScriptResidue(candidate)) {
        return candidate.slice(0, 300);
      }
    }

    // No single div qualified: use the whole content text minus the title.
    const wholeText = readCleanText(contentNode);
    const firstLink = card.querySelector("a[href]");
    const linkText = firstLink ? (firstLink.textContent || "").trim() : "";
    const withoutTitle = wholeText.replace(linkText, "").trim();
    if (withoutTitle.length > 15 && withoutTitle.length < 500 && !withoutTitle.includes("druid")) {
      return withoutTitle.slice(0, 300);
    }
  }

  // Last resort: scan every div in the card for description-like text.
  for (const block of card.querySelectorAll("div")) {
    const candidate = readCleanText(block);
    const lengthOk = candidate.length > 30 && candidate.length < 500;
    if (lengthOk && !hasScriptResidue(candidate) && !candidate.includes("换一换")) {
      return candidate.slice(0, 300);
    }
  }

  return "";
}
|
|
117
|
-
|
|
118
|
-
/**
 * Convert Toutiao search HTML into normalised results.
 *
 * Every `.result-content` card inside `.s-result-list` is inspected. Sidebar,
 * ad and related-search cards are skipped, as are cards without a recognisable
 * title link, cards with UI/navigation titles, and cards whose target URL is
 * internal or already seen.
 *
 * @param {*} html - Body of the Toutiao search response.
 * @returns {*} Whatever `normalizeResults` returns for the collected
 *   `{ title, url, description }` entries.
 * @throws {ApiError} 502 `UPSTREAM_PARSE_ERROR` when the result list container
 *   is missing or no organic result survives filtering. A detected challenge
 *   page is reported through `throwToutiaoChallengeError`.
 */
export function parseToutiaoResults(html) {
  if (isToutiaoChallengeResponse(html)) {
    throwToutiaoChallengeError();
  }

  // href fragments that identify a card's primary (title) link.
  const titleHrefMarkers = [
    "search/jump?url=",
    "/article/",
    "m.douyinhanyu.com",
    "baike.com",
    "m.toutiaoimg.cn",
    "cloud.tencent.com",
  ];
  // Titles containing / starting with these belong to UI or navigation elements.
  const uiTitleFragments = ["换一换", "首页", "登录", "去西瓜搜", "去抖音搜", "查看详情", "播放"];
  const uiTitlePrefixes = ["无障碍", "相关搜索"];

  const isTitleLink = (anchor) => {
    const anchorHref = anchor.getAttribute("href") || "";
    const label = (anchor.textContent || "").trim();
    const hrefMatches = titleHrefMarkers.some((marker) => anchorHref.includes(marker));
    // Labels that start like "12:34" are rejected (they look like durations/timestamps).
    return hrefMatches && label.length > 3 && !/^\d{1,2}:\d{2}/.test(label);
  };

  const isUiTitle = (text) =>
    uiTitleFragments.some((fragment) => text.includes(fragment)) ||
    uiTitlePrefixes.some((prefix) => text.startsWith(prefix));

  const root = parseHtml(html);
  const resultList = root.querySelector(".s-result-list");
  if (!resultList) {
    throw new ApiError({
      status: 502,
      code: "UPSTREAM_PARSE_ERROR",
      category: "upstream",
      message: "Toutiao parser could not find s-result-list container",
    });
  }

  const seenKeys = new Set();
  const collected = [];

  for (const card of resultList.querySelectorAll(".result-content")) {
    // Hot-trending sidebar.
    if (card.closest(".s-side-list")) continue;
    // Ad/promotion cards carry data-test-card-id="67-toutiao_web"; organic ids
    // such as "67-homepage" or "26-aft_ciyu_detail" are kept.
    if (card.querySelector("[data-test-card-id='67-toutiao_web']")) continue;
    // Related-search suggestions carry ids starting with "20-".
    if (card.querySelector("[data-test-card-id^='20-']")) continue;

    const titleLink = card.querySelectorAll("a[href]").find(isTitleLink);
    if (!titleLink) continue;

    const title = cleanText(titleLink.textContent || titleLink.innerHTML || "").trim();
    const href = titleLink.getAttribute("href") || "";

    if (!title || title.length < 3 || title.length > 150) continue;
    if (isUiTitle(title)) continue;

    const targetUrl = decodeToutiaoJumpUrl(href);
    if (!targetUrl || targetUrl.startsWith("#") || targetUrl.startsWith("/")) continue;

    // Trending items and internal search navigation are not results.
    if (targetUrl.includes("/trending")) continue;
    if (targetUrl.includes("so.toutiao.com/search") && !targetUrl.includes("toutiao.com/a")) continue;

    // Deduplicate on the lower-cased serialised URL, or on the raw string when
    // the target cannot be parsed as a URL.
    let dedupeKey;
    try {
      dedupeKey = new URL(targetUrl).toString().toLowerCase();
    } catch (_) {
      dedupeKey = targetUrl;
    }
    if (seenKeys.has(dedupeKey)) continue;
    seenKeys.add(dedupeKey);

    collected.push({
      title,
      url: targetUrl,
      description: extractCardDescription(card),
    });
  }

  if (collected.length === 0) {
    throw new ApiError({
      status: 502,
      code: "UPSTREAM_PARSE_ERROR",
      category: "upstream",
      message: "Toutiao parser could not find organic results",
    });
  }

  return normalizeResults(collected);
}
|
|
225
|
-
|
|
226
|
-
/**
 * Run a Toutiao web search and return the parsed results.
 *
 * @param {object} params
 * @param {string} params.query - Search terms, sent as the `keyword` parameter.
 * @param {*} params.signal - Forwarded to `fetchSearchText`.
 * @param {*} params.runtimeContext - Forwarded to `fetchSearchText`.
 * @returns {Promise<*>} The result of `parseToutiaoResults`.
 */
async function searchToutiao(params) {
  const { query, signal, runtimeContext } = params;

  const endpoint = new URL("https://so.toutiao.com/search");
  const queryPairs = [
    ["keyword", query],
    ["dvpf", "pc"],
    ["source", "input"],
  ];
  for (const [name, value] of queryPairs) {
    endpoint.searchParams.set(name, value);
  }

  const responseHtml = await fetchSearchText(endpoint.toString(), {
    engine: "toutiao",
    engineLabel: "Toutiao",
    signal,
    referrer: "https://so.toutiao.com/",
    runtimeContext,
    blockedStatuses: [403, 429],
    isBlocked: isToutiaoChallengeResponse,
    blockedSurface: "html",
  });

  return parseToutiaoResults(responseHtml);
}
|
|
246
|
-
|
|
247
|
-
/**
 * Adapter descriptor that exposes the Toutiao engine.
 * NOTE(review): the meaning of `priority`, `tier` and `requestPolicy` is
 * defined by the consumer of this object, which is not visible here — confirm
 * against the engine registry.
 */
export const toutiaoAdapter = {
  name: "toutiao",
  label: "Toutiao",
  priority: 65,
  tier: "experimental",
  requestPolicy: {
    retryAttempts: 0,
    minRequestIntervalMs: 500,
  },
  // NOTE(review): `language` is declared as supported, but searchToutiao never
  // reads `params.language` — confirm whether this flag is intentional.
  supports: {
    language: true,
    time_range: false,
    pageno: false,
  },
  isAvailable: () => true,
  search: searchToutiao,
};

export default searchToutiao;
|