@endday/search-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/LICENSE +674 -0
  2. package/README.md +117 -0
  3. package/README.zh.md +116 -0
  4. package/data/blocklist.generated.js +2 -0
  5. package/envs.js +129 -0
  6. package/index.d.ts +191 -0
  7. package/index.js +6 -0
  8. package/mcp/search-mcp.js +8 -0
  9. package/package.json +71 -0
  10. package/src/content/extract.impl.js +228 -0
  11. package/src/content/extract.js +1 -0
  12. package/src/content/fetch.impl.js +400 -0
  13. package/src/content/fetch.js +1 -0
  14. package/src/core/crypto.js +7 -0
  15. package/src/core/errors.impl.js +52 -0
  16. package/src/core/errors.js +1 -0
  17. package/src/core/html.impl.js +69 -0
  18. package/src/core/html.js +1 -0
  19. package/src/mcp/config.js +75 -0
  20. package/src/mcp/format.js +44 -0
  21. package/src/mcp/index.js +10 -0
  22. package/src/mcp/local/content.js +26 -0
  23. package/src/mcp/local/search.js +233 -0
  24. package/src/mcp/schemas.js +132 -0
  25. package/src/mcp/server.js +97 -0
  26. package/src/mcp/tools/content.js +31 -0
  27. package/src/mcp/tools/jinaContent.js +38 -0
  28. package/src/mcp/tools/newsSearch.js +22 -0
  29. package/src/mcp/tools/webSearch.js +57 -0
  30. package/src/platform/auth.impl.js +166 -0
  31. package/src/platform/auth.js +1 -0
  32. package/src/platform/cache.impl.js +166 -0
  33. package/src/platform/cache.js +1 -0
  34. package/src/platform/health.impl.js +133 -0
  35. package/src/platform/health.js +1 -0
  36. package/src/platform/http.impl.js +108 -0
  37. package/src/platform/http.js +1 -0
  38. package/src/platform/logger.impl.js +51 -0
  39. package/src/platform/logger.js +1 -0
  40. package/src/platform/metrics.impl.js +43 -0
  41. package/src/platform/metrics.js +1 -0
  42. package/src/platform/nodeHttpClient.js +104 -0
  43. package/src/platform/rateLimit.impl.js +141 -0
  44. package/src/platform/rateLimit.js +1 -0
  45. package/src/platform/requestContext.impl.js +10 -0
  46. package/src/platform/requestContext.js +1 -0
  47. package/src/platform/session.impl.js +198 -0
  48. package/src/platform/session.js +1 -0
  49. package/src/platform/stateKv.impl.js +18 -0
  50. package/src/platform/stateKv.js +1 -0
  51. package/src/platform/tasks.impl.js +17 -0
  52. package/src/platform/tasks.js +1 -0
  53. package/src/routes/requestParams.impl.js +12 -0
  54. package/src/routes/requestParams.js +1 -0
  55. package/src/search/engineRegistry.impl.js +117 -0
  56. package/src/search/engineRegistry.js +1 -0
  57. package/src/search/engineRequest.impl.js +377 -0
  58. package/src/search/engineRequest.js +1 -0
  59. package/src/search/engineUtils.impl.js +227 -0
  60. package/src/search/engineUtils.js +1 -0
  61. package/src/search/engines/baidu.impl.js +145 -0
  62. package/src/search/engines/baidu.js +2 -0
  63. package/src/search/engines/bing.impl.js +509 -0
  64. package/src/search/engines/bing.js +2 -0
  65. package/src/search/engines/brave.impl.js +223 -0
  66. package/src/search/engines/brave.js +2 -0
  67. package/src/search/engines/duckduckgo.impl.js +164 -0
  68. package/src/search/engines/duckduckgo.js +2 -0
  69. package/src/search/engines/mojeek.impl.js +115 -0
  70. package/src/search/engines/mojeek.js +2 -0
  71. package/src/search/engines/qwant.impl.js +188 -0
  72. package/src/search/engines/qwant.js +2 -0
  73. package/src/search/engines/startpage.impl.js +237 -0
  74. package/src/search/engines/startpage.js +2 -0
  75. package/src/search/engines/toutiao.impl.js +265 -0
  76. package/src/search/engines/toutiao.js +2 -0
  77. package/src/search/engines/yahoo.impl.js +379 -0
  78. package/src/search/engines/yahoo.js +2 -0
  79. package/src/search/gateway.impl.js +423 -0
  80. package/src/search/gateway.js +1 -0
  81. package/src/search/ranking.impl.js +381 -0
  82. package/src/search/ranking.js +1 -0
  83. package/src/search/requestPolicy.impl.js +137 -0
  84. package/src/search/requestPolicy.js +1 -0
  85. package/src/search/upstreamSession.impl.js +148 -0
  86. package/src/search/upstreamSession.js +1 -0
@@ -0,0 +1,237 @@
1
+ import { ApiError } from "../../core/errors.js";
2
+ import {
3
+ fetchSearchText,
4
+ isChallengeResponse,
5
+ throwBlockedUpstreamError,
6
+ } from "../engineRequest.js";
7
+ import {
8
+ mapLanguage,
9
+ resolvePageNumber,
10
+ } from "../engineUtils.js";
11
+ import { cleanText, extractBalancedSegment } from "../../core/html.js";
12
+ import { normalizeResults } from "../ranking.js";
13
+
14
/**
 * Lower-case language codes mapped to the language names Startpage uses.
 * Passed to `mapLanguage` by `searchStartpage`.
 */
const STARTPAGE_LANGUAGE = {
  en: "english",
  zh: "chinese_simplified",
  "zh-cn": "chinese_simplified",
  "zh-tw": "chinese_traditional",
};

/** Markup fragments that identify a Startpage captcha page. */
const STARTPAGE_CHALLENGE_PATTERNS = [
  /\/sp\/captcha\b/i,
  /name=["']captcha["']/i,
];

/** Lifetime of a cached `sc` token: 15 minutes, in milliseconds. */
const STARTPAGE_SC_TTL_MS = 15 * 60 * 1000;

/**
 * Module-level cache of the last `sc` token scraped from the homepage.
 * `expiresAt` is an epoch timestamp in milliseconds; 0 means "nothing cached".
 */
let cachedStartpageSc = {
  value: "",
  expiresAt: 0,
};
30
+
31
/**
 * Decide whether a Startpage response body looks like an anti-bot challenge.
 *
 * The body is first checked by the shared `isChallengeResponse` helper using
 * the Startpage-specific patterns. If that does not match, the body is treated
 * as a challenge when it contains human-verification wording together with a
 * `<form>` element.
 *
 * @param {*} source - Response body; falsy values are treated as "".
 * @returns {*} Truthy when the body looks like a challenge page.
 */
function isStartpageChallengeResponse(source) {
  const body = String(source || "");

  const sharedVerdict = isChallengeResponse(body, STARTPAGE_CHALLENGE_PATTERNS);
  if (sharedVerdict) {
    return sharedVerdict;
  }

  const hasVerificationWording =
    /verify you are human/i.test(body) || /unusual traffic/i.test(body);

  // Wording alone is not enough: a challenge page also carries a form.
  return hasVerificationWording && /<form\b/i.test(body);
}
40
+
41
/**
 * Report a blocked Startpage request through `throwBlockedUpstreamError`
 * (which, going by its name, is expected to throw).
 *
 * @param {string} surface - Which Startpage surface was blocked; callers in
 *   this module pass "html".
 */
function throwStartpageChallengeError(surface) {
  const blockedDetails = { engine: "Startpage", surface };
  throwBlockedUpstreamError(blockedDetails);
}
47
+
48
/**
 * Pull the value of the `<input name="sc">` field out of Startpage HTML.
 *
 * Both attribute orders are accepted (`name` before `value`, and `value`
 * before `name`). Non-string input is coerced to a string instead of
 * throwing, so a missing body simply yields no token.
 *
 * @param {*} html - Page markup; `null`/`undefined` are treated as "".
 * @returns {string} The trimmed token, or "" when no `sc` input is present.
 */
function extractStartpageScToken(html) {
  const markup = typeof html === "string" ? html : String(html ?? "");

  const match =
    markup.match(/<input\b[^>]*name=["']sc["'][^>]*value=["']([^"']+)["'][^>]*>/i) ||
    markup.match(/<input\b[^>]*value=["']([^"']+)["'][^>]*name=["']sc["'][^>]*>/i);

  return match?.[1]?.trim() || "";
}
55
+
56
/**
 * Build the value of Startpage's `preferences` cookie.
 *
 * Each setting is written as `<key>EEE<value>`, settings are joined with
 * `N1N`, and the whole string is URI-component-encoded.
 *
 * @param {string} languageValue - Startpage language name; when falsy the
 *   three language settings are left out.
 * @returns {string} The encoded cookie value.
 */
function buildStartpagePreferences(languageValue) {
  const fixedSettings = [
    ["disable_family_filter", "1"],
    ["enable_post_method", "1"],
    ["instant_answers", "0"],
    ["num_of_results", "10"],
  ];

  // The same language value is written to all three language-related keys.
  const languageSettings = languageValue
    ? ["lang_homepage", "language", "lui"].map((key) => [key, languageValue])
    : [];

  const serialized = [...fixedSettings, ...languageSettings]
    .map(([key, value]) => `${key}EEE${value}`)
    .join("N1N");

  return encodeURIComponent(serialized);
}
76
+
77
/**
 * Return an `sc` token for Startpage, fetching the homepage when the cached
 * token is missing or expired.
 *
 * A freshly scraped, non-empty token is cached for `STARTPAGE_SC_TTL_MS`.
 * Any failure while fetching or parsing the homepage is swallowed and
 * reported as "", so this function never rejects.
 *
 * @param {object} options
 * @param {*} options.signal - Forwarded to `fetchSearchText`.
 * @param {*} options.language - Forwarded to `fetchSearchText`.
 * @param {*} options.runtimeContext - Forwarded to `fetchSearchText`.
 * @returns {Promise<string>} The token, or "" when none could be obtained.
 */
async function fetchStartpageScToken({ signal, language, runtimeContext }) {
  const { value: cachedToken, expiresAt } = cachedStartpageSc;
  if (cachedToken && expiresAt > Date.now()) {
    return cachedToken;
  }

  const homepageUrl = "https://www.startpage.com/";
  const requestOptions = {
    engine: "startpage",
    engineLabel: "Startpage",
    signal,
    language,
    referrer: "https://www.startpage.com/",
    runtimeContext,
    blockedStatuses: [403, 429],
    isBlocked: isStartpageChallengeResponse,
    blockedSurface: "home",
  };

  try {
    const homepageHtml = await fetchSearchText(homepageUrl, requestOptions);
    const freshToken = extractStartpageScToken(homepageHtml);

    // Only a non-empty token is worth caching; an empty one is retried next call.
    if (freshToken) {
      cachedStartpageSc = {
        value: freshToken,
        expiresAt: Date.now() + STARTPAGE_SC_TTL_MS,
      };
    }

    return freshToken;
  } catch {
    // Best effort: report "no token" rather than propagating the failure.
    return "";
  }
}
108
+
109
/**
 * Clear the module-level `sc` token cache so the next call to
 * `fetchStartpageScToken` fetches the homepage again.
 */
export function resetStartpageRequestState() {
  cachedStartpageSc = { value: "", expiresAt: 0 };
}
115
+
116
/**
 * Find and parse the embedded JSON array of web results in Startpage HTML.
 *
 * Lookup order: the earliest `"display_type"` marker for a web result set,
 * then the first `"results":` key after it, then the first `[` after that.
 * The bracketed segment is cut out with `extractBalancedSegment` and parsed
 * as JSON.
 *
 * @param {*} html - Page markup.
 * @returns {*} The parsed value (an array for a well-formed page), or `null`
 *   when the input is not a string, a marker is missing, or the segment is
 *   not valid JSON. Returning `null` for malformed JSON lets the caller raise
 *   its own parse error rather than leaking a raw SyntaxError.
 */
function extractStartpageResultArray(html) {
  if (typeof html !== "string") {
    return null;
  }

  const markerPositions = [
    '"display_type":"web-google"',
    '"display_type":"web-results"',
    '"display_type":"web"',
  ]
    .map((marker) => html.indexOf(marker))
    .filter((index) => index >= 0);

  if (markerPositions.length === 0) {
    return null;
  }

  // Whichever marker appears first in the document anchors the search.
  const markerIndex = Math.min(...markerPositions);

  const resultsIndex = html.indexOf('"results":', markerIndex);
  if (resultsIndex === -1) {
    return null;
  }

  const arrayStart = html.indexOf("[", resultsIndex);
  if (arrayStart === -1) {
    return null;
  }

  const segment = extractBalancedSegment(html, arrayStart);

  try {
    return JSON.parse(segment);
  } catch (error) {
    if (error instanceof SyntaxError) {
      return null;
    }
    throw error;
  }
}
143
+
144
/**
 * Turn a Startpage results page into normalized search results.
 *
 * @param {*} html - Body of the Startpage search response.
 * @returns {*} Whatever `normalizeResults` returns for the mapped entries.
 * @throws {ApiError} 502 `UPSTREAM_PARSE_ERROR` when no result array can be
 *   located. A challenge page is reported via `throwStartpageChallengeError`.
 */
export function parseStartpageResults(html) {
  if (isStartpageChallengeResponse(html)) {
    throwStartpageChallengeError("html");
  }

  const payload = extractStartpageResultArray(html);

  if (!Array.isArray(payload)) {
    throw new ApiError({
      status: 502,
      code: "UPSTREAM_PARSE_ERROR",
      category: "upstream",
      message: "Startpage parser could not find result payload",
    });
  }

  // Entries lacking a click URL or a title are dropped before mapping.
  const mapped = payload.flatMap((entry) =>
    entry?.clickUrl && entry?.title
      ? [
          {
            title: cleanText(entry.title),
            url: entry.clickUrl,
            description: cleanText(entry.description || ""),
          },
        ]
      : []
  );

  return normalizeResults(mapped);
}
170
+
171
/**
 * Run a web search against Startpage and return parsed results.
 *
 * Flow: reject `time_range`, resolve the page and language, obtain an `sc`
 * token (may be ""), POST the search form, then parse the returned HTML.
 *
 * @param {object} params
 * @param {string} params.query - Search terms.
 * @param {*} params.language - Requested language; mapped via `STARTPAGE_LANGUAGE`.
 * @param {*} params.time_range - Must be falsy; Startpage filtering is unsupported.
 * @param {*} params.pageno - Requested page, interpreted by `resolvePageNumber`.
 * @param {*} params.signal - Forwarded to the HTTP helpers.
 * @param {*} params.runtimeContext - Forwarded to the HTTP helpers.
 * @returns {Promise<*>} The result of `parseStartpageResults`.
 * @throws {ApiError} 400 `UNSUPPORTED_PARAMETER` when `time_range` is set.
 */
async function searchStartpage(params) {
  const { query, language, time_range, pageno, signal, runtimeContext } = params;

  if (time_range) {
    throw new ApiError({
      status: 400,
      code: "UNSUPPORTED_PARAMETER",
      category: "validation",
      message: "Startpage time_range filtering is not supported",
    });
  }

  const page = resolvePageNumber(pageno);
  const languageValue = mapLanguage(language, STARTPAGE_LANGUAGE, "");
  const sc = await fetchStartpageScToken({ signal, language, runtimeContext });

  // Optional fields are appended in a fixed order: page, language/lui, sc.
  const form = {
    query,
    cat: "web",
    segment: "startpage.udog",
  };
  if (page > 0) {
    // NOTE(review): sends page + 1, which presumes resolvePageNumber yields a
    // zero-based index - confirm against that helper.
    form.page = String(page + 1);
  }
  if (languageValue) {
    form.language = languageValue;
    form.lui = languageValue;
  }
  if (sc) {
    form.sc = sc;
  }

  const requestOptions = {
    engine: "startpage",
    engineLabel: "Startpage",
    signal,
    language,
    method: "POST",
    form,
    cookies: {
      preferences: buildStartpagePreferences(languageValue),
    },
    referrer: "https://www.startpage.com/",
    origin: "https://www.startpage.com",
    runtimeContext,
    blockedStatuses: [403, 429],
    isBlocked: isStartpageChallengeResponse,
    blockedSurface: "html",
  };

  const html = await fetchSearchText(
    "https://www.startpage.com/sp/search",
    requestOptions
  );

  return parseStartpageResults(html);
}
218
+
219
/**
 * Adapter descriptor for the Startpage engine.
 *
 * Declares the engine's identity, priority/tier, request policy (no retries,
 * at least 200 ms between requests) and capabilities. `time_range` is marked
 * unsupported, matching the validation error raised by `searchStartpage`.
 */
export const startpageAdapter = {
  name: "startpage",
  label: "Startpage",
  priority: 100,
  tier: "primary",
  requestPolicy: {
    retryAttempts: 0,
    minRequestIntervalMs: 200,
  },
  supports: {
    language: true,
    time_range: false,
    pageno: true,
  },
  // Always reports itself as usable.
  isAvailable: () => true,
  search: searchStartpage,
};

export default searchStartpage;
@@ -0,0 +1,2 @@
1
+ export * from "./startpage.impl.js";
2
+ export { default } from "./startpage.impl.js";
@@ -0,0 +1,265 @@
1
+ import { ApiError } from "../../core/errors.js";
2
+ import {
3
+ fetchSearchText,
4
+ isChallengeResponse,
5
+ throwBlockedUpstreamError,
6
+ } from "../engineRequest.js";
7
+ import { cleanText, parseHtml } from "../../core/html.js";
8
+ import { normalizeResults } from "../ranking.js";
9
+
10
/**
 * Text patterns that identify a Toutiao verification page: the phrase
 * "安全验证" ("security verification"), or "captcha" followed later by "验证".
 */
const TOUTIAO_CHALLENGE_PATTERNS = [
  /安全验证/i,
  /captcha.*验证/i,
];
14
+
15
/**
 * Decide whether a Toutiao response body is a verification/challenge page.
 *
 * Checks are tried in order:
 *  1. the shared `isChallengeResponse` helper with the Toutiao patterns;
 *  2. a body under 1000 characters that contains a `<form>`;
 *  3. an embedded `"challenge_code": 1366`;
 *  4. an embedded `"template_key": "71-undefined"`.
 *
 * @param {*} source - Response body; falsy values are treated as "".
 * @returns {*} Truthy when any check matches.
 */
function isToutiaoChallengeResponse(source) {
  const body = String(source || "");

  const sharedVerdict = isChallengeResponse(body, TOUTIAO_CHALLENGE_PATTERNS);
  if (sharedVerdict) {
    return sharedVerdict;
  }

  const isTinyFormPage = body.length < 1000 && /<form\b/i.test(body);
  if (isTinyFormPage) {
    return true;
  }

  if (/"challenge_code"\s*:\s*1366/.test(body)) {
    return true;
  }

  return /"template_key"\s*:\s*"71-undefined"/.test(body);
}
24
+
25
/**
 * Report a blocked Toutiao HTML request through `throwBlockedUpstreamError`
 * (which, going by its name, is expected to throw).
 */
function throwToutiaoChallengeError() {
  const blockedDetails = { engine: "Toutiao", surface: "html" };
  throwBlockedUpstreamError(blockedDetails);
}
31
+
32
/**
 * Resolve a Toutiao search-jump redirect link to the URL it points at.
 *
 * Links in the SSR HTML look like:
 *   https://so.toutiao.com/search/jump?url=https%3A%2F%2Fexample.com%2Farticle
 *
 * `URLSearchParams.get` already percent-decodes the `url` parameter once, so
 * the value is used as-is when it is an absolute URL. Decoding it a second
 * time would corrupt escapes that belong to the target itself (for example
 * `%25` or `%2F`). A second decode is applied only when the value still lacks
 * a `scheme://` prefix, which covers double-encoded links.
 *
 * @param {string} href - The anchor's `href`.
 * @returns {string} The embedded target URL (nested jump links are followed),
 *   or `href` unchanged when it cannot be parsed as an absolute URL, has no
 *   `url` parameter, or decoding fails.
 */
function decodeToutiaoJumpUrl(href) {
  try {
    const target = new URL(href).searchParams.get("url");
    if (!target) {
      return href;
    }

    const isAbsolute = /^[a-z][a-z0-9+.-]*:\/\//i.test(target);
    const resolved = isAbsolute ? target : decodeURIComponent(target);

    // A jump link may wrap another jump link; unwrap until the real target.
    if (resolved.includes("search/jump?url=")) {
      return decodeToutiaoJumpUrl(resolved);
    }

    return resolved;
  } catch (_) {
    // Relative or malformed links are returned untouched.
    return href;
  }
}
52
+
53
/**
 * Pick a description snippet out of a Toutiao result card.
 *
 * Card layout as documented by the original author:
 *   div.result-content
 *     script/style                       (hydrate/render code, ignored)
 *     div
 *       div.cs-view.cs-view-block.cs-card
 *         div.cs-view.cs-view-block.cs-card-header
 *           a[href]                      (title link)
 *         div.cs-view.cs-view-block.cs-card-content
 *           (description text, possibly inside nested divs)
 *
 * Strategy, first hit wins:
 *  1. a div inside `.cs-card-content` whose text is 41-499 characters and
 *     free of script-like markers;
 *  2. the full `.cs-card-content` text with the first link's text removed,
 *     when 16-499 characters remain;
 *  3. any div in the card whose text is 31-499 characters and free of
 *     script-like or UI markers.
 *
 * @param {*} card - Parsed element for one result card.
 * @returns {string} Up to 300 characters of description, or "".
 */
function extractCardDescription(card) {
  const readText = (node) => cleanText(node.textContent || "").trim();
  const containsAny = (text, markers) =>
    markers.some((marker) => text.includes(marker));
  const clip = (text) => text.slice(0, 300);

  const contentNode = card.querySelector(".cs-card-content");

  if (contentNode) {
    for (const block of contentNode.querySelectorAll("div")) {
      const candidate = readText(block);
      const lengthOk = candidate.length > 40 && candidate.length < 500;
      if (lengthOk && !containsAny(candidate, ["druid", "PerfTag", "script"])) {
        return clip(candidate);
      }
    }

    // No suitable inner div: fall back to the whole content text minus the title.
    const contentText = readText(contentNode);
    const firstLink = card.querySelector("a[href]");
    const linkText = firstLink ? (firstLink.textContent || "").trim() : "";
    const remainder = contentText.replace(linkText, "").trim();
    const remainderOk =
      remainder.length > 15 && remainder.length < 500 && !remainder.includes("druid");
    if (remainderOk) {
      return clip(remainder);
    }
  }

  // Last resort: scan every div in the card for description-like text.
  for (const block of card.querySelectorAll("div")) {
    const candidate = readText(block);
    const lengthOk = candidate.length > 30 && candidate.length < 500;
    if (lengthOk && !containsAny(candidate, ["druid", "PerfTag", "script", "换一换"])) {
      return clip(candidate);
    }
  }

  return "";
}
117
+
118
/**
 * Tell whether an anchor inside a result card is the card's title link:
 * its href points at a jump/article/known-partner URL, and its text is longer
 * than 3 characters and does not start like a clock time (e.g. "12:34").
 */
function isToutiaoTitleLink(anchor) {
  const href = anchor.getAttribute("href") || "";
  const text = (anchor.textContent || "").trim();
  const hrefLooksLikeResult = [
    "search/jump?url=",
    "/article/",
    "m.douyinhanyu.com",
    "baike.com",
    "m.toutiaoimg.cn",
    "cloud.tencent.com",
  ].some((fragment) => href.includes(fragment));

  return hrefLooksLikeResult && text.length > 3 && !/^\d{1,2}:\d{2}/.test(text);
}

/** Tell whether a title is UI/navigation chrome rather than a search result. */
function isToutiaoUiTitle(title) {
  const containsUiWord = [
    "换一换",
    "首页",
    "登录",
    "去西瓜搜",
    "去抖音搜",
    "查看详情",
    "播放",
  ].some((word) => title.includes(word));

  return containsUiWord || title.startsWith("无障碍") || title.startsWith("相关搜索");
}

/**
 * Parse a Toutiao search results page into normalized results.
 *
 * Each `.result-content` element inside `.s-result-list` is treated as one
 * result card. Sidebar cards, ad cards, related-search cards, UI titles,
 * trending links, internal search links and duplicate targets are skipped.
 *
 * @param {*} html - Body of the Toutiao search response.
 * @returns {*} Whatever `normalizeResults` returns for the collected results.
 * @throws {ApiError} 502 `UPSTREAM_PARSE_ERROR` when the result list container
 *   is missing or no organic result survives filtering. A challenge page is
 *   reported via `throwToutiaoChallengeError`.
 */
export function parseToutiaoResults(html) {
  if (isToutiaoChallengeResponse(html)) {
    throwToutiaoChallengeError();
  }

  const root = parseHtml(html);
  const resultList = root.querySelector(".s-result-list");

  if (!resultList) {
    throw new ApiError({
      status: 502,
      code: "UPSTREAM_PARSE_ERROR",
      category: "upstream",
      message: "Toutiao parser could not find s-result-list container",
    });
  }

  const seen = new Set();
  const results = [];

  for (const card of resultList.querySelectorAll(".result-content")) {
    // Exclude the hot trending sidebar entirely.
    if (card.closest(".s-side-list")) continue;

    // Per the original author's notes: "67-toutiao_web" marks ad/promotion
    // cards, while "67-homepage" and "26-aft_ciyu_detail" are organic and kept.
    if (card.querySelector("[data-test-card-id='67-toutiao_web']")) continue;

    // Related-search suggestion cards carry ids such as "20-undefined".
    if (card.querySelector("[data-test-card-id^='20-']")) continue;

    // Array.from keeps this working whether querySelectorAll returns a real
    // array or only an iterable/array-like collection without `.find`.
    const titleLink = Array.from(card.querySelectorAll("a[href]")).find(
      isToutiaoTitleLink
    );
    if (!titleLink) continue;

    const title = cleanText(titleLink.textContent || titleLink.innerHTML || "").trim();
    if (!title || title.length < 3 || title.length > 150) continue;
    if (isToutiaoUiTitle(title)) continue;

    const targetUrl = decodeToutiaoJumpUrl(titleLink.getAttribute("href") || "");
    if (!targetUrl || targetUrl.startsWith("#") || targetUrl.startsWith("/")) continue;

    // Skip trending items and internal search navigation.
    if (targetUrl.includes("/trending")) continue;
    if (targetUrl.includes("so.toutiao.com/search") && !targetUrl.includes("toutiao.com/a")) {
      continue;
    }

    // Deduplicate on the lower-cased serialized URL; fall back to the raw
    // string when the target is not a parseable absolute URL.
    let dedupeKey = targetUrl;
    try {
      dedupeKey = new URL(targetUrl).toString().toLowerCase();
    } catch (_) {
      // Keep the raw target as the key.
    }
    if (seen.has(dedupeKey)) continue;
    seen.add(dedupeKey);

    results.push({
      title,
      url: targetUrl,
      description: extractCardDescription(card),
    });
  }

  if (results.length === 0) {
    throw new ApiError({
      status: 502,
      code: "UPSTREAM_PARSE_ERROR",
      category: "upstream",
      message: "Toutiao parser could not find organic results",
    });
  }

  return normalizeResults(results);
}
225
+
226
/**
 * Run a web search against Toutiao and return parsed results.
 *
 * Only `query`, `signal` and `runtimeContext` are read from `params`.
 *
 * @param {object} params
 * @param {string} params.query - Search terms, sent as the `keyword` parameter.
 * @param {*} params.signal - Forwarded to `fetchSearchText`.
 * @param {*} params.runtimeContext - Forwarded to `fetchSearchText`.
 * @returns {Promise<*>} The result of `parseToutiaoResults`.
 */
async function searchToutiao(params) {
  const { query, signal, runtimeContext } = params;

  const searchUrl = new URL("https://so.toutiao.com/search");
  const queryPairs = [
    ["keyword", query],
    ["dvpf", "pc"],
    ["source", "input"],
  ];
  queryPairs.forEach(([name, value]) => {
    searchUrl.searchParams.set(name, value);
  });

  const requestOptions = {
    engine: "toutiao",
    engineLabel: "Toutiao",
    signal,
    referrer: "https://so.toutiao.com/",
    runtimeContext,
    blockedStatuses: [403, 429],
    isBlocked: isToutiaoChallengeResponse,
    blockedSurface: "html",
  };

  const html = await fetchSearchText(searchUrl.toString(), requestOptions);

  return parseToutiaoResults(html);
}
246
+
247
/**
 * Adapter descriptor for the Toutiao engine.
 *
 * Declares the engine's identity, priority/tier, request policy (no retries,
 * at least 500 ms between requests) and capabilities.
 *
 * NOTE(review): `supports.language` is true, yet `searchToutiao` does not read
 * a `language` parameter - confirm whether this flag is intentional.
 */
export const toutiaoAdapter = {
  name: "toutiao",
  label: "Toutiao",
  priority: 65,
  tier: "experimental",
  requestPolicy: {
    retryAttempts: 0,
    minRequestIntervalMs: 500,
  },
  supports: {
    language: true,
    time_range: false,
    pageno: false,
  },
  // Always reports itself as usable.
  isAvailable: () => true,
  search: searchToutiao,
};

export default searchToutiao;
@@ -0,0 +1,2 @@
1
+ export * from "./toutiao.impl.js";
2
+ export { default } from "./toutiao.impl.js";