@endday/search-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86) hide show
  1. package/LICENSE +674 -0
  2. package/README.md +117 -0
  3. package/README.zh.md +116 -0
  4. package/data/blocklist.generated.js +2 -0
  5. package/envs.js +129 -0
  6. package/index.d.ts +191 -0
  7. package/index.js +6 -0
  8. package/mcp/search-mcp.js +8 -0
  9. package/package.json +71 -0
  10. package/src/content/extract.impl.js +228 -0
  11. package/src/content/extract.js +1 -0
  12. package/src/content/fetch.impl.js +400 -0
  13. package/src/content/fetch.js +1 -0
  14. package/src/core/crypto.js +7 -0
  15. package/src/core/errors.impl.js +52 -0
  16. package/src/core/errors.js +1 -0
  17. package/src/core/html.impl.js +69 -0
  18. package/src/core/html.js +1 -0
  19. package/src/mcp/config.js +75 -0
  20. package/src/mcp/format.js +44 -0
  21. package/src/mcp/index.js +10 -0
  22. package/src/mcp/local/content.js +26 -0
  23. package/src/mcp/local/search.js +233 -0
  24. package/src/mcp/schemas.js +132 -0
  25. package/src/mcp/server.js +97 -0
  26. package/src/mcp/tools/content.js +31 -0
  27. package/src/mcp/tools/jinaContent.js +38 -0
  28. package/src/mcp/tools/newsSearch.js +22 -0
  29. package/src/mcp/tools/webSearch.js +57 -0
  30. package/src/platform/auth.impl.js +166 -0
  31. package/src/platform/auth.js +1 -0
  32. package/src/platform/cache.impl.js +166 -0
  33. package/src/platform/cache.js +1 -0
  34. package/src/platform/health.impl.js +133 -0
  35. package/src/platform/health.js +1 -0
  36. package/src/platform/http.impl.js +108 -0
  37. package/src/platform/http.js +1 -0
  38. package/src/platform/logger.impl.js +51 -0
  39. package/src/platform/logger.js +1 -0
  40. package/src/platform/metrics.impl.js +43 -0
  41. package/src/platform/metrics.js +1 -0
  42. package/src/platform/nodeHttpClient.js +104 -0
  43. package/src/platform/rateLimit.impl.js +141 -0
  44. package/src/platform/rateLimit.js +1 -0
  45. package/src/platform/requestContext.impl.js +10 -0
  46. package/src/platform/requestContext.js +1 -0
  47. package/src/platform/session.impl.js +198 -0
  48. package/src/platform/session.js +1 -0
  49. package/src/platform/stateKv.impl.js +18 -0
  50. package/src/platform/stateKv.js +1 -0
  51. package/src/platform/tasks.impl.js +17 -0
  52. package/src/platform/tasks.js +1 -0
  53. package/src/routes/requestParams.impl.js +12 -0
  54. package/src/routes/requestParams.js +1 -0
  55. package/src/search/engineRegistry.impl.js +117 -0
  56. package/src/search/engineRegistry.js +1 -0
  57. package/src/search/engineRequest.impl.js +377 -0
  58. package/src/search/engineRequest.js +1 -0
  59. package/src/search/engineUtils.impl.js +227 -0
  60. package/src/search/engineUtils.js +1 -0
  61. package/src/search/engines/baidu.impl.js +145 -0
  62. package/src/search/engines/baidu.js +2 -0
  63. package/src/search/engines/bing.impl.js +509 -0
  64. package/src/search/engines/bing.js +2 -0
  65. package/src/search/engines/brave.impl.js +223 -0
  66. package/src/search/engines/brave.js +2 -0
  67. package/src/search/engines/duckduckgo.impl.js +164 -0
  68. package/src/search/engines/duckduckgo.js +2 -0
  69. package/src/search/engines/mojeek.impl.js +115 -0
  70. package/src/search/engines/mojeek.js +2 -0
  71. package/src/search/engines/qwant.impl.js +188 -0
  72. package/src/search/engines/qwant.js +2 -0
  73. package/src/search/engines/startpage.impl.js +237 -0
  74. package/src/search/engines/startpage.js +2 -0
  75. package/src/search/engines/toutiao.impl.js +265 -0
  76. package/src/search/engines/toutiao.js +2 -0
  77. package/src/search/engines/yahoo.impl.js +379 -0
  78. package/src/search/engines/yahoo.js +2 -0
  79. package/src/search/gateway.impl.js +423 -0
  80. package/src/search/gateway.js +1 -0
  81. package/src/search/ranking.impl.js +381 -0
  82. package/src/search/ranking.js +1 -0
  83. package/src/search/requestPolicy.impl.js +137 -0
  84. package/src/search/requestPolicy.js +1 -0
  85. package/src/search/upstreamSession.impl.js +148 -0
  86. package/src/search/upstreamSession.js +1 -0
package/envs.js ADDED
@@ -0,0 +1,129 @@
1
/**
 * Builds a brand-new object holding the built-in configuration defaults.
 *
 * Each call creates new array instances, so the result can be mutated freely
 * without leaking into later calls. Scalar settings are kept as strings and
 * unset optional settings are null.
 *
 * @returns {object} Default configuration keyed by setting name.
 */
const createDefaultEnv = () => {
  // DEFAULT_ENGINES and DEFAULT_ENGINES_NON_ZH share the same values but must
  // remain distinct arrays, hence the spread copies below.
  const generalEngines = ["bing", "brave", "yahoo", "mojeek"];

  const defaults = {
    DEFAULT_TIMEOUT: "4000",
    SUPPORTED_ENGINES: [
      "baidu",
      "startpage",
      "duckduckgo",
      "brave",
      "qwant",
      "yahoo",
      "mojeek",
      "bing",
      "toutiao",
    ],
    DEFAULT_ENGINES: [...generalEngines],
    DEFAULT_ENGINES_ZH: ["baidu", "bing"],
    DEFAULT_ENGINES_NON_ZH: [...generalEngines],
    DEFAULT_LANGUAGE: "en",

    // Fallback and tiering.
    FALLBACK_MIN_RESULTS: "6",
    FALLBACK_MIN_CONTRIBUTING_ENGINES: "2",
    SEARCH_PRIMARY_TIERS: ["primary"],
    SEARCH_SECONDARY_TIERS: ["secondary"],
    SEARCH_EXPERIMENTAL_TIERS: ["experimental"],
    SEARCH_TIER_HEDGE_DELAY_MS: "250",

    // Caching and rate limiting.
    EDGE_CACHE_TTL_SECONDS: "30",
    CACHE_TTL_SECONDS: "300",
    STALE_CACHE_TTL_SECONDS: "1800",
    RATE_LIMIT_WINDOW_SECONDS: "60",
    RATE_LIMIT_MAX_REQUESTS: "60",

    // Upstream retry and pacing.
    UPSTREAM_RETRY_ATTEMPTS: "1",
    UPSTREAM_RETRY_DELAY_MS: "200",
    UPSTREAM_PRIMARY_RETRY_ATTEMPTS: "1",
    UPSTREAM_SECONDARY_RETRY_ATTEMPTS: "0",
    UPSTREAM_EXPERIMENTAL_RETRY_ATTEMPTS: "0",
    UPSTREAM_SESSION_TTL_SECONDS: "3600",
    UPSTREAM_MIN_REQUEST_INTERVAL_MS: "150",
    UPSTREAM_PRIMARY_MIN_REQUEST_INTERVAL_MS: "100",
    UPSTREAM_SECONDARY_MIN_REQUEST_INTERVAL_MS: "250",
    UPSTREAM_EXPERIMENTAL_MIN_REQUEST_INTERVAL_MS: "500",

    // Engine health tracking.
    HEALTH_FAILURE_THRESHOLD: "2",
    HEALTH_COOLDOWN_SECONDS: "180",
    HEALTH_STATE_TTL_SECONDS: "3600",

    // CORS and authentication.
    CORS_ALLOWED_ORIGINS: ["*"],
    CORS_ALLOWED_HEADERS: ["Authorization", "Content-Type", "x-api-key"],
    AUTH_REQUIRED: "false",
    TOKEN: null,

    // Optional integrations and storage bindings; null means "not configured".
    CF_BROWSER_RENDERING_ACCOUNT_ID: null,
    CF_BROWSER_RENDERING_API_TOKEN: null,
    SEARCH_KV: null,
    SEARCH_STATE_KV: null,
  };

  return defaults;
};
51
+
52
/**
 * Coerces a loosely-typed setting into an array of trimmed, non-empty strings.
 *
 * Accepted shapes:
 *  - an array: every element is stringified and trimmed, blanks are dropped;
 *  - a string starting with "[": parsed as JSON, then handled as an array;
 *  - any other non-blank string: split on commas;
 *  - anything else (or a blank / unparsable string): a copy of `fallback`.
 *
 * @param {unknown} value - Raw setting value.
 * @param {string[]} fallback - Values to copy when `value` is unusable.
 * @returns {string[]} A new array; `fallback` itself is never returned.
 */
function normalizeStringArray(value, fallback) {
  const copyOfFallback = () => [...fallback];

  if (Array.isArray(value)) {
    return value.reduce((kept, entry) => {
      const piece = String(entry).trim();
      if (piece) {
        kept.push(piece);
      }
      return kept;
    }, []);
  }

  if (typeof value !== "string") {
    return copyOfFallback();
  }

  const text = value.trim();
  if (text === "") {
    return copyOfFallback();
  }

  if (text.startsWith("[")) {
    let decoded;
    try {
      decoded = JSON.parse(text);
    } catch (_) {
      // Malformed JSON is treated the same as "no value supplied".
      return copyOfFallback();
    }
    return normalizeStringArray(decoded, fallback);
  }

  const pieces = [];
  for (const rawPiece of text.split(",")) {
    const piece = rawPiece.trim();
    if (piece) {
      pieces.push(piece);
    }
  }
  return pieces;
}
79
+
80
/**
 * Restores `target` to the built-in defaults in place.
 *
 * All own enumerable keys are removed first so that settings absent from the
 * defaults do not survive the reset; the object identity is preserved.
 *
 * @param {object} target - Object to wipe and repopulate.
 * @returns {void}
 */
function resetEnv(target) {
  for (const name of Object.keys(target)) {
    delete target[name];
  }

  Object.assign(target, createDefaultEnv());
}
87
+
88
/** Shared, mutable configuration object; starts out as the built-in defaults. */
export const env = createDefaultEnv();

/**
 * Replaces the shared configuration with the defaults overlaid by `newEnv`.
 *
 * The existing `env` object is reset and mutated in place, so modules that
 * already imported it observe the new values. List-valued settings are then
 * normalized to string arrays, falling back to the default list when the
 * supplied value is unusable.
 *
 * @param {object} [newEnv={}] - Overrides to apply on top of the defaults.
 * @returns {void}
 */
export const setEnv = (newEnv = {}) => {
  resetEnv(env);
  Object.assign(env, newEnv);

  // normalizeStringArray copies its fallback, so one defaults object can
  // safely serve every key.
  const defaults = createDefaultEnv();
  const listSettings = [
    "SUPPORTED_ENGINES",
    "DEFAULT_ENGINES",
    "DEFAULT_ENGINES_ZH",
    "DEFAULT_ENGINES_NON_ZH",
    "CORS_ALLOWED_ORIGINS",
    "CORS_ALLOWED_HEADERS",
    "SEARCH_PRIMARY_TIERS",
    "SEARCH_SECONDARY_TIERS",
    "SEARCH_EXPERIMENTAL_TIERS",
  ];

  for (const name of listSettings) {
    env[name] = normalizeStringArray(env[name], defaults[name]);
  }
};
package/index.d.ts ADDED
@@ -0,0 +1,191 @@
1
+ import type { Server } from "@modelcontextprotocol/sdk/server/index.js";
2
+
3
+ export type SourceType =
4
+ | "official"
5
+ | "model_repo"
6
+ | "code_repo"
7
+ | "paper"
8
+ | "benchmark"
9
+ | "analysis"
10
+ | "media"
11
+ | "blog"
12
+ | "community"
13
+ | "low_credibility"
14
+ | "disinformation"
15
+ | "document"
16
+ | "unknown";
17
+
18
+ export type TimeRange = "day" | "week" | "month" | "year";
19
+
20
+ export type ResultItem = {
21
+ title: string;
22
+ description: string;
23
+ url: string;
24
+ source_name?: string;
25
+ published_text?: string;
26
+ source_type?: SourceType;
27
+ authority_score?: number;
28
+ };
29
+
30
+ export type SkippedEngineReason =
31
+ | "unsupported_engine"
32
+ | "unsupported_vertical"
33
+ | "unavailable_engine"
34
+ | "unsupported_time_range"
35
+ | "unsupported_pageno";
36
+
37
+ export type SkippedEngine = {
38
+ engine: string;
39
+ reason: SkippedEngineReason;
40
+ };
41
+
42
+ export type LocationSource =
43
+ | "auto"
44
+ | "explicit"
45
+ | "disabled"
46
+ | "unavailable";
47
+
48
+ export type SourceFilters = {
49
+ include_source_types: string[];
50
+ exclude_source_types: string[];
51
+ min_authority_score: number | null;
52
+ active: boolean;
53
+ };
54
+
55
+ export type SearchLocalOptions = {
56
+ vertical?: "web" | "news";
57
+ language?: string;
58
+ search_lang?: string;
59
+ ui_lang?: string;
60
+ location?: string;
61
+ time_range?: TimeRange;
62
+ pageno?: number;
63
+ count?: number;
64
+ offset?: number;
65
+ min_authority_score?: number | string | null;
66
+ include_source_types?: string[];
67
+ exclude_source_types?: string[];
68
+ clientId?: string;
69
+ };
70
+
71
+ export type SearchAllParams = {
72
+ vertical?: "web" | "news";
73
+ query: string;
74
+ engines: string[];
75
+ language?: string;
76
+ time_range?: TimeRange;
77
+ pageno?: number;
78
+ clientId?: string;
79
+ runtimeContext?: unknown;
80
+ };
81
+
82
+ export type SearchResponse = {
83
+ vertical?: "web" | "news";
84
+ query: string;
85
+ effective_query?: string;
86
+ location?: string | null;
87
+ location_source?: LocationSource;
88
+ location_context?: {
89
+ value: string;
90
+ source: LocationSource;
91
+ mode: string;
92
+ client: {
93
+ city: string;
94
+ region: string;
95
+ country: string;
96
+ timezone: string;
97
+ };
98
+ };
99
+ number_of_results: number;
100
+ count?: number | null;
101
+ offset?: number;
102
+ enabled_engines: string[];
103
+ skipped_engines: SkippedEngine[];
104
+ unresponsive_engines: string[];
105
+ source_filters?: SourceFilters;
106
+ results: Array<ResultItem & { engine: string }>;
107
+ };
108
+
109
+ export type SearchMetaResponse = {
110
+ response: SearchResponse;
111
+ meta: {
112
+ cache_status: "hit" | "miss" | "revalidated" | "stale-if-error";
113
+ cache_layer?: string;
114
+ fallback_order: string[];
115
+ fallback_path: string[];
116
+ strategy?: string;
117
+ engine_timings: Array<{
118
+ engine: string;
119
+ duration_ms: number;
120
+ status: string;
121
+ result_count: number;
122
+ tier?: string;
123
+ }>;
124
+ };
125
+ };
126
+
127
+ export interface Env {
128
+ DEFAULT_TIMEOUT?: string;
129
+ SUPPORTED_ENGINES?: string[];
130
+ DEFAULT_ENGINES?: string[];
131
+ DEFAULT_ENGINES_ZH?: string[];
132
+ DEFAULT_ENGINES_NON_ZH?: string[];
133
+ DEFAULT_LANGUAGE?: string;
134
+ FALLBACK_MIN_RESULTS?: string;
135
+ FALLBACK_MIN_CONTRIBUTING_ENGINES?: string;
136
+ SEARCH_PRIMARY_TIERS?: string[];
137
+ SEARCH_SECONDARY_TIERS?: string[];
138
+ SEARCH_EXPERIMENTAL_TIERS?: string[];
139
+ SEARCH_TIER_HEDGE_DELAY_MS?: string;
140
+ EDGE_CACHE_TTL_SECONDS?: string;
141
+ CACHE_TTL_SECONDS?: string;
142
+ STALE_CACHE_TTL_SECONDS?: string;
143
+ RATE_LIMIT_WINDOW_SECONDS?: string;
144
+ RATE_LIMIT_MAX_REQUESTS?: string;
145
+ UPSTREAM_RETRY_ATTEMPTS?: string;
146
+ UPSTREAM_RETRY_DELAY_MS?: string;
147
+ UPSTREAM_PRIMARY_RETRY_ATTEMPTS?: string;
148
+ UPSTREAM_SECONDARY_RETRY_ATTEMPTS?: string;
149
+ UPSTREAM_EXPERIMENTAL_RETRY_ATTEMPTS?: string;
150
+ UPSTREAM_SESSION_TTL_SECONDS?: string;
151
+ UPSTREAM_MIN_REQUEST_INTERVAL_MS?: string;
152
+ UPSTREAM_PRIMARY_MIN_REQUEST_INTERVAL_MS?: string;
153
+ UPSTREAM_SECONDARY_MIN_REQUEST_INTERVAL_MS?: string;
154
+ UPSTREAM_EXPERIMENTAL_MIN_REQUEST_INTERVAL_MS?: string;
155
+ HEALTH_FAILURE_THRESHOLD?: string;
156
+ HEALTH_COOLDOWN_SECONDS?: string;
157
+ HEALTH_STATE_TTL_SECONDS?: string;
158
+ CORS_ALLOWED_ORIGINS?: string[] | string;
159
+ CORS_ALLOWED_HEADERS?: string[] | string;
160
+ AUTH_REQUIRED?: string;
161
+ TOKEN?: string | null;
162
+ CF_BROWSER_RENDERING_ACCOUNT_ID?: string | null;
163
+ CF_BROWSER_RENDERING_API_TOKEN?: string | null;
164
+ SEARCH_KV?: unknown;
165
+ SEARCH_STATE_KV?: unknown;
166
+ }
167
+
168
+ export type McpConfig = {
169
+ mode: "local";
170
+ jinaApiKey: string;
171
+ jinaBaseUrl: string;
172
+ upstreamClient: string;
173
+ localClientId: string;
174
+ allEngines: string[];
175
+ };
176
+
177
+ export const env: Env;
178
+ export function setEnv(newEnv?: Partial<Env>): void;
179
+ export function loadMcpConfig(): McpConfig;
180
+ export function main(): Promise<void>;
181
+ export function createServer(config: McpConfig): Server;
182
+ export function startServer(server: Server): Promise<void>;
183
+ export function searchLocal(
184
+ query: string,
185
+ engines?: string[] | null,
186
+ options?: SearchLocalOptions
187
+ ): Promise<SearchResponse>;
188
+ export function searchAll(params: SearchAllParams): Promise<SearchResponse>;
189
+ export function searchAllWithMeta(
190
+ params: SearchAllParams
191
+ ): Promise<SearchMetaResponse>;
package/index.js ADDED
@@ -0,0 +1,6 @@
1
+ export { env, setEnv } from "./envs.js";
2
+ export { loadMcpConfig } from "./src/mcp/config.js";
3
+ export { main } from "./src/mcp/index.js";
4
+ export { searchLocal } from "./src/mcp/local/search.js";
5
+ export { createServer, startServer } from "./src/mcp/server.js";
6
+ export { searchAll, searchAllWithMeta } from "./src/search/gateway.js";
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { main } from "../src/mcp/index.js";
4
+
5
+ main().catch((error) => {
6
+ console.error("Fatal error:", error);
7
+ process.exit(1);
8
+ });
package/package.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "name": "@endday/search-mcp",
3
+ "version": "1.0.0",
4
+ "description": "Local MCP server for aggregated web search and content extraction",
5
+ "type": "module",
6
+ "main": "./index.js",
7
+ "types": "./index.d.ts",
8
+ "publishConfig": {
9
+ "access": "public"
10
+ },
11
+ "bin": {
12
+ "search-mcp": "mcp/search-mcp.js"
13
+ },
14
+ "exports": {
15
+ ".": {
16
+ "types": "./index.d.ts",
17
+ "import": "./index.js"
18
+ },
19
+ "./cli": "./mcp/search-mcp.js"
20
+ },
21
+ "files": [
22
+ "data/blocklist.generated.js",
23
+ "index.js",
24
+ "mcp/search-mcp.js",
25
+ "src/**/*.js",
26
+ "envs.js",
27
+ "index.d.ts"
28
+ ],
29
+ "scripts": {
30
+ "start": "node mcp/search-mcp.js",
31
+ "test": "node --test test",
32
+ "smoke": "node scripts/mcp-local-smoke.js",
33
+ "update:blocklist": "node scripts/update-blocklist.js",
34
+ "docs:dev": "npm --prefix docs-site run docs:dev",
35
+ "docs:build": "npm --prefix docs-site run docs:build",
36
+ "docs:preview": "npm --prefix docs-site run docs:preview"
37
+ },
38
+ "keywords": [
39
+ "mcp",
40
+ "search",
41
+ "web-search",
42
+ "self-hosted",
43
+ "ai-agent",
44
+ "openclaw",
45
+ "privacy",
46
+ "aggregated-search",
47
+ "startpage",
48
+ "mojeek",
49
+ "brave",
50
+ "duckduckgo",
51
+ "bing",
52
+ "qwant",
53
+ "yahoo"
54
+ ],
55
+ "author": "endday",
56
+ "license": "GPL-3.0",
57
+ "repository": {
58
+ "type": "git",
59
+ "url": "git+https://github.com/endday/search-mcp.git"
60
+ },
61
+ "dependencies": {
62
+ "@modelcontextprotocol/sdk": "^1.0.4",
63
+ "@mozilla/readability": "^0.6.0",
64
+ "impit": "^0.14.2",
65
+ "linkedom": "^0.18.12",
66
+ "node-html-parser": "^7.1.0"
67
+ },
68
+ "engines": {
69
+ "node": ">=20.0.0"
70
+ }
71
+ }
@@ -0,0 +1,228 @@
1
+ import { Readability } from "@mozilla/readability";
2
+ import { DOMParser } from "linkedom/worker";
3
+
4
+ import { cleanText, parseHtml } from "../core/html.js";
5
+
6
// Elements removed from the tree before heuristic scoring.
const NOISE_SELECTOR = [
  "script",
  "style",
  "noscript",
  "nav",
  "footer",
  "header",
  "aside",
  "form",
  "iframe",
  "svg",
  "canvas",
  "button",
  "input",
  "select",
  "textarea",
].join(", ");

// Elements considered as possible main-content containers.
const CANDIDATE_SELECTOR = [
  "article",
  "main",
  "section",
  "div",
  "[role=main]",
  ".article",
  ".post",
  ".content",
  ".entry-content",
  "#content",
].join(", ");

// Tested against a node's "id class" string; a match raises its score.
const POSITIVE_RE =
  /article|body|content|entry|hentry|main|page|post|story|text|正文|内容|文章/i;

// Tested against a node's "id class" string; a match lowers its score.
// The alternatives are unanchored substrings, so short ones such as "ad" and
// "tag" also match inside longer names (e.g. "header", "shadow").
const NEGATIVE_RE =
  /ad|banner|comment|combx|contact|footer|header|menu|meta|nav|promo|related|remark|rss|share|sidebar|social|tag|tool|widget|广告|评论|导航|分享|推荐|相关阅读/i;
14
+
15
/**
 * Reads a node's `text` property and passes it through `cleanText`.
 *
 * @param {{ text?: string } | null | undefined} node - Parsed node, may be absent.
 * @returns {string} Result of `cleanText`; an empty string is cleaned when the
 *   node or its text is missing.
 */
function getNodeText(node) {
  const rawText = node?.text;
  return cleanText(rawText || "");
}
18
+
19
/**
 * Looks up the first element matching `selector` and returns its cleaned
 * `content` attribute.
 *
 * @param {object} root - Parsed document exposing `querySelector`.
 * @param {string} selector - CSS selector for the desired meta element.
 * @returns {string} Result of `cleanText` on the attribute value, or on an
 *   empty string when the element or attribute is missing.
 */
function getMeta(root, selector) {
  const element = root.querySelector(selector);
  const content = element?.getAttribute("content");
  return cleanText(content || "");
}
22
+
23
/**
 * Builds the "id class" string that the scoring regexes are tested against.
 *
 * Nodes without a `getAttribute` method, or without either attribute,
 * contribute empty strings, so the result always contains the separating space.
 *
 * @param {object} node - Parsed node.
 * @returns {string} The id and class attribute values joined by one space.
 */
function getNodeSignal(node) {
  const attributeOrEmpty = (attributeName) =>
    node.getAttribute?.(attributeName) || "";

  return [attributeOrEmpty("id"), attributeOrEmpty("class")].join(" ");
}
26
+
27
/**
 * Scores a node as a possible main-content container.
 *
 * Nodes with fewer than 80 characters of cleaned text get a score of 0.
 * Otherwise the score grows with text length, the number of paragraphs of at
 * least 20 characters, and the amount of sentence punctuation; it is adjusted
 * by the id/class hints and finally scaled down by link density.
 *
 * @param {object} node - Parsed node exposing `querySelectorAll`.
 * @returns {{ node: object, score: number, textLength: number,
 *   linkDensity: number, paragraphCount: number }} Score and its inputs.
 */
function scoreCandidate(node) {
  const text = getNodeText(node);
  const textLength = text.length;

  if (textLength < 80) {
    return {
      node,
      score: 0,
      textLength,
      linkDensity: 1,
      paragraphCount: 0,
    };
  }

  const linkTextLength = node
    .querySelectorAll("a")
    .reduce((total, link) => total + getNodeText(link).length, 0);
  const paragraphCount = node
    .querySelectorAll("p")
    .filter((paragraph) => getNodeText(paragraph).length >= 20).length;

  // Count ASCII and fullwidth (CJK) sentence punctuation. The fullwidth marks
  // are written as escapes so they cannot be silently folded into their ASCII
  // twins by Unicode normalization of the source text:
  // U+FF0C comma, U+3002 ideographic full stop, U+FF01 exclamation mark,
  // U+FF1F question mark, U+FF1B semicolon.
  const punctuationCount = (
    text.match(/[,.!?;\uFF0C\u3002\uFF01\uFF1F\uFF1B]/g) || []
  ).length;

  const signal = getNodeSignal(node);
  // textLength is at least 80 here, so the division is always safe.
  const linkDensity = linkTextLength / textLength;

  let score = textLength + paragraphCount * 120 + punctuationCount * 12;

  if (POSITIVE_RE.test(signal)) {
    score += 350;
  }

  if (NEGATIVE_RE.test(signal)) {
    score -= 500;
  }

  // Link-heavy blocks are penalized, but never scaled below 5%.
  score *= Math.max(0.05, 1 - linkDensity);

  return {
    node,
    score,
    textLength,
    linkDensity,
    paragraphCount,
  };
}
70
+
71
/**
 * Strips noise from a parsed tree in place.
 *
 * Removes every element matching NOISE_SELECTOR, then drops the `href`
 * attribute from anchors whose target starts with `javascript:`.
 *
 * @param {object} root - Parsed tree exposing `querySelectorAll`.
 * @returns {void}
 */
function cleanTree(root) {
  for (const noisyNode of root.querySelectorAll(NOISE_SELECTOR)) {
    noisyNode.remove();
  }

  for (const anchor of root.querySelectorAll("a")) {
    const target = String(anchor.getAttribute("href") || "").trim();
    const isScriptLink = /^\s*javascript:/i.test(target);

    if (isScriptLink) {
      anchor.removeAttribute("href");
    }
  }
}
80
+
81
/**
 * Removes the most common script-bearing attributes from an HTML string.
 *
 * Stripped, in order:
 *  1. inline event handlers (`on*=`), quoted or unquoted;
 *  2. inline `style` attributes, quoted or unquoted;
 *  3. quoted `href` values that start with `javascript:`;
 *  4. unquoted `href` values that start with `javascript:`.
 *
 * This is a regex-based best-effort filter, not a full HTML sanitizer: it does
 * not decode entities or handle attributes that are not preceded by whitespace.
 *
 * @param {unknown} html - HTML markup; falsy values are treated as "".
 * @returns {string} The markup with the attributes above removed.
 */
function stripUnsafeHtml(html) {
  return String(html || "")
    .replace(/\s+on[a-z]+\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, "")
    .replace(/\s+style\s*=\s*("[^"]*"|'[^']*'|[^\s>]+)/gi, "")
    .replace(/\s+href\s*=\s*(['"])\s*javascript:[\s\S]*?\1/gi, "")
    // Unquoted values end at the first whitespace or ">", so they need their
    // own pass; the quoted pattern above never matches them.
    .replace(/\s+href\s*=\s*javascript:[^\s>]*/gi, "");
}
87
+
88
/**
 * Scores every element matching CANDIDATE_SELECTOR and returns the best one.
 *
 * When no candidate scores above zero, the `<body>` element (or the root
 * itself if there is no body) is scored and returned instead.
 *
 * @param {object} root - Parsed tree exposing `querySelector(All)`.
 * @returns {object} The score record produced by `scoreCandidate`.
 */
function pickBestCandidate(root) {
  let winner;

  // A strict ">" keeps the earliest candidate on ties, which is what a stable
  // descending sort would put first.
  for (const element of root.querySelectorAll(CANDIDATE_SELECTOR)) {
    const scored = scoreCandidate(element);
    if (winner === undefined || scored.score > winner.score) {
      winner = scored;
    }
  }

  if (winner?.score > 0) {
    return winner;
  }

  const fallbackNode = root.querySelector("body") || root;
  return scoreCandidate(fallbackNode);
}
102
+
103
/**
 * Collects page-level metadata from an HTML document.
 *
 * For each field the listed meta tags are tried in order and the first
 * non-empty value wins; the title additionally falls back to `<title>`.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {{ title: string, description: string, site_name: string,
 *   author: string, published_time: string, image: string, lang: string }}
 */
function getPageMetadata(html) {
  const root = parseHtml(html);

  // Returns the first truthy meta value, or the last value tried otherwise.
  const firstMeta = (...selectors) => {
    let found = "";
    for (const selector of selectors) {
      found = getMeta(root, selector);
      if (found) {
        break;
      }
    }
    return found;
  };

  const title =
    firstMeta('meta[property="og:title"]', 'meta[name="twitter:title"]') ||
    cleanText(root.querySelector("title")?.text || "");
  const description = firstMeta(
    'meta[property="og:description"]',
    'meta[name="twitter:description"]',
    'meta[name="description"]'
  );
  const siteName = getMeta(root, 'meta[property="og:site_name"]');
  const author = getMeta(root, 'meta[name="author"]');
  const publishedTime = firstMeta(
    'meta[property="article:published_time"]',
    'meta[name="date"]',
    'meta[name="pubdate"]'
  );
  const image = firstMeta(
    'meta[property="og:image"]',
    'meta[name="twitter:image"]'
  );
  const lang = root.querySelector("html")?.getAttribute("lang") || "";

  return {
    title,
    description,
    site_name: siteName,
    author,
    published_time: publishedTime,
    image,
    lang,
  };
}
127
+
128
/**
 * Runs Mozilla Readability over an HTML document.
 *
 * When a URL is supplied and the document has a `<head>`, a `<base href>`
 * element pointing at that URL is appended to the head before parsing.
 *
 * @param {string} html - Raw HTML markup.
 * @param {string} [url] - Page URL used for the injected `<base>` element.
 * @returns {object | null} The Readability article, or null when parsing
 *   produced no content.
 */
function extractWithReadability(html, url) {
  const doc = new DOMParser().parseFromString(html, "text/html");

  if (url && doc.head) {
    const baseElement = doc.createElement("base");
    baseElement.setAttribute("href", url);
    doc.head.appendChild(baseElement);
  }

  const reader = new Readability(doc, { keepClasses: true });
  const parsed = reader.parse();

  return parsed?.content ? parsed : null;
}
147
+
148
/**
 * Converts a Readability article into the extractor's result shape.
 *
 * @param {object | null | undefined} article - Readability output; may be
 *   missing when Readability found no content.
 * @param {object} metadata - Page metadata used to fill gaps in the article.
 * @param {string} url - Page URL, used when the article carries none.
 * @returns {object | null} The normalized result, or null when there is no
 *   article or its cleaned text is shorter than 80 characters.
 */
function normalizeReadabilityArticle(article, metadata, url) {
  // Readability may yield nothing; report "no result" instead of throwing so
  // callers can fall back without relying on a caught TypeError.
  if (!article) {
    return null;
  }

  const text = cleanText(article.textContent || article.content);
  const contentHtml = stripUnsafeHtml(article.content);

  if (text.length < 80) {
    return null;
  }

  // Article fields take precedence over page-level metadata.
  const title = cleanText(article.title || metadata.title || "");
  const description = cleanText(
    article.excerpt || metadata.description || ""
  );

  return {
    url: article.url || url,
    source: "direct-fetch",
    extractor: "readability",
    title,
    description,
    metadata: {
      ...metadata,
      title,
      description,
      site_name: cleanText(article.siteName || metadata.site_name || ""),
      author: cleanText(article.byline || metadata.author || ""),
      lang: article.lang || metadata.lang || "",
    },
    html: contentHtml,
    text,
    excerpt: text.slice(0, 500),
    stats: {
      text_length: text.length,
      html_length: contentHtml.length,
      // Score and link density only exist for the heuristic extractor.
      score: null,
      link_density: null,
      paragraph_count: (contentHtml.match(/<p\b/gi) || []).length,
    },
  };
}
182
+
183
/**
 * Extracts the main content of a page using the local scoring heuristic.
 *
 * Metadata is read from the untouched markup; the tree used for scoring is
 * cleaned of noise elements first.
 *
 * @param {string} html - Raw HTML markup.
 * @param {string} url - Page URL, echoed back in the result.
 * @returns {object} Extraction result tagged with `extractor: "heuristic"`.
 */
function extractPageContentWithHeuristics(html, url) {
  const tree = parseHtml(html);
  const metadata = getPageMetadata(html);

  cleanTree(tree);

  const best = pickBestCandidate(tree);
  const bestNode = best.node;
  const text = getNodeText(bestNode);
  const contentHtml = stripUnsafeHtml(bestNode?.toString() || "");

  const stats = {
    text_length: text.length,
    html_length: contentHtml.length,
    score: Math.round(best.score),
    link_density: Number(best.linkDensity.toFixed(3)),
    paragraph_count: best.paragraphCount,
  };

  return {
    url,
    source: "direct-fetch",
    extractor: "heuristic",
    title: metadata.title,
    description: metadata.description,
    metadata,
    html: contentHtml,
    text,
    excerpt: text.slice(0, 500),
    stats,
  };
}
212
+
213
/**
 * Extracts the readable content of a page.
 *
 * Readability is tried first; if it throws or yields no usable article, the
 * local heuristic extractor is used instead.
 *
 * @param {string} html - Raw HTML markup.
 * @param {string} url - Page URL.
 * @returns {Promise<object>} The extraction result.
 */
export async function extractPageContent(html, url) {
  let fromReadability = null;

  try {
    const pageMetadata = getPageMetadata(html);
    const parsedArticle = extractWithReadability(html, url);
    fromReadability = normalizeReadabilityArticle(
      parsedArticle,
      pageMetadata,
      url
    );
  } catch (_) {
    // Fall back to the local heuristic extractor when the library cannot parse
    // a page or when Worker bundling/runtime behavior differs by site.
  }

  return fromReadability || extractPageContentWithHeuristics(html, url);
}
@@ -0,0 +1 @@
1
+ export * from "./extract.impl.js";