atlas-mcp-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,125 @@
1
+ # atlas-mcp-web
2
+
3
+ **Give your AI agent the ability to understand any web page.** A premium Model Context Protocol (MCP) server that adds 6 powerful web extraction tools to Claude, Cursor, Windsurf, and any MCP-compatible AI agent.
4
+
5
+ No more "sorry, I can't browse the web for you." Your agent gets instant, structured access to:
6
+
7
+ - Clean article text for RAG and summarization
8
+ - Complete metadata (Open Graph, Twitter Card, JSON-LD)
9
+ - HTML tables as structured data
10
+ - All links, grouped and classified
11
+ - Contact info (emails, phones, socials)
12
+ - Website tech stack detection (70+ technologies)
13
+
14
+ ## Install
15
+
16
+ ### Claude Desktop
17
+
18
+ Edit `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):
19
+
20
+ ```json
21
+ {
22
+ "mcpServers": {
23
+ "atlas-web": {
24
+ "command": "npx",
25
+ "args": ["-y", "atlas-mcp-web"]
26
+ }
27
+ }
28
+ }
29
+ ```
30
+
31
+ Restart Claude Desktop. The 6 tools will appear in the tool menu.
32
+
33
+ ### Cursor / Windsurf
34
+
35
+ Add to your MCP config:
36
+
37
+ ```json
38
+ {
39
+ "mcpServers": {
40
+ "atlas-web": {
41
+ "command": "npx",
42
+ "args": ["-y", "atlas-mcp-web"]
43
+ }
44
+ }
45
+ }
46
+ ```
47
+
48
+ ### Test locally
49
+
50
+ ```bash
51
+ npx -y atlas-mcp-web
52
+ ```
53
+
54
+ ## Tools
55
+
56
+ ### `extract_article`
57
+
58
+ Pull the main body of any news article or blog post. Strips ads, nav, comments, and boilerplate. Returns clean text plus metadata.
59
+
60
+ **Input**
61
+ ```json
62
+ { "url": "https://www.bbc.com/news/articles/..." }
63
+ ```
64
+
65
+ **Output**
66
+ ```json
67
+ {
68
+ "url": "...",
69
+ "title": "Major AI breakthrough announced",
70
+ "description": "Researchers report...",
71
+ "authors": ["Jane Doe"],
72
+ "publishedAt": "2026-04-13T08:00:00Z",
73
+ "image": "https://...",
74
+ "siteName": "BBC News",
75
+ "language": "en",
76
+ "content": "The full cleaned body of the article...",
77
+ "wordCount": 842,
78
+ "readingTimeMinutes": 4,
79
+ "keywords": ["AI", "research", "machine learning"]
80
+ }
81
+ ```
82
+
83
+ ### `extract_metadata`
84
+
85
+ Complete URL metadata for link previews, SEO audits, and bookmarks: Open Graph, Twitter Card, JSON-LD structured data, favicons, and more.
86
+
87
+ ### `extract_tables`
88
+
89
+ Pull every HTML table on a page as structured arrays with headers and rows. Perfect for financial data, sports stats, product comparisons.
90
+
91
+ ### `extract_links`
92
+
93
+ All links on a page, grouped by internal / external / social / email / phone. With anchor text.
94
+
95
+ ### `extract_contact`
96
+
97
+ Scan a page for contact details: emails, phone numbers, and social media handles (Twitter, LinkedIn, Instagram, Facebook, YouTube, GitHub, TikTok). Ideal for lead generation.
98
+
99
+ ### `detect_tech_stack`
100
+
101
+ Identify the technologies powering a website: CMS, JS frameworks, CDN, analytics, hosting, ecommerce, marketing tools. 70+ signatures supported.
102
+
103
+ ## Why atlas-mcp-web?
104
+
105
+ - **Free and open source** (MIT license)
106
+ - **No API key required** — everything runs locally through your MCP client
107
+ - **Fast** — pure HTTP + Cheerio, no headless browser
108
+ - **Private** — your agent hits the target site directly, no middleman logs your queries
109
+ - **Premium options** — upgrade to the hosted API at [atlas-agent.dev](https://atlas-agent.dev) for proxy rotation, JS rendering, and higher rate limits
110
+
111
+ ## Use cases
112
+
113
+ - **Research agents** — Give Claude the ability to read any article in full, not just the title
114
+ - **Sales tools** — Automate lead generation by extracting contact info from company pages
115
+ - **SEO audits** — Scan hundreds of competitor URLs for tech stack, metadata, and structured data
116
+ - **RAG pipelines** — Feed clean article content directly into your vector database
117
+ - **Link preview generators** — Power Discord/Slack-style unfurls in your own apps
118
+
119
+ ## License
120
+
121
+ MIT © Cortex
122
+
123
+ ## Support
124
+
125
+ Issues and feature requests: [github.com/atlas-agent/mcp-web-extractor/issues](https://github.com/atlas-agent/mcp-web-extractor/issues)
@@ -0,0 +1,29 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Atlas MCP — Web Extractor for AI Agents
4
+ *
5
+ * A Model Context Protocol server that gives Claude, Cursor, Windsurf and any
6
+ * MCP-compatible AI agent the ability to extract clean, structured data from
7
+ * any web page:
8
+ *
9
+ * - extract_article Clean article body for RAG pipelines
10
+ * - extract_metadata Open Graph, Twitter Card, JSON-LD
11
+ * - extract_tables All HTML tables as structured rows
12
+ * - extract_links All links grouped by category
13
+ * - extract_contact Emails, phones, social handles
14
+ * - detect_tech_stack CMS, framework, CDN, analytics
15
+ *
16
+ * Install:
17
+ * npx -y atlas-mcp-web
18
+ *
19
+ * Or configure in Claude Desktop / Cursor / Windsurf:
20
+ * {
21
+ * "mcpServers": {
22
+ * "atlas-web": {
23
+ * "command": "npx",
24
+ * "args": ["-y", "atlas-mcp-web"]
25
+ * }
26
+ * }
27
+ * }
28
+ */
29
+ export {};
package/dist/index.js ADDED
@@ -0,0 +1,194 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Atlas MCP — Web Extractor for AI Agents
4
+ *
5
+ * A Model Context Protocol server that gives Claude, Cursor, Windsurf and any
6
+ * MCP-compatible AI agent the ability to extract clean, structured data from
7
+ * any web page:
8
+ *
9
+ * - extract_article Clean article body for RAG pipelines
10
+ * - extract_metadata Open Graph, Twitter Card, JSON-LD
11
+ * - extract_tables All HTML tables as structured rows
12
+ * - extract_links All links grouped by category
13
+ * - extract_contact Emails, phones, social handles
14
+ * - detect_tech_stack CMS, framework, CDN, analytics
15
+ *
16
+ * Install:
17
+ * npx -y atlas-mcp-web
18
+ *
19
+ * Or configure in Claude Desktop / Cursor / Windsurf:
20
+ * {
21
+ * "mcpServers": {
22
+ * "atlas-web": {
23
+ * "command": "npx",
24
+ * "args": ["-y", "atlas-mcp-web"]
25
+ * }
26
+ * }
27
+ * }
28
+ */
29
+ import { Server } from '@modelcontextprotocol/sdk/server/index.js';
30
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
31
+ import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
32
+ import { z } from 'zod';
33
+ import { extractArticle } from './tools/article.js';
34
+ import { extractMetadata } from './tools/metadata.js';
35
+ import { extractTables } from './tools/tables.js';
36
+ import { extractLinks } from './tools/links.js';
37
+ import { extractContact } from './tools/contact.js';
38
+ import { detectTechStack } from './tools/techstack.js';
39
+ import { fetchHtml } from './lib/fetch.js';
40
/** Shared input schema: every tool takes a target URL (validated by zod). */
const UrlInput = z.object({
  url: z.string().url().describe('The URL to extract data from'),
});

/** extract_links additionally accepts an optional same-domain filter flag. */
const LinksInput = UrlInput.extend({
  sameDomainOnly: z
    .boolean()
    .optional()
    .default(false)
    .describe('If true, only return links on the same domain'),
});
50
/**
 * Build the JSON-Schema input for a tool whose only argument is `url`.
 * Five of the six tools share this exact shape; only the url description differs.
 */
const urlOnlyInput = (urlDescription) => ({
  type: 'object',
  properties: {
    url: { type: 'string', description: urlDescription },
  },
  required: ['url'],
});

/** Tool registry advertised via the MCP tools/list handler. */
const TOOLS = [
  {
    name: 'extract_article',
    description: 'Extract the main article body from any news article, blog post, or long-form web page. Returns clean, readable text plus title, authors, publish date, word count, and keywords. Strips ads, navigation, comments, and boilerplate. Perfect for feeding content into RAG pipelines, summarization, or analysis.',
    inputSchema: urlOnlyInput('URL of the article to extract'),
  },
  {
    name: 'extract_metadata',
    description: 'Extract all metadata from any URL: Open Graph tags (title, description, image, site_name), Twitter Card metadata, JSON-LD structured data, canonical URL, favicons, and more. Ideal for building link previews, SEO audits, or enriching bookmarks.',
    inputSchema: urlOnlyInput('URL to extract metadata from'),
  },
  {
    name: 'extract_tables',
    description: 'Extract all HTML tables from a web page as structured arrays. Each table is returned with its headers and rows, ready for analysis. Perfect for scraping financial data, sports stats, product comparisons, or any tabular content.',
    inputSchema: urlOnlyInput('URL containing the tables to extract'),
  },
  {
    // The only tool with an extra argument, so its schema is spelled out.
    name: 'extract_links',
    description: 'Extract all links from a web page grouped by category: internal, external, social media, email, and phone. Returns anchor text alongside each URL. Useful for link audits, sitemap generation, and finding contact information.',
    inputSchema: {
      type: 'object',
      properties: {
        url: { type: 'string', description: 'URL to extract links from' },
        sameDomainOnly: {
          type: 'boolean',
          description: 'If true, only return links on the same domain as the source URL',
          default: false,
        },
      },
      required: ['url'],
    },
  },
  {
    name: 'extract_contact',
    description: "Scan a web page for contact information: email addresses, phone numbers, and social media handles (Twitter, LinkedIn, Instagram, Facebook, YouTube, GitHub). Returns normalized, deduplicated results. Perfect for lead generation and sales prospecting.",
    inputSchema: urlOnlyInput('URL to scan for contact info'),
  },
  {
    name: 'detect_tech_stack',
    description: 'Detect the technologies powering a website: CMS (WordPress, Shopify, etc), JS frameworks (React, Next.js, Vue), CDN (Cloudflare, Fastly), analytics tools (Google Analytics, Mixpanel), hosting (Vercel, Netlify), ecommerce platform, and more. 70+ technologies supported.',
    inputSchema: urlOnlyInput('URL of the website to analyze'),
  },
];
123
const server = new Server({ name: 'atlas-web', version: '0.1.0' }, { capabilities: { tools: {} } });

server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));

/**
 * Dispatch table: tool name -> async handler returning the raw extractor result.
 * Each handler validates its arguments with zod, fetches and parses the page,
 * then delegates to the matching extractor.
 */
const HANDLERS = {
  async extract_article(args) {
    const { url } = UrlInput.parse(args);
    const { $, html, finalUrl } = await fetchHtml(url);
    return extractArticle({ url: finalUrl, $, html });
  },
  async extract_metadata(args) {
    const { url } = UrlInput.parse(args);
    const { $, finalUrl, headers } = await fetchHtml(url);
    return extractMetadata({ url: finalUrl, $, headers });
  },
  async extract_tables(args) {
    const { url } = UrlInput.parse(args);
    const { $, finalUrl } = await fetchHtml(url);
    return extractTables({ url: finalUrl, $ });
  },
  async extract_links(args) {
    const { url, sameDomainOnly } = LinksInput.parse(args);
    const { $, finalUrl } = await fetchHtml(url);
    return extractLinks({ url: finalUrl, $, sameDomainOnly });
  },
  async extract_contact(args) {
    const { url } = UrlInput.parse(args);
    const { $, html, finalUrl } = await fetchHtml(url);
    return extractContact({ url: finalUrl, $, html });
  },
  async detect_tech_stack(args) {
    const { url } = UrlInput.parse(args);
    const { $, html, finalUrl, headers } = await fetchHtml(url);
    return detectTechStack({ url: finalUrl, $, html, headers });
  },
};

server.setRequestHandler(CallToolRequestSchema, async (request) => {
  const { name, arguments: args } = request.params;
  try {
    // Own-property check so inherited names (e.g. "toString") still reject.
    const handler = Object.prototype.hasOwnProperty.call(HANDLERS, name)
      ? HANDLERS[name]
      : undefined;
    if (!handler) {
      throw new Error(`Unknown tool: ${name}`);
    }
    return toolResult(await handler(args));
  } catch (error) {
    // All failures (validation, HTTP, parsing) are reported in-band as an
    // MCP error result rather than crashing the server.
    const message = error instanceof Error ? error.message : String(error);
    return {
      content: [{ type: 'text', text: `Error: ${message}` }],
      isError: true,
    };
  }
});
174
/**
 * Wrap an extractor's plain-data result in the MCP tool-result envelope,
 * serialized as pretty-printed JSON text.
 */
function toolResult(data) {
  const text = JSON.stringify(data, null, 2);
  return { content: [{ type: 'text', text }] };
}
184
/** Connect the MCP server to a stdio transport and announce readiness. */
async function main() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  // Log to stderr: stdout is reserved for the MCP protocol stream.
  // eslint-disable-next-line no-console
  console.error('Atlas MCP Web Extractor ready');
}

main().catch((err) => {
  // eslint-disable-next-line no-console
  console.error('Fatal error:', err);
  process.exit(1);
});
@@ -0,0 +1,12 @@
1
+ /**
2
+ * HTTP fetch with polite defaults, redirect following, and Cheerio parsing.
3
+ */
4
+ import { type CheerioAPI } from 'cheerio';
5
+ export interface FetchResult {
6
+ $: CheerioAPI;
7
+ html: string;
8
+ finalUrl: string;
9
+ headers: Record<string, string>;
10
+ statusCode: number;
11
+ }
12
+ export declare function fetchHtml(url: string, timeoutMs?: number): Promise<FetchResult>;
@@ -0,0 +1,39 @@
1
+ /**
2
+ * HTTP fetch with polite defaults, redirect following, and Cheerio parsing.
3
+ */
4
+ import { load } from 'cheerio';
5
// NOTE(review): the UA still says "Cortex-MCP" while the package brands itself
// Atlas (atlas-mcp-web / atlas-agent.dev) — confirm which branding is intended.
const USER_AGENT = 'Mozilla/5.0 (compatible; Cortex-MCP/0.1; +https://atlas-agent.dev/bot)';

/**
 * Fetch a URL, follow redirects, and parse the body with Cheerio.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [timeoutMs=20000] - Abort the request after this many ms.
 * @returns {Promise<FetchResult>} Cheerio root, raw HTML, final URL after
 *   redirects, lower-cased response headers, and the HTTP status code.
 * @throws {Error} On non-2xx responses, on network failure, and — fixed here —
 *   on timeout with a descriptive message instead of the opaque
 *   "This operation was aborted" AbortError that previously leaked through.
 */
export async function fetchHtml(url, timeoutMs = 20000) {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), timeoutMs);
  try {
    let response;
    try {
      response = await fetch(url, {
        headers: {
          'User-Agent': USER_AGENT,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
          'Accept-Language': 'en-US,en;q=0.9',
        },
        redirect: 'follow',
        signal: controller.signal,
      });
    } catch (err) {
      // Translate the abort triggered by our own timer into an actionable
      // error; rethrow everything else (DNS failure, refused connection, …).
      if (err && (err.name === 'AbortError' || err.name === 'TimeoutError')) {
        throw new Error(`Timed out after ${timeoutMs}ms fetching ${url}`, { cause: err });
      }
      throw err;
    }
    if (!response.ok) {
      throw new Error(`HTTP ${response.status} ${response.statusText} when fetching ${url}`);
    }
    const html = await response.text();
    // Normalize header names to lower case for consistent downstream lookups.
    const headers = {};
    response.headers.forEach((value, key) => {
      headers[key.toLowerCase()] = value;
    });
    return {
      $: load(html),
      html,
      finalUrl: response.url,
      headers,
      statusCode: response.status,
    };
  } finally {
    clearTimeout(timer);
  }
}
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Clean article extraction: title, authors, date, content.
3
+ * JSON-LD first, then OG/meta, then readability heuristics.
4
+ */
5
+ import type { CheerioAPI } from 'cheerio';
6
+ export declare function extractArticle({ url, $, }: {
7
+ url: string;
8
+ $: CheerioAPI;
9
+ html: string;
10
+ }): {
11
+ url: string;
12
+ title: string;
13
+ description: string;
14
+ authors: string[];
15
+ publishedAt: string | null;
16
+ image: string | null;
17
+ siteName: string | null;
18
+ language: string | null;
19
+ content: string;
20
+ wordCount: number;
21
+ readingTimeMinutes: number;
22
+ keywords: string[];
23
+ };
@@ -0,0 +1,260 @@
1
+ /**
2
+ * Clean article extraction: title, authors, date, content.
3
+ * JSON-LD first, then OG/meta, then readability heuristics.
4
+ */
5
+ const MIN_CHARS = 280;
6
/**
 * Extract the main article body and metadata from a parsed page.
 * Precedence everywhere: JSON-LD structured data, then OG/Twitter meta tags,
 * then DOM heuristics.
 */
export function extractArticle({ url, $, }) {
  const meta = extractMeta($);
  const jsonLd = extractJsonLd($);

  // Visible DOM fallbacks for the title: first <h1>, then <title>.
  const domTitle =
    ($('h1').first().text() || '').trim() || ($('title').first().text() || '').trim();
  const title = jsonLd.headline || meta.ogTitle || meta.twitterTitle || domTitle;

  const description =
    jsonLd.description || meta.ogDescription || meta.twitterDescription || meta.description || '';

  const authors = jsonLd.authors.length ? jsonLd.authors : extractAuthorsFromMeta($, meta);

  const publishedAt =
    jsonLd.datePublished ||
    meta.articlePublishedTime ||
    $('time[datetime]').first().attr('datetime') ||
    null;

  const image = jsonLd.image || meta.ogImage || meta.twitterImage || null;
  const siteName = meta.ogSiteName || extractDomain(url);

  // Reduce "en-GB"-style tags to the primary subtag; empty string becomes null.
  const langAttr = $('html').attr('lang') || meta.ogLocale || '';
  const language = langAttr.split('-')[0].toLowerCase() || null;

  // Try known article containers first; fall back to a generic paragraph scan.
  let body = extractContentFromArticle($);
  if (!body || body.length < MIN_CHARS) {
    body = extractContentByReadability($);
  }
  body = cleanText(body);

  const wordCount = body.split(/\s+/).filter(Boolean).length;

  return {
    url,
    title: cleanText(title),
    description: cleanText(description),
    authors,
    publishedAt,
    image,
    siteName,
    language,
    content: body,
    wordCount,
    // ~220 wpm reading speed, never reported as less than one minute.
    readingTimeMinutes: Math.max(1, Math.round(wordCount / 220)),
    keywords: extractKeywords($, meta, jsonLd),
  };
}
53
/**
 * Flatten the page's <meta> tags into a single lookup object.
 * Later duplicates overwrite earlier ones; repeated article:tag values
 * accumulate into `articleTags`.
 */
function extractMeta($) {
  // <meta name="..."> keys and the flat property each maps to.
  const NAME_MAP = {
    'description': 'description',
    'keywords': 'keywords',
    'author': 'author',
    'twitter:title': 'twitterTitle',
    'twitter:description': 'twitterDescription',
    'twitter:image': 'twitterImage',
  };
  // <meta property="..."> keys (Open Graph / article namespaces).
  const PROP_MAP = {
    'og:title': 'ogTitle',
    'og:description': 'ogDescription',
    'og:image': 'ogImage',
    'og:site_name': 'ogSiteName',
    'og:locale': 'ogLocale',
    'article:published_time': 'articlePublishedTime',
    'article:modified_time': 'articleModifiedTime',
    'article:author': 'articleAuthor',
  };
  const meta = {};
  $('meta').each((_, el) => {
    const $el = $(el);
    const content = $el.attr('content') || '';
    if (!content) return;
    const name = ($el.attr('name') || '').toLowerCase();
    const property = ($el.attr('property') || '').toLowerCase();
    if (NAME_MAP[name]) meta[NAME_MAP[name]] = content;
    if (PROP_MAP[property]) meta[PROP_MAP[property]] = content;
    if (property === 'article:tag') {
      if (!Array.isArray(meta.articleTags)) meta.articleTags = [];
      meta.articleTags.push(content);
    }
  });
  return meta;
}
97
/**
 * Parse every <script type="application/ld+json"> block and accumulate
 * article fields (headline, authors, date, image, keywords) into one record.
 * Malformed JSON-LD is skipped silently — it is extremely common in the wild.
 */
function extractJsonLd($) {
  const out = {
    headline: null,
    description: null,
    authors: [],
    datePublished: null,
    image: null,
    keywords: [],
  };
  $('script[type="application/ld+json"]').each((_, el) => {
    const raw = $(el).contents().text();
    if (!raw) return;
    let parsed;
    try {
      parsed = JSON.parse(raw);
    } catch {
      return; // ignore broken blocks
    }
    const items = Array.isArray(parsed) ? parsed : [parsed];
    for (const item of items) {
      collectJsonLdArticle(item, out);
      // Many publishers nest everything under an @graph array.
      const graph = item && typeof item === 'object' ? item['@graph'] : undefined;
      if (Array.isArray(graph)) {
        graph.forEach((sub) => collectJsonLdArticle(sub, out));
      }
    }
  });
  return out;
}
130
/**
 * Merge one JSON-LD node into the accumulator `out`, but only if its @type
 * identifies it as an article. First non-empty value wins for scalar fields;
 * authors and keywords are appended with de-duplication.
 */
function collectJsonLdArticle(item, out) {
  if (!item || typeof item !== 'object') return;
  const obj = item;
  const type = obj['@type'];
  // A bare string @type must match the known list exactly; an array of types
  // matches if any entry looks article-like (regex, per the original logic).
  const stringMatch =
    type === 'NewsArticle' || type === 'Article' || type === 'BlogPosting' || type === 'Report';
  const arrayMatch =
    Array.isArray(type) &&
    type.some((t) => typeof t === 'string' && /Article|Posting/.test(t));
  if (!stringMatch && !arrayMatch) return;

  if (!out.headline && typeof obj.headline === 'string') {
    out.headline = obj.headline;
  }
  if (!out.description && typeof obj.description === 'string') {
    out.description = obj.description;
  }
  if (!out.datePublished && typeof obj.datePublished === 'string') {
    out.datePublished = obj.datePublished;
  }
  if (!out.image) {
    // image may be a URL string, an ImageObject, or an array of either.
    const img = obj.image;
    if (typeof img === 'string') {
      out.image = img;
    } else if (Array.isArray(img) && img.length) {
      const head = img[0];
      out.image = typeof head === 'string' ? head : head?.url || null;
    } else if (img && typeof img === 'object') {
      out.image = img.url || null;
    }
  }
  // author may be one value or an array of strings/Person objects.
  for (const entry of [].concat(obj.author || []).flat()) {
    if (!entry) continue;
    const name = typeof entry === 'string' ? entry : entry?.name || null;
    if (name && !out.authors.includes(name)) {
      out.authors.push(name);
    }
  }
  if (obj.keywords) {
    const list = Array.isArray(obj.keywords)
      ? obj.keywords
      : String(obj.keywords).split(',').map((k) => k.trim());
    for (const kw of list) {
      if (kw && !out.keywords.includes(kw)) {
        out.keywords.push(kw);
      }
    }
  }
}
177
/**
 * Fallback author discovery when JSON-LD has none: meta tags first, then
 * common byline selectors in the DOM. Returns at most 10 unique names.
 */
function extractAuthorsFromMeta($, meta) {
  const found = new Set();
  for (const candidate of [meta.author, meta.articleAuthor]) {
    if (candidate) found.add(candidate);
  }
  // Length cap avoids swallowing whole bio paragraphs that match the selector.
  $('[rel="author"], .author, .byline, .post-author').each((_, el) => {
    const text = $(el).text().trim();
    if (text && text.length < 100) found.add(text);
  });
  return Array.from(found).filter(Boolean).slice(0, 10);
}
190
/**
 * Extract body text from a known article container. Works on a detached clone
 * of the DOM with obvious chrome/noise removed, probing selectors from most
 * to least specific. Returns '' when nothing long enough is found.
 */
function extractContentFromArticle($) {
  const NOISE_SELECTOR = [
    'script', 'style', 'nav', 'header', 'footer', 'aside',
    '.ad', '.ads', '.advertisement', '.social', '.share', '.newsletter',
    '[class*="subscribe"]', '[class*="paywall"]', '[class*="related"]',
    '[class*="comment"]', '[class*="sidebar"]',
  ].join(', ');
  const $clone = $.root().clone();
  $clone.find(NOISE_SELECTOR).remove();

  const CANDIDATES = [
    'article',
    '[itemprop="articleBody"]',
    '[role="main"] article',
    'main article',
    '.article-body',
    '.post-content',
    '.entry-content',
    '.story-body',
    'main',
  ];
  for (const selector of CANDIDATES) {
    const container = $clone.find(selector).first();
    if (!container.length) continue;
    const text = container
      .find('p')
      .map((_, p) => $(p).text())
      .get()
      .join('\n\n')
      .trim();
    // Too-short matches usually mean we hit a nav shell, not the article.
    if (text.length >= MIN_CHARS) return text;
  }
  return '';
}
220
/**
 * Last-resort content scan: treat every <p> with at least 40 characters of
 * text as body copy and join them with blank lines.
 */
function extractContentByReadability($) {
  const chunks = $('p')
    .map((_, el) => $(el).text().trim())
    .get()
    .filter((text) => text.length >= 40);
  return chunks.join('\n\n');
}
229
/**
 * Merge keywords from the meta keywords tag, article:tag properties, and
 * JSON-LD, de-duplicated and capped at 25. ($ is unused; kept for signature
 * parity with the other extract helpers.)
 */
function extractKeywords($, meta, jsonLd) {
  const unique = new Set();
  if (meta.keywords) {
    String(meta.keywords)
      .split(',')
      .map((part) => part.trim())
      .filter(Boolean)
      .forEach((kw) => unique.add(kw));
  }
  if (Array.isArray(meta.articleTags)) {
    meta.articleTags.forEach((tag) => unique.add(tag));
  }
  jsonLd.keywords.forEach((kw) => unique.add(kw));
  return Array.from(unique).slice(0, 25);
}
245
/**
 * Normalize whitespace in extracted text.
 *
 * Fix: the previous `\s+ -> ' '` collapse also destroyed the `\n\n` paragraph
 * separators that extractContentFromArticle/extractContentByReadability
 * deliberately insert, flattening the article body into one long line. Now
 * paragraph breaks are preserved while intra-paragraph whitespace (spaces,
 * tabs, single newlines) still collapses to single spaces, and stray spaces
 * before punctuation are still removed.
 *
 * @param {string|null|undefined} text - Raw extracted text.
 * @returns {string} Cleaned text; '' for falsy input.
 */
function cleanText(text) {
  if (!text)
    return '';
  return String(text)
    .replace(/[^\S\n]+/g, ' ')   // collapse spaces/tabs, keep newlines
    .replace(/ ?\n ?/g, '\n')    // strip spaces hugging line breaks
    .replace(/\n{2,}/g, '\u0000') // protect paragraph breaks…
    .replace(/\n/g, ' ')          // …collapse lone newlines to spaces…
    .replace(/\u0000/g, '\n\n')   // …and restore paragraph breaks
    .replace(/ ([.,;:!?])/g, '$1')
    .trim();
}
253
/**
 * Hostname of a URL without a leading "www."; null when the URL is invalid.
 */
function extractDomain(url) {
  let host;
  try {
    host = new URL(url).hostname;
  } catch {
    return null;
  }
  return host.startsWith('www.') ? host.slice(4) : host;
}