atlas-mcp-web 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +125 -0
- package/dist/index.d.ts +29 -0
- package/dist/index.js +194 -0
- package/dist/lib/fetch.d.ts +12 -0
- package/dist/lib/fetch.js +39 -0
- package/dist/tools/article.d.ts +23 -0
- package/dist/tools/article.js +260 -0
- package/dist/tools/contact.d.ts +19 -0
- package/dist/tools/contact.js +85 -0
- package/dist/tools/links.d.ts +34 -0
- package/dist/tools/links.js +107 -0
- package/dist/tools/metadata.d.ts +32 -0
- package/dist/tools/metadata.js +112 -0
- package/dist/tools/tables.d.ts +18 -0
- package/dist/tools/tables.js +75 -0
- package/dist/tools/techstack.d.ts +19 -0
- package/dist/tools/techstack.js +138 -0
- package/package.json +48 -0
package/README.md
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# atlas-mcp-web
|
|
2
|
+
|
|
3
|
+
**Give your AI agent the ability to understand any web page.** A free, open-source Model Context Protocol (MCP) server that adds 6 powerful web extraction tools to Claude, Cursor, Windsurf, and any MCP-compatible AI agent.
|
|
4
|
+
|
|
5
|
+
No more "sorry, I can't browse the web for you." Your agent gets instant, structured access to:
|
|
6
|
+
|
|
7
|
+
- Clean article text for RAG and summarization
|
|
8
|
+
- Complete metadata (Open Graph, Twitter Card, JSON-LD)
|
|
9
|
+
- HTML tables as structured data
|
|
10
|
+
- All links, grouped and classified
|
|
11
|
+
- Contact info (emails, phones, socials)
|
|
12
|
+
- Website tech stack detection (70+ technologies)
|
|
13
|
+
|
|
14
|
+
## Install
|
|
15
|
+
|
|
16
|
+
### Claude Desktop
|
|
17
|
+
|
|
18
|
+
Edit `~/Library/Application Support/Claude/claude_desktop_config.json` (macOS) or `%APPDATA%\Claude\claude_desktop_config.json` (Windows):
|
|
19
|
+
|
|
20
|
+
```json
|
|
21
|
+
{
|
|
22
|
+
"mcpServers": {
|
|
23
|
+
"atlas-web": {
|
|
24
|
+
"command": "npx",
|
|
25
|
+
"args": ["-y", "atlas-mcp-web"]
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Restart Claude Desktop. The 6 tools will appear in the tool menu.
|
|
32
|
+
|
|
33
|
+
### Cursor / Windsurf
|
|
34
|
+
|
|
35
|
+
Add to your MCP config:
|
|
36
|
+
|
|
37
|
+
```json
|
|
38
|
+
{
|
|
39
|
+
"mcpServers": {
|
|
40
|
+
"atlas-web": {
|
|
41
|
+
"command": "npx",
|
|
42
|
+
"args": ["-y", "atlas-mcp-web"]
|
|
43
|
+
}
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Test locally
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
npx -y atlas-mcp-web
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
## Tools
|
|
55
|
+
|
|
56
|
+
### `extract_article`
|
|
57
|
+
|
|
58
|
+
Pull the main body of any news article or blog post. Strips ads, nav, comments, and boilerplate. Returns clean text plus metadata.
|
|
59
|
+
|
|
60
|
+
**Input**
|
|
61
|
+
```json
|
|
62
|
+
{ "url": "https://www.bbc.com/news/articles/..." }
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
**Output**
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"url": "...",
|
|
69
|
+
"title": "Major AI breakthrough announced",
|
|
70
|
+
"description": "Researchers report...",
|
|
71
|
+
"authors": ["Jane Doe"],
|
|
72
|
+
"publishedAt": "2026-04-13T08:00:00Z",
|
|
73
|
+
"image": "https://...",
|
|
74
|
+
"siteName": "BBC News",
|
|
75
|
+
"language": "en",
|
|
76
|
+
"content": "The full cleaned body of the article...",
|
|
77
|
+
"wordCount": 842,
|
|
78
|
+
"readingTimeMinutes": 4,
|
|
79
|
+
"keywords": ["AI", "research", "machine learning"]
|
|
80
|
+
}
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
### `extract_metadata`
|
|
84
|
+
|
|
85
|
+
Complete URL metadata for link previews, SEO audits, and bookmarks: Open Graph, Twitter Card, JSON-LD structured data, favicons, and more.
|
|
86
|
+
|
|
87
|
+
### `extract_tables`
|
|
88
|
+
|
|
89
|
+
Pull every HTML table on a page as structured arrays with headers and rows. Perfect for financial data, sports stats, product comparisons.
|
|
90
|
+
|
|
91
|
+
### `extract_links`
|
|
92
|
+
|
|
93
|
+
All links on a page, grouped by internal / external / social / email / phone. With anchor text.
|
|
94
|
+
|
|
95
|
+
### `extract_contact`
|
|
96
|
+
|
|
97
|
+
Scan a page for contact details: emails, phone numbers, and social media handles (Twitter, LinkedIn, Instagram, Facebook, YouTube, GitHub, TikTok). Ideal for lead generation.
|
|
98
|
+
|
|
99
|
+
### `detect_tech_stack`
|
|
100
|
+
|
|
101
|
+
Identify the technologies powering a website: CMS, JS frameworks, CDN, analytics, hosting, ecommerce, marketing tools. 70+ signatures supported.
|
|
102
|
+
|
|
103
|
+
## Why atlas-mcp-web?
|
|
104
|
+
|
|
105
|
+
- **Free and open source** (MIT license)
|
|
106
|
+
- **No API key required** — everything runs locally through your MCP client
|
|
107
|
+
- **Fast** — pure HTTP + Cheerio, no headless browser
|
|
108
|
+
- **Private** — your agent hits the target site directly, no middleman logs your queries
|
|
109
|
+
- **Premium options** — upgrade to the hosted API at [atlas-agent.dev](https://atlas-agent.dev) for proxy rotation, JS rendering, and higher rate limits
|
|
110
|
+
|
|
111
|
+
## Use cases
|
|
112
|
+
|
|
113
|
+
- **Research agents** — Give Claude the ability to read any article in full, not just the title
|
|
114
|
+
- **Sales tools** — Automate lead generation by extracting contact info from company pages
|
|
115
|
+
- **SEO audits** — Scan hundreds of competitor URLs for tech stack, metadata, and structured data
|
|
116
|
+
- **RAG pipelines** — Feed clean article content directly into your vector database
|
|
117
|
+
- **Link preview generators** — Power Discord/Slack-style unfurls in your own apps
|
|
118
|
+
|
|
119
|
+
## License
|
|
120
|
+
|
|
121
|
+
MIT © Atlas
|
|
122
|
+
|
|
123
|
+
## Support
|
|
124
|
+
|
|
125
|
+
Issues and feature requests: [github.com/atlas-agent/mcp-web-extractor/issues](https://github.com/atlas-agent/mcp-web-extractor/issues)
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Atlas MCP — Web Extractor for AI Agents
|
|
4
|
+
*
|
|
5
|
+
* A Model Context Protocol server that gives Claude, Cursor, Windsurf and any
|
|
6
|
+
* MCP-compatible AI agent the ability to extract clean, structured data from
|
|
7
|
+
* any web page:
|
|
8
|
+
*
|
|
9
|
+
* - extract_article Clean article body for RAG pipelines
|
|
10
|
+
* - extract_metadata Open Graph, Twitter Card, JSON-LD
|
|
11
|
+
* - extract_tables All HTML tables as structured rows
|
|
12
|
+
* - extract_links All links grouped by category
|
|
13
|
+
* - extract_contact Emails, phones, social handles
|
|
14
|
+
* - detect_tech_stack CMS, framework, CDN, analytics
|
|
15
|
+
*
|
|
16
|
+
* Install:
|
|
17
|
+
* npx -y atlas-mcp-web
|
|
18
|
+
*
|
|
19
|
+
* Or configure in Claude Desktop / Cursor / Windsurf:
|
|
20
|
+
* {
|
|
21
|
+
* "mcpServers": {
|
|
22
|
+
* "atlas-web": {
|
|
23
|
+
* "command": "npx",
|
|
24
|
+
* "args": ["-y", "atlas-mcp-web"]
|
|
25
|
+
* }
|
|
26
|
+
* }
|
|
27
|
+
* }
|
|
28
|
+
*/
|
|
29
|
+
export {};
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Atlas MCP — Web Extractor for AI Agents
|
|
4
|
+
*
|
|
5
|
+
* A Model Context Protocol server that gives Claude, Cursor, Windsurf and any
|
|
6
|
+
* MCP-compatible AI agent the ability to extract clean, structured data from
|
|
7
|
+
* any web page:
|
|
8
|
+
*
|
|
9
|
+
* - extract_article Clean article body for RAG pipelines
|
|
10
|
+
* - extract_metadata Open Graph, Twitter Card, JSON-LD
|
|
11
|
+
* - extract_tables All HTML tables as structured rows
|
|
12
|
+
* - extract_links All links grouped by category
|
|
13
|
+
* - extract_contact Emails, phones, social handles
|
|
14
|
+
* - detect_tech_stack CMS, framework, CDN, analytics
|
|
15
|
+
*
|
|
16
|
+
* Install:
|
|
17
|
+
* npx -y atlas-mcp-web
|
|
18
|
+
*
|
|
19
|
+
* Or configure in Claude Desktop / Cursor / Windsurf:
|
|
20
|
+
* {
|
|
21
|
+
* "mcpServers": {
|
|
22
|
+
* "atlas-web": {
|
|
23
|
+
* "command": "npx",
|
|
24
|
+
* "args": ["-y", "atlas-mcp-web"]
|
|
25
|
+
* }
|
|
26
|
+
* }
|
|
27
|
+
* }
|
|
28
|
+
*/
|
|
29
|
+
import { Server } from '@modelcontextprotocol/sdk/server/index.js';
|
|
30
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
31
|
+
import { CallToolRequestSchema, ListToolsRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
|
|
32
|
+
import { z } from 'zod';
|
|
33
|
+
import { extractArticle } from './tools/article.js';
|
|
34
|
+
import { extractMetadata } from './tools/metadata.js';
|
|
35
|
+
import { extractTables } from './tools/tables.js';
|
|
36
|
+
import { extractLinks } from './tools/links.js';
|
|
37
|
+
import { extractContact } from './tools/contact.js';
|
|
38
|
+
import { detectTechStack } from './tools/techstack.js';
|
|
39
|
+
import { fetchHtml } from './lib/fetch.js';
|
|
40
|
+
// Shared zod schema: every tool accepts a single absolute URL.
const UrlInput = z.object({
    url: z.string().url().describe('The URL to extract data from'),
});
// extract_links additionally accepts an optional same-domain filter flag
// (defaults to false, i.e. return links to any domain).
const LinksInput = UrlInput.extend({
    sameDomainOnly: z
        .boolean()
        .optional()
        .default(false)
        .describe('If true, only return links on the same domain'),
});
|
|
50
|
+
// Static MCP tool catalogue returned by the ListTools handler. Each entry
// follows the MCP tool shape: name, human-readable description, and a JSON
// Schema describing its input. All tools take a `url` string; extract_links
// additionally accepts `sameDomainOnly`.
const TOOLS = [
    {
        name: 'extract_article',
        description: 'Extract the main article body from any news article, blog post, or long-form web page. Returns clean, readable text plus title, authors, publish date, word count, and keywords. Strips ads, navigation, comments, and boilerplate. Perfect for feeding content into RAG pipelines, summarization, or analysis.',
        inputSchema: {
            type: 'object',
            properties: {
                url: { type: 'string', description: 'URL of the article to extract' },
            },
            required: ['url'],
        },
    },
    {
        name: 'extract_metadata',
        description: 'Extract all metadata from any URL: Open Graph tags (title, description, image, site_name), Twitter Card metadata, JSON-LD structured data, canonical URL, favicons, and more. Ideal for building link previews, SEO audits, or enriching bookmarks.',
        inputSchema: {
            type: 'object',
            properties: {
                url: { type: 'string', description: 'URL to extract metadata from' },
            },
            required: ['url'],
        },
    },
    {
        name: 'extract_tables',
        description: 'Extract all HTML tables from a web page as structured arrays. Each table is returned with its headers and rows, ready for analysis. Perfect for scraping financial data, sports stats, product comparisons, or any tabular content.',
        inputSchema: {
            type: 'object',
            properties: {
                url: { type: 'string', description: 'URL containing the tables to extract' },
            },
            required: ['url'],
        },
    },
    {
        name: 'extract_links',
        description: 'Extract all links from a web page grouped by category: internal, external, social media, email, and phone. Returns anchor text alongside each URL. Useful for link audits, sitemap generation, and finding contact information.',
        inputSchema: {
            type: 'object',
            properties: {
                url: { type: 'string', description: 'URL to extract links from' },
                sameDomainOnly: {
                    type: 'boolean',
                    description: 'If true, only return links on the same domain as the source URL',
                    default: false,
                },
            },
            required: ['url'],
        },
    },
    {
        // NOTE(review): the README also advertises TikTok detection for this
        // tool, but this description omits it — confirm against the
        // extract_contact implementation and align the two.
        name: 'extract_contact',
        description: "Scan a web page for contact information: email addresses, phone numbers, and social media handles (Twitter, LinkedIn, Instagram, Facebook, YouTube, GitHub). Returns normalized, deduplicated results. Perfect for lead generation and sales prospecting.",
        inputSchema: {
            type: 'object',
            properties: {
                url: { type: 'string', description: 'URL to scan for contact info' },
            },
            required: ['url'],
        },
    },
    {
        name: 'detect_tech_stack',
        description: 'Detect the technologies powering a website: CMS (WordPress, Shopify, etc), JS frameworks (React, Next.js, Vue), CDN (Cloudflare, Fastly), analytics tools (Google Analytics, Mixpanel), hosting (Vercel, Netlify), ecommerce platform, and more. 70+ technologies supported.',
        inputSchema: {
            type: 'object',
            properties: {
                url: { type: 'string', description: 'URL of the website to analyze' },
            },
            required: ['url'],
        },
    },
];
|
|
123
|
+
// Server identity and capabilities: this server exposes tools only.
const server = new Server({ name: 'atlas-web', version: '0.1.0' }, { capabilities: { tools: {} } });
// Advertise the static tool catalogue.
server.setRequestHandler(ListToolsRequestSchema, async () => ({ tools: TOOLS }));
// Dispatch a tool call: validate arguments with zod, fetch and parse the
// page once, then hand the parsed document to the matching extractor.
// Any failure (validation, network, extraction) is returned as an MCP
// error result rather than thrown to the transport.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    const { name, arguments: args } = request.params;
    try {
        switch (name) {
            case 'extract_article': {
                const { url } = UrlInput.parse(args);
                const { $, html, finalUrl } = await fetchHtml(url);
                return toolResult(extractArticle({ url: finalUrl, $, html }));
            }
            case 'extract_metadata': {
                const { url } = UrlInput.parse(args);
                const { $, finalUrl, headers } = await fetchHtml(url);
                return toolResult(extractMetadata({ url: finalUrl, $, headers }));
            }
            case 'extract_tables': {
                const { url } = UrlInput.parse(args);
                const { $, finalUrl } = await fetchHtml(url);
                return toolResult(extractTables({ url: finalUrl, $ }));
            }
            case 'extract_links': {
                const { url, sameDomainOnly } = LinksInput.parse(args);
                const { $, finalUrl } = await fetchHtml(url);
                return toolResult(extractLinks({ url: finalUrl, $, sameDomainOnly }));
            }
            case 'extract_contact': {
                const { url } = UrlInput.parse(args);
                const { $, html, finalUrl } = await fetchHtml(url);
                return toolResult(extractContact({ url: finalUrl, $, html }));
            }
            case 'detect_tech_stack': {
                const { url } = UrlInput.parse(args);
                const { $, html, finalUrl, headers } = await fetchHtml(url);
                return toolResult(detectTechStack({ url: finalUrl, $, html, headers }));
            }
            default:
                throw new Error(`Unknown tool: ${name}`);
        }
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        return {
            content: [{ type: 'text', text: `Error: ${message}` }],
            isError: true,
        };
    }
});
|
|
174
|
+
/**
 * Wrap extractor output as an MCP text content result.
 *
 * @param {unknown} data - Any JSON-serializable extractor result.
 * @returns {{ content: Array<{ type: 'text', text: string }> }} A single
 *   text content item holding the pretty-printed (2-space) JSON.
 */
function toolResult(data) {
    const text = JSON.stringify(data, null, 2);
    return { content: [{ type: 'text', text }] };
}
|
|
184
|
+
/**
 * Connect the server over stdio and report readiness.
 * Logging goes to stderr because stdout carries the MCP protocol stream.
 */
async function main() {
    await server.connect(new StdioServerTransport());
    // eslint-disable-next-line no-console
    console.error('Atlas MCP Web Extractor ready');
}
main().catch((err) => {
    // eslint-disable-next-line no-console
    console.error('Fatal error:', err);
    process.exit(1);
});
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
 * HTTP fetch with polite defaults, redirect following, and Cheerio parsing.
 */
import { type CheerioAPI } from 'cheerio';
/** Result of fetching and parsing one HTML page. */
export interface FetchResult {
    /** Cheerio handle over the parsed document. */
    $: CheerioAPI;
    /** Raw response body text. */
    html: string;
    /** URL of the final response, after any redirects were followed. */
    finalUrl: string;
    /** Response headers keyed by lower-cased header name. */
    headers: Record<string, string>;
    /** HTTP status code of the final response. */
    statusCode: number;
}
/**
 * Fetch `url` as HTML and parse it with Cheerio.
 * @param url Absolute URL to fetch.
 * @param timeoutMs Abort the request after this many milliseconds
 *   (implementation default applies when omitted).
 * @throws On non-2xx responses, network failure, or timeout.
 */
export declare function fetchHtml(url: string, timeoutMs?: number): Promise<FetchResult>;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTTP fetch with polite defaults, redirect following, and Cheerio parsing.
|
|
3
|
+
*/
|
|
4
|
+
import { load } from 'cheerio';
|
|
5
|
+
// Identification string sent with every request so site operators can
// recognize the bot. NOTE(review): the brand here says "Cortex" while the
// package is "atlas" — confirm which is canonical before changing, since
// this string is visible in remote server logs (behavior-affecting).
const USER_AGENT = 'Mozilla/5.0 (compatible; Cortex-MCP/0.1; +https://atlas-agent.dev/bot)';
/**
 * Fetch a URL as HTML and parse it with Cheerio.
 *
 * @param {string} url - Absolute URL to fetch.
 * @param {number} [timeoutMs=20000] - Abort the request after this many ms.
 * @returns {Promise<{$: import('cheerio').CheerioAPI, html: string, finalUrl: string, headers: Record<string, string>, statusCode: number}>}
 *   Parsed document, raw HTML, final URL after redirects, lower-cased
 *   response headers, and the HTTP status code.
 * @throws {Error} On non-2xx responses, network failure, or timeout. A
 *   timeout is reported with a descriptive message (URL and limit) instead
 *   of the opaque AbortError fetch() raises when its signal fires.
 */
export async function fetchHtml(url, timeoutMs = 20000) {
    const controller = new AbortController();
    const timeout = setTimeout(() => controller.abort(), timeoutMs);
    try {
        const response = await fetch(url, {
            headers: {
                'User-Agent': USER_AGENT,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9',
            },
            redirect: 'follow',
            signal: controller.signal,
        });
        if (!response.ok) {
            throw new Error(`HTTP ${response.status} ${response.statusText} when fetching ${url}`);
        }
        const html = await response.text();
        const $ = load(html);
        // Normalize header names to lower case for predictable lookups.
        const headers = {};
        response.headers.forEach((value, key) => {
            headers[key.toLowerCase()] = value;
        });
        return {
            $,
            html,
            finalUrl: response.url, // reflects any redirects that were followed
            headers,
            statusCode: response.status,
        };
    }
    catch (error) {
        // BUGFIX: fetch() rejects with a generic AbortError when the timeout
        // fires; surface a message that names the URL and the limit so tool
        // consumers can tell a timeout from a user-initiated abort.
        if (error instanceof Error && error.name === 'AbortError') {
            throw new Error(`Timed out after ${timeoutMs}ms fetching ${url}`, { cause: error });
        }
        throw error;
    }
    finally {
        clearTimeout(timeout);
    }
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Clean article extraction: title, authors, date, content.
 * JSON-LD first, then OG/meta, then readability heuristics.
 */
import type { CheerioAPI } from 'cheerio';
/**
 * Extract article fields from a fetched, parsed page.
 * NOTE(review): `html` is part of the declared input shape (and callers pass
 * it), but the current implementation destructures only `url` and `$`.
 */
export declare function extractArticle({ url, $, }: {
    /** Final URL of the page (after redirects). */
    url: string;
    /** Cheerio handle over the parsed document. */
    $: CheerioAPI;
    /** Raw HTML; currently unused by the implementation. */
    html: string;
}): {
    url: string;
    /** Best available title: JSON-LD headline, OG/Twitter meta, <h1>, <title>. */
    title: string;
    description: string;
    /** Deduplicated author names (capped at 10 by the meta/byline fallback). */
    authors: string[];
    /** Publish date string as found on the page, or null when absent. */
    publishedAt: string | null;
    image: string | null;
    /** og:site_name, falling back to the hostname without "www.". */
    siteName: string | null;
    /** Lower-cased primary language subtag, or null when undeterminable. */
    language: string | null;
    /** Cleaned article body text. */
    content: string;
    wordCount: number;
    /** Estimated reading time in minutes (minimum 1). */
    readingTimeMinutes: number;
    /** Deduplicated keywords (capped at 25). */
    keywords: string[];
};
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Clean article extraction: title, authors, date, content.
|
|
3
|
+
* JSON-LD first, then OG/meta, then readability heuristics.
|
|
4
|
+
*/
|
|
5
|
+
// Minimum character count for extracted content to be treated as the real
// article body; shorter matches fall through to the readability heuristic.
const MIN_CHARS = 280;
/**
 * Extract a clean article from a parsed page.
 *
 * Field resolution order: JSON-LD structured data first, then Open Graph /
 * Twitter / standard meta tags, then DOM heuristics.
 *
 * @param {{ url: string, $: import('cheerio').CheerioAPI, html?: string }} input
 *   `html` is accepted for interface compatibility but not used here.
 * @returns Article fields plus cleaned body text, word count, estimated
 *   reading time, and deduplicated keywords.
 */
export function extractArticle({ url, $, }) {
    const meta = extractMeta($);
    const jsonLd = extractJsonLd($);
    const title = jsonLd.headline ||
        meta.ogTitle ||
        meta.twitterTitle ||
        ($('h1').first().text() || '').trim() ||
        ($('title').first().text() || '').trim();
    const description = jsonLd.description ||
        meta.ogDescription ||
        meta.twitterDescription ||
        meta.description ||
        '';
    const authors = jsonLd.authors.length
        ? jsonLd.authors
        : extractAuthorsFromMeta($, meta);
    const publishedAt = jsonLd.datePublished ||
        meta.articlePublishedTime ||
        $('time[datetime]').first().attr('datetime') ||
        null;
    const image = jsonLd.image || meta.ogImage || meta.twitterImage || null;
    const siteName = meta.ogSiteName || extractDomain(url);
    // BUGFIX: og:locale values use an underscore separator ("en_US"), while
    // <html lang> uses a hyphen ("en-US"); splitting only on "-" produced
    // "en_us" for OG locales. Split on either so both yield "en".
    const language = ($('html').attr('lang') || meta.ogLocale || '').split(/[-_]/)[0].toLowerCase() ||
        null;
    let content = extractContentFromArticle($);
    if (!content || content.length < MIN_CHARS) {
        content = extractContentByReadability($);
    }
    content = cleanText(content);
    const words = content.split(/\s+/).filter(Boolean);
    const wordCount = words.length;
    // 220 wpm approximates average adult reading speed; floor at 1 minute.
    const readingTimeMinutes = Math.max(1, Math.round(wordCount / 220));
    return {
        url,
        title: cleanText(title),
        description: cleanText(description),
        authors,
        publishedAt,
        image,
        siteName,
        language,
        content,
        wordCount,
        readingTimeMinutes,
        keywords: extractKeywords($, meta, jsonLd),
    };
}
|
|
53
|
+
/**
 * Collect the subset of <meta> tags the extractor cares about into a flat
 * object. Later tags overwrite earlier ones for the same key, except
 * `article:tag`, which may repeat and accumulates into `articleTags`.
 */
function extractMeta($) {
    // name="..." attribute values → output keys
    const NAME_KEYS = {
        'description': 'description',
        'keywords': 'keywords',
        'author': 'author',
        'twitter:title': 'twitterTitle',
        'twitter:description': 'twitterDescription',
        'twitter:image': 'twitterImage',
    };
    // property="..." attribute values → output keys
    const PROPERTY_KEYS = {
        'og:title': 'ogTitle',
        'og:description': 'ogDescription',
        'og:image': 'ogImage',
        'og:site_name': 'ogSiteName',
        'og:locale': 'ogLocale',
        'article:published_time': 'articlePublishedTime',
        'article:modified_time': 'articleModifiedTime',
        'article:author': 'articleAuthor',
    };
    const meta = {};
    $('meta').each((_, el) => {
        const node = $(el);
        const content = node.attr('content') || '';
        if (!content)
            return;
        const name = (node.attr('name') || '').toLowerCase();
        const property = (node.attr('property') || '').toLowerCase();
        if (NAME_KEYS[name])
            meta[NAME_KEYS[name]] = content;
        if (PROPERTY_KEYS[property])
            meta[PROPERTY_KEYS[property]] = content;
        if (property === 'article:tag') {
            if (!Array.isArray(meta.articleTags))
                meta.articleTags = [];
            meta.articleTags.push(content);
        }
    });
    return meta;
}
|
|
97
|
+
/**
 * Scan all JSON-LD <script> blocks for Article-like entries and merge their
 * fields: first value wins for scalars; authors/keywords are deduped lists.
 * Blocks with malformed JSON are skipped silently.
 */
function extractJsonLd($) {
    const result = {
        headline: null,
        description: null,
        authors: [],
        datePublished: null,
        image: null,
        keywords: [],
    };
    $('script[type="application/ld+json"]').each((_, el) => {
        const raw = $(el).contents().text();
        if (!raw)
            return;
        let parsed;
        try {
            parsed = JSON.parse(raw);
        }
        catch {
            return; // invalid JSON-LD — ignore this block
        }
        const entries = Array.isArray(parsed) ? parsed : [parsed];
        for (const entry of entries) {
            collectJsonLdArticle(entry, result);
            // Some publishers nest everything under an @graph array.
            const graph = entry && typeof entry === 'object' ? entry['@graph'] : undefined;
            if (Array.isArray(graph)) {
                graph.forEach((node) => collectJsonLdArticle(node, result));
            }
        }
    });
    return result;
}
|
|
130
|
+
/**
 * Merge one JSON-LD node into the accumulator `out` if it describes an
 * article (NewsArticle, Article, BlogPosting, Report, or an @type array
 * containing an Article/Posting type). Scalar fields keep the first value
 * seen; authors and keywords accumulate with deduplication. Non-article
 * and non-object nodes are ignored.
 */
function collectJsonLdArticle(item, out) {
    if (!item || typeof item !== 'object')
        return;
    const node = item;
    const type = node['@type'];
    const articleLike = (t) => typeof t === 'string' && /Article|Posting/.test(t);
    const isArticle = ['NewsArticle', 'Article', 'BlogPosting', 'Report'].includes(type) ||
        (Array.isArray(type) && type.some(articleLike));
    if (!isArticle)
        return;
    // First non-empty string wins for each scalar field.
    const takeString = (field, value) => {
        if (!out[field] && typeof value === 'string')
            out[field] = value;
    };
    takeString('headline', node.headline);
    takeString('description', node.description);
    takeString('datePublished', node.datePublished);
    if (!out.image) {
        const img = node.image;
        if (typeof img === 'string') {
            out.image = img;
        }
        else if (Array.isArray(img)) {
            if (img.length) {
                const head = img[0];
                out.image = typeof head === 'string' ? head : head?.url || null;
            }
        }
        else if (img && typeof img === 'object') {
            out.image = img.url || null;
        }
    }
    // `author` may be a string, an object, or an array of either.
    for (const author of [].concat(node.author || []).flat()) {
        if (!author)
            continue;
        const name = typeof author === 'string' ? author : author?.name || null;
        if (name && !out.authors.includes(name))
            out.authors.push(name);
    }
    // `keywords` may be an array or a comma-separated string.
    if (node.keywords) {
        const list = Array.isArray(node.keywords)
            ? node.keywords
            : String(node.keywords).split(',').map((k) => k.trim());
        for (const kw of list) {
            if (kw && !out.keywords.includes(kw))
                out.keywords.push(kw);
        }
    }
}
|
|
177
|
+
/**
 * Fall back to meta tags and common byline DOM selectors for author names.
 * Results are deduplicated (insertion order), filtered of empties, and
 * capped at 10.
 */
function extractAuthorsFromMeta($, meta) {
    const found = new Set();
    for (const candidate of [meta.author, meta.articleAuthor]) {
        if (candidate)
            found.add(candidate);
    }
    $('[rel="author"], .author, .byline, .post-author').each((_, el) => {
        const text = $(el).text().trim();
        // Anything ~100 chars or longer is unlikely to be a byline.
        if (text && text.length < 100)
            found.add(text);
    });
    return Array.from(found).filter(Boolean).slice(0, 10);
}
|
|
190
|
+
/**
 * Locate the article body via common container selectors, after stripping
 * obvious noise (nav, ads, comments, paywall prompts, ...) from a cloned
 * DOM. Returns '' when no candidate yields at least MIN_CHARS characters
 * of paragraph text.
 */
function extractContentFromArticle($) {
    const NOISE = [
        'script', 'style', 'nav', 'header', 'footer', 'aside',
        '.ad', '.ads', '.advertisement', '.social', '.share', '.newsletter',
        '[class*="subscribe"]', '[class*="paywall"]', '[class*="related"]',
        '[class*="comment"]', '[class*="sidebar"]',
    ];
    // Candidate containers, most specific first.
    const CANDIDATES = [
        'article',
        '[itemprop="articleBody"]',
        '[role="main"] article',
        'main article',
        '.article-body',
        '.post-content',
        '.entry-content',
        '.story-body',
        'main',
    ];
    // Work on a clone so noise removal never mutates the caller's document.
    const root = $.root().clone();
    for (const sel of NOISE) {
        root.find(sel).remove();
    }
    for (const sel of CANDIDATES) {
        const container = root.find(sel).first();
        if (!container.length)
            continue;
        const text = container
            .find('p')
            .map((_, p) => $(p).text())
            .get()
            .join('\n\n')
            .trim();
        if (text.length >= MIN_CHARS)
            return text;
    }
    return '';
}
|
|
220
|
+
/**
 * Last-resort extraction: keep every <p> whose trimmed text is at least
 * 40 characters and join them with blank lines.
 */
function extractContentByReadability($) {
    const kept = [];
    $('p').each((_, el) => {
        const text = $(el).text().trim();
        if (text.length >= 40)
            kept.push(text);
    });
    return kept.join('\n\n');
}
|
|
229
|
+
/**
 * Merge keywords from the meta keywords tag, article:tag entries, and
 * JSON-LD, deduplicated in encounter order and capped at 25.
 * The `$` parameter is unused but kept for signature compatibility.
 */
function extractKeywords($, meta, jsonLd) {
    const unique = new Set();
    const fromMetaTag = meta.keywords
        ? String(meta.keywords).split(',').map((k) => k.trim())
        : [];
    for (const kw of fromMetaTag) {
        if (kw)
            unique.add(kw);
    }
    const tags = Array.isArray(meta.articleTags) ? meta.articleTags : [];
    tags.forEach((tag) => unique.add(tag));
    jsonLd.keywords.forEach((kw) => unique.add(kw));
    return Array.from(unique).slice(0, 25);
}
|
|
245
|
+
/**
 * Collapse whitespace runs to single spaces, drop stray spaces before
 * punctuation, and trim. Falsy input yields ''.
 */
function cleanText(text) {
    if (!text)
        return '';
    const collapsed = String(text).replace(/\s+/g, ' ');
    return collapsed.replace(/\s([.,;:!?])/g, '$1').trim();
}
|
|
253
|
+
/**
 * Hostname of a URL with any leading "www." removed, or null when the
 * string is not a parseable URL.
 */
function extractDomain(url) {
    let hostname;
    try {
        ({ hostname } = new URL(url));
    }
    catch {
        return null;
    }
    return hostname.replace(/^www\./, '');
}
|