webpeel 0.20.2 → 0.20.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/server/app.d.ts +14 -0
- package/dist/server/app.js +384 -0
- package/dist/server/auth-store.d.ts +27 -0
- package/dist/server/auth-store.js +88 -0
- package/dist/server/email-service.d.ts +21 -0
- package/dist/server/email-service.js +79 -0
- package/dist/server/job-queue.d.ts +100 -0
- package/dist/server/job-queue.js +145 -0
- package/dist/server/logger.d.ts +10 -0
- package/dist/server/logger.js +37 -0
- package/dist/server/middleware/auth.d.ts +28 -0
- package/dist/server/middleware/auth.js +221 -0
- package/dist/server/middleware/rate-limit.d.ts +24 -0
- package/dist/server/middleware/rate-limit.js +167 -0
- package/dist/server/middleware/url-validator.d.ts +15 -0
- package/dist/server/middleware/url-validator.js +186 -0
- package/dist/server/openapi.yaml +6418 -0
- package/dist/server/pg-auth-store.d.ts +132 -0
- package/dist/server/pg-auth-store.js +472 -0
- package/dist/server/pg-job-queue.d.ts +59 -0
- package/dist/server/pg-job-queue.js +375 -0
- package/dist/server/premium/domain-intel.d.ts +16 -0
- package/dist/server/premium/domain-intel.js +133 -0
- package/dist/server/premium/index.d.ts +17 -0
- package/dist/server/premium/index.js +35 -0
- package/dist/server/premium/swr-cache.d.ts +14 -0
- package/dist/server/premium/swr-cache.js +34 -0
- package/dist/server/routes/activity.d.ts +6 -0
- package/dist/server/routes/activity.js +74 -0
- package/dist/server/routes/answer.d.ts +5 -0
- package/dist/server/routes/answer.js +125 -0
- package/dist/server/routes/ask.d.ts +28 -0
- package/dist/server/routes/ask.js +229 -0
- package/dist/server/routes/batch.d.ts +6 -0
- package/dist/server/routes/batch.js +493 -0
- package/dist/server/routes/cli-usage.d.ts +6 -0
- package/dist/server/routes/cli-usage.js +127 -0
- package/dist/server/routes/compat.d.ts +23 -0
- package/dist/server/routes/compat.js +652 -0
- package/dist/server/routes/deep-fetch.d.ts +8 -0
- package/dist/server/routes/deep-fetch.js +57 -0
- package/dist/server/routes/demo.d.ts +24 -0
- package/dist/server/routes/demo.js +517 -0
- package/dist/server/routes/do.d.ts +8 -0
- package/dist/server/routes/do.js +72 -0
- package/dist/server/routes/extract.d.ts +8 -0
- package/dist/server/routes/extract.js +235 -0
- package/dist/server/routes/fetch.d.ts +7 -0
- package/dist/server/routes/fetch.js +999 -0
- package/dist/server/routes/health.d.ts +7 -0
- package/dist/server/routes/health.js +19 -0
- package/dist/server/routes/jobs.d.ts +7 -0
- package/dist/server/routes/jobs.js +573 -0
- package/dist/server/routes/mcp.d.ts +14 -0
- package/dist/server/routes/mcp.js +141 -0
- package/dist/server/routes/oauth.d.ts +9 -0
- package/dist/server/routes/oauth.js +396 -0
- package/dist/server/routes/playground.d.ts +17 -0
- package/dist/server/routes/playground.js +283 -0
- package/dist/server/routes/screenshot.d.ts +22 -0
- package/dist/server/routes/screenshot.js +816 -0
- package/dist/server/routes/search.d.ts +6 -0
- package/dist/server/routes/search.js +303 -0
- package/dist/server/routes/session.d.ts +15 -0
- package/dist/server/routes/session.js +397 -0
- package/dist/server/routes/stats.d.ts +6 -0
- package/dist/server/routes/stats.js +71 -0
- package/dist/server/routes/stripe.d.ts +15 -0
- package/dist/server/routes/stripe.js +294 -0
- package/dist/server/routes/users.d.ts +8 -0
- package/dist/server/routes/users.js +1671 -0
- package/dist/server/routes/watch.d.ts +15 -0
- package/dist/server/routes/watch.js +309 -0
- package/dist/server/routes/webhooks.d.ts +26 -0
- package/dist/server/routes/webhooks.js +170 -0
- package/dist/server/routes/youtube.d.ts +6 -0
- package/dist/server/routes/youtube.js +130 -0
- package/dist/server/sentry.d.ts +13 -0
- package/dist/server/sentry.js +38 -0
- package/dist/server/types.d.ts +15 -0
- package/dist/server/types.js +7 -0
- package/dist/server/utils/response.d.ts +44 -0
- package/dist/server/utils/response.js +69 -0
- package/dist/server/utils/sse.d.ts +22 -0
- package/dist/server/utils/sse.js +38 -0
- package/package.json +2 -1
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* POST /v1/deep-fetch
|
|
3
|
+
*
|
|
4
|
+
* Deep web intelligence endpoint: search + fetch + synthesize + structure.
|
|
5
|
+
* Body: { query, count?, format?, maxChars? }
|
|
6
|
+
*/
|
|
7
|
+
import { Router } from 'express';
|
|
8
|
+
import { deepFetch } from '../../core/deep-fetch.js';
|
|
9
|
+
/**
 * Builds the Express router for the deprecated POST /v1/deep-fetch endpoint.
 *
 * The handler:
 *  - always advertises the deprecation via X-Deprecated / X-Deprecated-Use,
 *  - answers 401 unless an account id or user id is present on the request,
 *  - answers 400 when `query` is missing, not a string, or blank,
 *  - normalises the optional fields and forwards them to deepFetch(),
 *  - answers 500 with the error message if deepFetch() throws.
 *
 * @returns {import('express').Router} router with the single POST route attached
 */
export function createDeepFetchRouter() {
    const router = Router();
    router.post('/v1/deep-fetch', async (req, res) => {
        // Deprecation notice — prefer /v1/search?depth=deep
        res.setHeader('X-Deprecated', 'true');
        res.setHeader('X-Deprecated-Use', '/v1/search?depth=deep');
        // AUTH: require authentication (global middleware sets req.auth)
        const dfAuthId = req.auth?.keyInfo?.accountId || req.user?.userId;
        if (!dfAuthId) {
            res.status(401).json({ success: false, error: { type: 'authentication_required', message: 'API key required. Get one at https://app.webpeel.dev/keys', hint: 'Get a free API key at https://app.webpeel.dev/keys', docs: 'https://webpeel.dev/docs/errors#authentication_required' }, requestId: req.requestId });
            return;
        }
        try {
            // req.body can be undefined/null (no body sent, or no parser matched the
            // content type); treat that as "no fields" so the caller gets a 400
            // instead of a TypeError surfacing as a 500.
            const body = req.body ?? {};
            const query = body.query;
            if (!query || typeof query !== 'string' || !query.trim()) {
                res.status(400).json({ success: false, error: { type: 'bad_request', message: 'Missing required field: query', hint: 'Include a "query" string in the request body', docs: 'https://webpeel.dev/docs/errors#bad_request' }, requestId: req.requestId });
                return;
            }
            // `typeof x === 'number'` also admits NaN/Infinity/fractions (JSON such as
            // 1e400 parses to Infinity), so validate with Number.isFinite and coerce
            // to whole numbers before handing the values to deepFetch().
            const count = Number.isFinite(body.count)
                ? Math.min(Math.max(Math.trunc(body.count), 1), 10)
                : 5;
            const maxChars = Number.isFinite(body.maxChars) && body.maxChars >= 1
                ? Math.floor(body.maxChars)
                : 32000;
            const format = ['merged', 'structured', 'comparison'].includes(body.format)
                ? body.format
                : 'merged';
            const options = {
                query: query.trim(),
                count,
                format,
                maxChars,
            };
            const result = await deepFetch(options);
            res.json({
                ...result,
                content: result.merged || '', // expose as `content` for consistency
            });
        }
        catch (err) {
            const message = err instanceof Error ? err.message : 'Unknown error';
            console.error('[deep-fetch] error:', message);
            res.status(500).json({
                success: false,
                error: {
                    type: 'internal_error',
                    message,
                    docs: 'https://webpeel.dev/docs/errors#internal_error',
                },
                requestId: req.requestId,
            });
        }
    });
    return router;
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Demo endpoint — GET /v1/demo?url=<encoded_url>
|
|
3
|
+
*
|
|
4
|
+
* Unauthenticated endpoint for the WebPeel landing page hero demo.
|
|
5
|
+
* Returns a truncated fetch result for allowed domains only.
|
|
6
|
+
*
|
|
7
|
+
* Security:
|
|
8
|
+
* - Domain allowlist (no arbitrary URLs)
|
|
9
|
+
* - Separate rate limiter: 3 req/min, 30 req/day per IP
|
|
10
|
+
* - SSRF validation via validateUrl()
|
|
11
|
+
* - HTTP-only fetch (no Puppeteer/browser rendering)
|
|
12
|
+
* - 5s timeout, 3000 char content truncation
|
|
13
|
+
* - CORS: only webpeel.dev + localhost
|
|
14
|
+
* - In-memory cache: 10 min per URL
|
|
15
|
+
*/
|
|
16
|
+
import { Router } from 'express';
|
|
17
|
+
import { RateLimiter } from '../middleware/rate-limit.js';
|
|
18
|
+
export interface DemoRouterOptions {
|
|
19
|
+
/** Per-minute rate limiter used instead of the module's default instance (useful for testing) */
|
|
20
|
+
perMinute?: RateLimiter;
|
|
21
|
+
/** Per-day rate limiter used instead of the module's default instance (useful for testing) */
|
|
22
|
+
perDay?: RateLimiter;
|
|
23
|
+
}
|
|
24
|
+
/** Creates the router serving GET /v1/demo and its CORS preflight; `options` may be omitted. */
export declare function createDemoRouter(options?: DemoRouterOptions): Router;
|
|
@@ -0,0 +1,517 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Demo endpoint — GET /v1/demo?url=<encoded_url>
|
|
3
|
+
*
|
|
4
|
+
* Unauthenticated endpoint for the WebPeel landing page hero demo.
|
|
5
|
+
* Returns a truncated fetch result for allowed domains only.
|
|
6
|
+
*
|
|
7
|
+
* Security:
|
|
8
|
+
* - Domain allowlist (no arbitrary URLs)
|
|
9
|
+
* - Separate rate limiter: 3 req/min, 30 req/day per IP
|
|
10
|
+
* - SSRF validation via validateUrl()
|
|
11
|
+
* - HTTP-only fetch (no Puppeteer/browser rendering)
|
|
12
|
+
* - 5s timeout, 3000 char content truncation
|
|
13
|
+
* - CORS: only webpeel.dev + localhost
|
|
14
|
+
* - In-memory cache: 10 min per URL
|
|
15
|
+
*/
|
|
16
|
+
import { Router } from 'express';
|
|
17
|
+
import crypto from 'crypto';
|
|
18
|
+
import { RateLimiter } from '../middleware/rate-limit.js';
|
|
19
|
+
import { simpleFetch } from '../../core/http-fetch.js';
|
|
20
|
+
import { validateUrl } from '../../core/http-fetch.js';
|
|
21
|
+
import { htmlToMarkdown, detectMainContent, countRemovedElements } from '../../core/markdown.js';
|
|
22
|
+
import { extractMetadata } from '../../core/metadata.js';
|
|
23
|
+
// ── Domain allowlist ──────────────────────────────────────────────────────────
// Exact hostnames the demo endpoint may fetch. Matching is by exact string, so a
// `www.` variant is only accepted when it is listed explicitly.
const ALLOWED_DOMAINS = new Set([
    // Reference / documentation
    'stripe.com',
    'wikipedia.org', 'en.wikipedia.org',
    // Developer communities
    'news.ycombinator.com',
    'github.com',
    'reddit.com', 'www.reddit.com',
    // News
    'bbc.com', 'www.bbc.com',
    'nytimes.com', 'www.nytimes.com',
    'techcrunch.com',
    // Research / Q&A / product discovery
    'arxiv.org',
    'stackoverflow.com',
    'producthunt.com', 'www.producthunt.com',
    // Tech press
    'theverge.com', 'www.theverge.com',
    'arstechnica.com', 'www.arstechnica.com',
    // Language / platform docs
    'docs.python.org',
    'developer.mozilla.org',
]);
|
|
48
|
+
// ── Rate limiters (demo-specific, separate from main API) ─────────────────────
// Only the window length is configured here; the per-window request caps
// (3 per minute, 30 per day) are passed to checkLimit() by the route handler.
// 3 requests per minute per IP
const perMinuteLimiter = new RateLimiter(60_000);
// 30 requests per day per IP
const perDayLimiter = new RateLimiter(24 * 60 * 60 * 1000);
const CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes
// url -> { result, timestamp }; entries older than CACHE_TTL_MS are stale.
const demoCache = new Map();
const MAX_CONTENT_LENGTH = 3000;
const FETCH_TIMEOUT_MS = 5000;
const SIGN_UP_URL = 'https://app.webpeel.dev';
/**
 * Drops every cache entry whose age has reached CACHE_TTL_MS.
 *
 * Stale entries are otherwise only ever overwritten, never removed, so the map
 * would grow without bound as distinct URLs are requested.
 *
 * @param {number} [now] - current time in ms since epoch (injectable for tests)
 */
function purgeExpiredDemoCache(now = Date.now()) {
    for (const [key, entry] of demoCache) {
        if (now - entry.timestamp >= CACHE_TTL_MS) {
            demoCache.delete(key);
        }
    }
}
// Cleanup every 10 minutes
const demoCleanupTimer = setInterval(() => {
    perMinuteLimiter.cleanup();
    perDayLimiter.cleanup();
    purgeExpiredDemoCache();
}, 10 * 60 * 1000);
// Housekeeping must not be the only thing keeping the process alive (e.g. when
// this module is merely imported by a CLI command or a test).
demoCleanupTimer.unref?.();
|
|
63
|
+
// ── Wikipedia REST API headers (per Wikimedia User-Agent policy) ──────────────
// Client identification string shared by both user-agent headers.
// NOTE(review): the version embedded here is hard-coded and may lag behind the
// published package version — confirm whether it should track package.json.
const WIKI_CLIENT_ID = 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me)';
const WIKI_HEADERS = {
    'User-Agent': `${WIKI_CLIENT_ID} Node.js`,
    'Api-User-Agent': WIKI_CLIENT_ID,
    'Cache-Control': 'no-cache',
    // Prevent 304 responses from Wikipedia REST API
    'If-None-Match': '',
};
|
|
70
|
+
// ── Helper: strip HTML tags and decode common entities ────────────────────────
/**
 * Removes HTML tags from a fragment and decodes a small set of character
 * entities (&lt; &gt; &quot; &#39; &nbsp; &amp;), then trims the result.
 *
 * The tag pattern is quote-aware so that a `>` inside an attribute value
 * (e.g. data-mw='{"type":"..."}') does not end the tag early.
 *
 * @param {string} str - HTML fragment
 * @returns {string} plain text
 */
function stripHtmlTags(str) {
    return str
        // Remove tags, handling quoted attribute values containing >
        .replace(/<(?:[^>"']|"[^"]*"|'[^']*')*>/g, '')
        .replace(/&lt;/g, '<')
        .replace(/&gt;/g, '>')
        .replace(/&quot;/g, '"')
        .replace(/&#39;/g, "'")
        .replace(/&nbsp;/g, ' ')
        // Decode &amp; last so an escaped entity such as "&amp;lt;" becomes the
        // literal text "&lt;" rather than being decoded a second time into "<".
        .replace(/&amp;/g, '&')
        .trim();
}
|
|
84
|
+
// ── Wikipedia-specific content cleaner (mirrors domain-extractors.ts) ─────────
/**
 * Strips Wikipedia editorial markers from article text.
 *
 * Each pattern below is deleted in order; afterwards runs of three or more
 * newlines are collapsed to a blank line and the result is trimmed.
 *
 * @param {string} content - extracted article text
 * @returns {string} text without edit links, citation numbers or maintenance tags
 */
function cleanWikipediaContent(content) {
    const noisePatterns = [
        // [edit] links
        /\[edit\]/gi,
        // Citation brackets [1], [2], etc.
        /\[\d+\]/g,
        // [citation needed], [verification], etc.
        /\[(citation needed|verification|improve this article|adding citations[^\]]*|when\?|where\?|who\?|clarification needed|dubious[^\]]*|failed verification[^\]]*|unreliable source[^\]]*)\]/gi,
        // [Learn how and when to remove this message]
        /\[Learn how and when to remove this message\]/gi,
    ];
    let text = content;
    for (const pattern of noisePatterns) {
        text = text.replace(pattern, '');
    }
    // Clean up excess whitespace
    const collapsed = text.replace(/\n{3,}/g, '\n\n');
    return collapsed.trim();
}
|
|
99
|
+
// ── General post-processing for all demo content ──────────────────────────────
/**
 * Applies the demo's generic noise filters to markdown content.
 *
 * The substitutions run strictly in the order listed, then the text is trimmed.
 *
 * @param {string} content - markdown produced by the extraction pipeline
 * @returns {string} cleaned markdown
 */
function cleanDemoContent(content) {
    const substitutions = [
        // Empty markdown links: [](/path "tooltip") or [ ](...)
        [/\[(?:\s*)\]\([^)]*\)/g, ''],
        // Wikipedia boilerplate
        [/From Wikipedia, the free encyclopedia\s*/gi, ''],
        // Redirect notices
        [/"[^"]*" (?:and "[^"]*" )?redirect(?:s)? here\.\s*(?:For[^.]*\.\s*)?/gi, ''],
        // [edit] links
        [/\[edit\]/gi, ''],
        // Citation brackets
        [/\[\d+\]/g, ''],
        // Stray JSON attribute value artifacts from HTML parsing (e.g. "}"> )
        [/^["}'>]+\s*$/gm, ''],
        // Excess whitespace
        [/\n{3,}/g, '\n\n'],
    ];
    const cleaned = substitutions.reduce(
        (text, [pattern, replacement]) => text.replace(pattern, replacement),
        content
    );
    return cleaned.trim();
}
|
|
118
|
+
// ── Wikipedia REST API fetcher ────────────────────────────────────────────────
/**
 * Builds a markdown rendering of a Wikipedia article through the Wikimedia REST
 * API (page/summary for title and description, page/mobile-html for the body).
 *
 * Returns null — so the caller can fall back to the generic pipeline — when the
 * URL is not a /wiki/<title> article page, when the title contains a colon
 * (Special:, Talk:, ...), when the summary cannot be parsed or reports
 * not_found, or when anything else throws.
 *
 * @param {string} url - Wikipedia article URL
 * @returns {Promise<string|null>} markdown ("# Title", optional "*description*", body) or null
 */
async function fetchWikipediaContent(url) {
    try {
        const urlObj = new URL(url);
        const pathParts = urlObj.pathname.split('/').filter(Boolean);
        // Only handle article pages: /wiki/Article_Title
        if (pathParts[0] !== 'wiki' || pathParts.length < 2)
            return null;
        // Titles may themselves contain "/" (e.g. /wiki/AC/DC), so keep every
        // segment after "wiki" rather than only the first one.
        const articleTitle = decodeURIComponent(pathParts.slice(1).join('/'));
        // Skip special pages (contain a colon, e.g. Special:Random, Talk:Article)
        if (articleTitle.includes(':'))
            return null;
        // Language edition = leading subdomain. The bare domain and the generic
        // "www"/"m" prefixes carry no language, so they map to English instead
        // of producing hosts such as wikipedia.wikipedia.org.
        const hostLabels = urlObj.hostname.toLowerCase().split('.');
        const subdomain = hostLabels.length > 2 ? hostLabels[0] : '';
        const lang = subdomain && subdomain !== 'www' && subdomain !== 'm' ? subdomain : 'en';
        const encodedTitle = encodeURIComponent(articleTitle);
        // Fetch summary for title/description
        const summaryUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodedTitle}`;
        const summaryResult = await simpleFetch(summaryUrl, undefined, 8000, {
            ...WIKI_HEADERS,
            'Accept': 'application/json',
        });
        let summaryData = null;
        try {
            summaryData = JSON.parse(summaryResult.html || '');
        }
        catch {
            // Unparseable summary is treated the same as a missing article.
            summaryData = null;
        }
        if (!summaryData || summaryData.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found') {
            return null;
        }
        const articleTitleClean = summaryData.title || articleTitle.replace(/_/g, ' ');
        const description = summaryData.description || '';
        // Fetch full content via mobile-html
        let fullContent = '';
        try {
            const mobileUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodedTitle}`;
            const mobileResult = await simpleFetch(mobileUrl, undefined, 15000, {
                ...WIKI_HEADERS,
                'Accept': 'text/html',
            });
            if (mobileResult?.html) {
                const sectionMatches = mobileResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
                for (const section of sectionMatches) {
                    // Extract section heading
                    const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
                    const heading = headingMatch ? stripHtmlTags(headingMatch[2]).trim() : '';
                    // Extract paragraphs
                    const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
                    const sectionText = paragraphs
                        .map((p) => stripHtmlTags(p).trim())
                        .filter((t) => t.length > 0)
                        .join('\n\n');
                    if (sectionText) {
                        const prefix = heading ? `## ${heading}\n\n` : '';
                        fullContent += `\n\n${prefix}${sectionText}`;
                    }
                }
            }
        }
        catch {
            // mobile-html failed — fall back to summary extract
            fullContent = summaryData.extract || '';
        }
        // Clean Wikipedia noise
        fullContent = cleanWikipediaContent(fullContent);
        // An empty body (no sections/paragraphs found) also falls back to the extract.
        const body = fullContent || summaryData.extract || '';
        const descriptionLine = description ? `*${description}*\n\n` : '';
        return `# ${articleTitleClean}\n\n${descriptionLine}${body}`;
    }
    catch {
        // Best effort: any failure here just means "no Wikipedia-specific content".
        return null;
    }
}
|
|
189
|
+
// ── CORS helper ───────────────────────────────────────────────────────────────
/**
 * Sets the demo endpoint's CORS response headers.
 *
 * Access-Control-Allow-Origin echoes the request's Origin only for
 * https://webpeel.dev and http://localhost / http://127.0.0.1 (any port).
 * `Vary: Origin` is added on every response — including those for disallowed
 * or absent origins — because the headers differ per Origin; without it a
 * shared cache could replay a response lacking CORS headers to an allowed
 * origin. Any Vary value already on the response is preserved.
 *
 * @param {{ headers: Record<string, string | undefined> }} req
 * @param {{ setHeader: Function, getHeader?: Function }} res
 */
function setCorsHeaders(req, res) {
    const origin = req.headers.origin || '';
    // Allow webpeel.dev or any localhost:* origin
    const originAllowed = origin === 'https://webpeel.dev' ||
        /^http:\/\/(?:localhost|127\.0\.0\.1)(?::\d+)?$/.test(origin);
    if (originAllowed) {
        res.setHeader('Access-Control-Allow-Origin', origin);
    }
    const priorVary = typeof res.getHeader === 'function' ? res.getHeader('Vary') : undefined;
    const priorVaryText = Array.isArray(priorVary)
        ? priorVary.join(', ')
        : (priorVary ? String(priorVary) : '');
    const alreadyVariesOnOrigin = priorVaryText
        .split(',')
        .map((token) => token.trim().toLowerCase())
        .some((token) => token === 'origin' || token === '*');
    if (!alreadyVariesOnOrigin) {
        res.setHeader('Vary', priorVaryText ? `${priorVaryText}, Origin` : 'Origin');
    }
    res.setHeader('Access-Control-Allow-Methods', 'GET, OPTIONS');
    res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
}
|
|
202
|
+
// ── IP extraction ─────────────────────────────────────────────────────────────
/**
 * Picks the client address used as the rate-limit key.
 *
 * Precedence: CF-Connecting-IP header, first entry of X-Forwarded-For,
 * X-Real-IP header, req.ip, and finally the literal 'unknown'.
 *
 * NOTE(review): every header consulted here is client-controllable unless a
 * trusted proxy overwrites it — confirm the deployment strips or sets them,
 * otherwise the demo rate limits can be sidestepped by varying the header.
 *
 * @param {{ headers: object, ip?: string }} req
 * @returns {string} best-effort client address
 */
function getClientIp(req) {
    const { headers } = req;
    const cloudflareIp = headers['cf-connecting-ip'];
    if (cloudflareIp) {
        return cloudflareIp;
    }
    const forwardedFor = headers['x-forwarded-for'];
    let firstHop;
    if (typeof forwardedFor === 'string') {
        firstHop = forwardedFor.split(',')[0].trim();
    }
    else if (Array.isArray(forwardedFor)) {
        firstHop = forwardedFor[0];
    }
    if (firstHop) {
        return firstHop;
    }
    return headers['x-real-ip'] || req.ip || 'unknown';
}
|
|
214
|
+
/**
 * Sends the demo endpoint's JSON error envelope:
 * { success: false, error, ...extra, requestId }.
 *
 * @param {object} req - request; req.requestId is reused when present
 * @param {object} res - response
 * @param {number} status - HTTP status code
 * @param {object} error - error descriptor (type, message, optional hint, docs)
 * @param {object} [extra] - additional top-level fields placed before requestId
 */
function sendDemoError(req, res, status, error, extra = {}) {
    res.status(status).json({
        success: false,
        error,
        ...extra,
        requestId: req.requestId || crypto.randomUUID(),
    });
}
/**
 * Cuts text to at most maxLength UTF-16 code units without leaving a lone high
 * surrogate at the end (which would corrupt an emoji or other astral character).
 *
 * @param {string} text
 * @param {number} maxLength
 * @returns {string}
 */
function truncateDemoContent(text, maxLength) {
    if (text.length <= maxLength) {
        return text;
    }
    const lastUnit = text.charCodeAt(maxLength - 1);
    const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
    return text.slice(0, endsOnHighSurrogate ? maxLength - 1 : maxLength);
}
/**
 * Creates the router for the unauthenticated landing-page demo.
 *
 * Routes:
 *  - OPTIONS /v1/demo — CORS preflight, answers 204.
 *  - GET /v1/demo?url=… — validates the URL (presence, length ≤ 2048, parseable,
 *    http/https), checks the hostname against ALLOWED_DOMAINS, runs validateUrl(),
 *    applies the per-minute (3) and per-day (30) limits keyed by client IP, serves
 *    from the in-memory cache when fresh, otherwise fetches the page, converts it
 *    to markdown, truncates it, caches and returns the result.
 *
 * Rate limits are checked before the cache lookup, so cache hits count towards
 * the limits as well.
 *
 * @param {{ perMinute?: object, perDay?: object }} [options] - limiter overrides
 *   (objects exposing checkLimit(key, max)); module defaults are used when absent
 * @returns {import('express').Router}
 */
export function createDemoRouter(options = {}) {
    const minuteLimiter = options.perMinute ?? perMinuteLimiter;
    const dayLimiter = options.perDay ?? perDayLimiter;
    const router = Router();
    // Handle CORS preflight
    router.options('/v1/demo', (req, res) => {
        setCorsHeaders(req, res);
        res.status(204).end();
    });
    router.get('/v1/demo', async (req, res) => {
        // Always set CORS headers
        setCorsHeaders(req, res);
        try {
            // ── 1. Validate URL parameter ────────────────────────────────────────────
            const { url } = req.query;
            if (!url || typeof url !== 'string') {
                sendDemoError(req, res, 400, {
                    type: 'missing_url',
                    message: 'Missing required query parameter: url',
                    hint: 'Pass a URL: GET /v1/demo?url=https://example.com',
                    docs: 'https://webpeel.dev/docs/errors#missing-url',
                });
                return;
            }
            if (url.length > 2048) {
                sendDemoError(req, res, 400, {
                    type: 'invalid_url',
                    message: 'URL too long (max 2048 characters)',
                    hint: 'Shorten the URL to under 2048 characters.',
                    docs: 'https://webpeel.dev/docs/errors#invalid-url',
                });
                return;
            }
            // Parse URL to extract hostname
            let parsedUrl;
            try {
                parsedUrl = new URL(url);
            }
            catch {
                sendDemoError(req, res, 400, {
                    type: 'invalid_url',
                    message: 'Invalid URL format',
                    hint: 'Ensure the URL is well-formed: https://example.com',
                    docs: 'https://webpeel.dev/docs/errors#invalid-url',
                });
                return;
            }
            if (!['http:', 'https:'].includes(parsedUrl.protocol)) {
                sendDemoError(req, res, 400, {
                    type: 'invalid_url',
                    message: 'Only HTTP and HTTPS URLs are allowed',
                    hint: 'Ensure the URL starts with http:// or https://',
                    docs: 'https://webpeel.dev/docs/errors#invalid-url',
                });
                return;
            }
            // ── 2. Domain allowlist check ────────────────────────────────────────────
            const hostname = parsedUrl.hostname.toLowerCase();
            if (!ALLOWED_DOMAINS.has(hostname)) {
                sendDemoError(req, res, 403, {
                    type: 'domain_not_allowed',
                    message: 'Domain not allowed for demo. Sign up for full API access.',
                    hint: `Sign up at ${SIGN_UP_URL} for unrestricted access.`,
                    docs: 'https://webpeel.dev/docs/errors#domain-not-allowed',
                }, { signUpUrl: SIGN_UP_URL });
                return;
            }
            // ── 3. SSRF validation ───────────────────────────────────────────────────
            try {
                validateUrl(url);
            }
            catch {
                sendDemoError(req, res, 400, {
                    type: 'url_blocked',
                    message: 'URL blocked for security reasons',
                    hint: 'Internal and private network URLs are not allowed.',
                    docs: 'https://webpeel.dev/docs/errors#url-blocked',
                });
                return;
            }
            // ── 4. Rate limiting ─────────────────────────────────────────────────────
            const clientIp = getClientIp(req);
            const minuteResult = minuteLimiter.checkLimit(clientIp, 3);
            if (!minuteResult.allowed) {
                res.setHeader('Retry-After', String(minuteResult.retryAfter || 60));
                res.setHeader('X-RateLimit-Limit', '3');
                res.setHeader('X-RateLimit-Remaining', '0');
                sendDemoError(req, res, 429, {
                    type: 'rate_limit_exceeded',
                    message: 'Rate limit exceeded. Demo allows 3 requests per minute.',
                    hint: `Sign up at ${SIGN_UP_URL} for higher rate limits.`,
                    docs: 'https://webpeel.dev/docs/errors#rate-limit-exceeded',
                }, { retryAfter: minuteResult.retryAfter, signUpUrl: SIGN_UP_URL });
                return;
            }
            const dayResult = dayLimiter.checkLimit(clientIp, 30);
            if (!dayResult.allowed) {
                res.setHeader('Retry-After', String(dayResult.retryAfter || 86400));
                res.setHeader('X-RateLimit-Limit', '30');
                res.setHeader('X-RateLimit-Remaining', '0');
                sendDemoError(req, res, 429, {
                    type: 'daily_rate_limit_exceeded',
                    message: 'Daily rate limit exceeded. Demo allows 30 requests per day.',
                    hint: `Sign up at ${SIGN_UP_URL} for higher rate limits.`,
                    docs: 'https://webpeel.dev/docs/errors#daily-rate-limit-exceeded',
                }, { retryAfter: dayResult.retryAfter, signUpUrl: SIGN_UP_URL });
                return;
            }
            // ── 5. Cache lookup ──────────────────────────────────────────────────────
            const cacheKey = url;
            const cached = demoCache.get(cacheKey);
            if (cached && Date.now() - cached.timestamp < CACHE_TTL_MS) {
                res.setHeader('X-Cache', 'HIT');
                res.json(cached.result);
                return;
            }
            // ── 6. Fetch the page (HTTP-only, 5s timeout) ────────────────────────────
            const startTime = Date.now();
            let fetchResult;
            const abortController = new AbortController();
            const timeoutHandle = setTimeout(() => abortController.abort(), FETCH_TIMEOUT_MS);
            try {
                fetchResult = await simpleFetch(url, undefined, // default user agent
                FETCH_TIMEOUT_MS, undefined, // no custom headers
                abortController.signal);
            }
            catch (err) {
                // String() guards against a non-string `message` on a thrown value.
                const msg = String(err?.message || 'Failed to fetch URL');
                sendDemoError(req, res, 502, {
                    type: 'fetch_failed',
                    message: `Fetch failed: ${msg.replace(/[<>"']/g, '')}`,
                    hint: 'Check that the URL is publicly accessible.',
                    docs: 'https://webpeel.dev/docs/errors#fetch-failed',
                });
                return;
            }
            finally {
                clearTimeout(timeoutHandle);
            }
            const fetchTimeMs = Date.now() - startTime;
            // ── 7. Extract title and content ─────────────────────────────────────────
            const html = fetchResult.html || '';
            // Count what will be removed BEFORE cleaning runs
            let cleaningStats;
            try {
                cleaningStats = countRemovedElements(html);
            }
            catch {
                // Stats are cosmetic; fall back to zeros rather than failing the request.
                cleaningStats = {
                    scripts: 0, styles: 0, ads: 0, tracking: 0,
                    navigation: 0, socialWidgets: 0, popups: 0, totalRemoved: 0,
                    originalSizeBytes: Buffer.byteLength(html, 'utf8'),
                    cleanedSizeBytes: 0, reductionPercent: 0,
                };
            }
            // Extract title from metadata
            let title = '';
            try {
                const meta = extractMetadata(html, url);
                title = meta.title || '';
            }
            catch {
                title = '';
            }
            // Extract main content and convert to markdown
            // For Wikipedia URLs: use the REST API for clean structured content
            let markdownContent = '';
            const isWikipedia = /(?:^|\.)wikipedia\.org$/.test(hostname);
            if (isWikipedia) {
                try {
                    const wikiContent = await fetchWikipediaContent(url);
                    if (wikiContent) {
                        markdownContent = wikiContent;
                    }
                }
                catch {
                    markdownContent = '';
                }
            }
            // Fall back to generic HTML→markdown pipeline if Wikipedia fetch failed/N/A
            if (!markdownContent) {
                try {
                    const detected = detectMainContent(html);
                    const contentHtml = detected.html || html;
                    markdownContent = htmlToMarkdown(contentHtml, { prune: true });
                }
                catch {
                    markdownContent = '';
                }
            }
            // Apply general post-processing to remove common noise artifacts
            markdownContent = cleanDemoContent(markdownContent);
            // Finalize cleaning stats now that we have the cleaned content size
            const cleanedSizeBytes = Buffer.byteLength(markdownContent, 'utf8');
            const originalSizeBytes = cleaningStats.originalSizeBytes;
            const reductionPercent = originalSizeBytes > 0
                ? Math.round(((originalSizeBytes - cleanedSizeBytes) / originalSizeBytes) * 100)
                : 0;
            const cleaned = {
                scripts: cleaningStats.scripts,
                styles: cleaningStats.styles,
                ads: cleaningStats.ads,
                tracking: cleaningStats.tracking,
                navigation: cleaningStats.navigation,
                socialWidgets: cleaningStats.socialWidgets,
                popups: cleaningStats.popups,
                totalRemoved: cleaningStats.totalRemoved,
                originalSizeKB: Math.round(originalSizeBytes / 1024 * 10) / 10,
                cleanedSizeKB: Math.round(cleanedSizeBytes / 1024 * 10) / 10,
                reductionPercent: Math.max(0, Math.min(100, reductionPercent)),
            };
            // ── 7b. Token savings metrics ────────────────────────────────────────────
            // Rough estimate: ~4 characters per token.
            const rawTokens = Math.round(html.length / 4);
            const cleanTokens = Math.round(markdownContent.length / 4);
            const savingsPercent = rawTokens > 0
                ? Math.max(0, Math.round((1 - cleanTokens / rawTokens) * 100))
                : 0;
            const tokenEstimate = {
                raw: rawTokens,
                clean: cleanTokens,
                savings: savingsPercent,
            };
            // ── 8. Truncate content ──────────────────────────────────────────────────
            const truncated = markdownContent.length > MAX_CONTENT_LENGTH;
            const content = truncateDemoContent(markdownContent, MAX_CONTENT_LENGTH);
            // Count words in the truncated content
            const wordCount = content.split(/\s+/).filter(Boolean).length;
            // ── 9. Build response and cache ──────────────────────────────────────────
            const response = {
                url: fetchResult.url || url,
                title,
                content,
                wordCount,
                fetchTimeMs,
                truncated,
                demo: true,
                signUpUrl: SIGN_UP_URL,
                cleaned,
                tokenEstimate,
            };
            demoCache.set(cacheKey, { result: response, timestamp: Date.now() });
            res.setHeader('X-Cache', 'MISS');
            res.json(response);
        }
        catch (error) {
            console.error('Demo endpoint error:', error);
            // If the failure happened while/after sending, a second write would throw.
            if (res.headersSent) {
                return;
            }
            sendDemoError(req, res, 500, {
                type: 'internal_error',
                message: 'Internal server error',
                docs: 'https://webpeel.dev/docs/errors#internal-error',
            });
        }
    });
    return router;
}
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* /v1/do — Intent-based endpoint.
|
|
3
|
+
* One endpoint that understands natural language and routes internally.
|
|
4
|
+
* POST /v1/do { task: "find Stripe fees" }
|
|
5
|
+
* GET /v1/do?task=find+Stripe+fees
|
|
6
|
+
*/
|
|
7
|
+
import { Router } from 'express';
|
|
8
|
+
/** Factory for the /v1/do router; takes no arguments and returns an Express Router. */
export declare function createDoRouter(): Router;
|