webpeel 0.20.2 → 0.20.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (86)
  1. package/dist/server/app.d.ts +14 -0
  2. package/dist/server/app.js +384 -0
  3. package/dist/server/auth-store.d.ts +27 -0
  4. package/dist/server/auth-store.js +88 -0
  5. package/dist/server/email-service.d.ts +21 -0
  6. package/dist/server/email-service.js +79 -0
  7. package/dist/server/job-queue.d.ts +100 -0
  8. package/dist/server/job-queue.js +145 -0
  9. package/dist/server/logger.d.ts +10 -0
  10. package/dist/server/logger.js +37 -0
  11. package/dist/server/middleware/auth.d.ts +28 -0
  12. package/dist/server/middleware/auth.js +221 -0
  13. package/dist/server/middleware/rate-limit.d.ts +24 -0
  14. package/dist/server/middleware/rate-limit.js +167 -0
  15. package/dist/server/middleware/url-validator.d.ts +15 -0
  16. package/dist/server/middleware/url-validator.js +186 -0
  17. package/dist/server/openapi.yaml +6418 -0
  18. package/dist/server/pg-auth-store.d.ts +132 -0
  19. package/dist/server/pg-auth-store.js +472 -0
  20. package/dist/server/pg-job-queue.d.ts +59 -0
  21. package/dist/server/pg-job-queue.js +375 -0
  22. package/dist/server/premium/domain-intel.d.ts +16 -0
  23. package/dist/server/premium/domain-intel.js +133 -0
  24. package/dist/server/premium/index.d.ts +17 -0
  25. package/dist/server/premium/index.js +35 -0
  26. package/dist/server/premium/swr-cache.d.ts +14 -0
  27. package/dist/server/premium/swr-cache.js +34 -0
  28. package/dist/server/routes/activity.d.ts +6 -0
  29. package/dist/server/routes/activity.js +74 -0
  30. package/dist/server/routes/answer.d.ts +5 -0
  31. package/dist/server/routes/answer.js +125 -0
  32. package/dist/server/routes/ask.d.ts +28 -0
  33. package/dist/server/routes/ask.js +229 -0
  34. package/dist/server/routes/batch.d.ts +6 -0
  35. package/dist/server/routes/batch.js +493 -0
  36. package/dist/server/routes/cli-usage.d.ts +6 -0
  37. package/dist/server/routes/cli-usage.js +127 -0
  38. package/dist/server/routes/compat.d.ts +23 -0
  39. package/dist/server/routes/compat.js +652 -0
  40. package/dist/server/routes/deep-fetch.d.ts +8 -0
  41. package/dist/server/routes/deep-fetch.js +57 -0
  42. package/dist/server/routes/demo.d.ts +24 -0
  43. package/dist/server/routes/demo.js +517 -0
  44. package/dist/server/routes/do.d.ts +8 -0
  45. package/dist/server/routes/do.js +72 -0
  46. package/dist/server/routes/extract.d.ts +8 -0
  47. package/dist/server/routes/extract.js +235 -0
  48. package/dist/server/routes/fetch.d.ts +7 -0
  49. package/dist/server/routes/fetch.js +999 -0
  50. package/dist/server/routes/health.d.ts +7 -0
  51. package/dist/server/routes/health.js +19 -0
  52. package/dist/server/routes/jobs.d.ts +7 -0
  53. package/dist/server/routes/jobs.js +573 -0
  54. package/dist/server/routes/mcp.d.ts +14 -0
  55. package/dist/server/routes/mcp.js +141 -0
  56. package/dist/server/routes/oauth.d.ts +9 -0
  57. package/dist/server/routes/oauth.js +396 -0
  58. package/dist/server/routes/playground.d.ts +17 -0
  59. package/dist/server/routes/playground.js +283 -0
  60. package/dist/server/routes/screenshot.d.ts +22 -0
  61. package/dist/server/routes/screenshot.js +816 -0
  62. package/dist/server/routes/search.d.ts +6 -0
  63. package/dist/server/routes/search.js +303 -0
  64. package/dist/server/routes/session.d.ts +15 -0
  65. package/dist/server/routes/session.js +397 -0
  66. package/dist/server/routes/stats.d.ts +6 -0
  67. package/dist/server/routes/stats.js +71 -0
  68. package/dist/server/routes/stripe.d.ts +15 -0
  69. package/dist/server/routes/stripe.js +294 -0
  70. package/dist/server/routes/users.d.ts +8 -0
  71. package/dist/server/routes/users.js +1671 -0
  72. package/dist/server/routes/watch.d.ts +15 -0
  73. package/dist/server/routes/watch.js +309 -0
  74. package/dist/server/routes/webhooks.d.ts +26 -0
  75. package/dist/server/routes/webhooks.js +170 -0
  76. package/dist/server/routes/youtube.d.ts +6 -0
  77. package/dist/server/routes/youtube.js +130 -0
  78. package/dist/server/sentry.d.ts +13 -0
  79. package/dist/server/sentry.js +38 -0
  80. package/dist/server/types.d.ts +15 -0
  81. package/dist/server/types.js +7 -0
  82. package/dist/server/utils/response.d.ts +44 -0
  83. package/dist/server/utils/response.js +69 -0
  84. package/dist/server/utils/sse.d.ts +22 -0
  85. package/dist/server/utils/sse.js +38 -0
  86. package/package.json +2 -1
package/dist/server/routes/deep-fetch.js
@@ -0,0 +1,57 @@
+ /**
+  * POST /v1/deep-fetch
+  *
+  * Deep web intelligence endpoint: search + fetch + synthesize + structure.
+  * Body: { query, count?, format?, maxChars? }
+  */
+ import { Router } from 'express';
+ import { deepFetch } from '../../core/deep-fetch.js';
+ export function createDeepFetchRouter() {
+     const router = Router();
+     router.post('/v1/deep-fetch', async (req, res) => {
+         // Deprecation notice — prefer /v1/search?depth=deep
+         res.setHeader('X-Deprecated', 'true');
+         res.setHeader('X-Deprecated-Use', '/v1/search?depth=deep');
+         // AUTH: require authentication (global middleware sets req.auth)
+         const dfAuthId = req.auth?.keyInfo?.accountId || req.user?.userId;
+         if (!dfAuthId) {
+             res.status(401).json({ success: false, error: { type: 'authentication_required', message: 'API key required. Get one at https://app.webpeel.dev/keys', hint: 'Get a free API key at https://app.webpeel.dev/keys', docs: 'https://webpeel.dev/docs/errors#authentication_required' }, requestId: req.requestId });
+             return;
+         }
+         try {
+             const body = req.body;
+             const query = body.query;
+             if (!query || typeof query !== 'string' || !query.trim()) {
+                 res.status(400).json({ success: false, error: { type: 'bad_request', message: 'Missing required field: query', hint: 'Include a "query" string in the request body', docs: 'https://webpeel.dev/docs/errors#bad_request' }, requestId: req.requestId });
+                 return;
+             }
+             const options = {
+                 query: query.trim(),
+                 count: typeof body.count === 'number' ? Math.min(Math.max(body.count, 1), 10) : 5,
+                 format: ['merged', 'structured', 'comparison'].includes(body.format)
+                     ? body.format
+                     : 'merged',
+                 maxChars: typeof body.maxChars === 'number' ? body.maxChars : 32000,
+             };
+             const result = await deepFetch(options);
+             res.json({
+                 ...result,
+                 content: result.merged || '', // expose as `content` for consistency
+             });
+         }
+         catch (err) {
+             const message = err instanceof Error ? err.message : 'Unknown error';
+             console.error('[deep-fetch] error:', message);
+             res.status(500).json({
+                 success: false,
+                 error: {
+                     type: 'internal_error',
+                     message,
+                     docs: 'https://webpeel.dev/docs/errors#internal_error',
+                 },
+                 requestId: req.requestId,
+             });
+         }
+     });
+     return router;
+ }
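
For context, this route takes a JSON body and relies on the global auth middleware. Below is a minimal client sketch; the api.webpeel.dev base URL and the Bearer header shape are assumptions for illustration, while the path, body fields, defaults, and deprecation headers come from the code above.

// Hypothetical client for POST /v1/deep-fetch (base URL and auth scheme assumed).
const res = await fetch('https://api.webpeel.dev/v1/deep-fetch', {
    method: 'POST',
    headers: {
        'Authorization': `Bearer ${process.env.WEBPEEL_API_KEY}`, // assumed header shape
        'Content-Type': 'application/json',
    },
    body: JSON.stringify({
        query: 'find Stripe fees',
        count: 5,          // clamped server-side to the 1..10 range, default 5
        format: 'merged',  // one of: merged | structured | comparison
        maxChars: 32000,   // default 32000
    }),
});
console.log(res.headers.get('X-Deprecated-Use')); // '/v1/search?depth=deep'
const data = await res.json();
console.log(data.content); // result.merged, exposed as `content`
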
package/dist/server/routes/demo.d.ts
@@ -0,0 +1,24 @@
+ /**
+  * Demo endpoint — GET /v1/demo?url=<encoded_url>
+  *
+  * Unauthenticated endpoint for the WebPeel landing page hero demo.
+  * Returns a truncated fetch result for allowed domains only.
+  *
+  * Security:
+  * - Domain allowlist (no arbitrary URLs)
+  * - Separate rate limiter: 3 req/min, 30 req/day per IP
+  * - SSRF validation via validateUrl()
+  * - HTTP-only fetch (no Puppeteer/browser rendering)
+  * - 5s timeout, 3000 char content truncation
+  * - CORS: only webpeel.dev + localhost
+  * - In-memory cache: 10 min per URL
+  */
+ import { Router } from 'express';
+ import { RateLimiter } from '../middleware/rate-limit.js';
+ export interface DemoRouterOptions {
+     /** Inject custom per-minute rate limiter (useful for testing) */
+     perMinute?: RateLimiter;
+     /** Inject custom per-day rate limiter (useful for testing) */
+     perDay?: RateLimiter;
+ }
+ export declare function createDemoRouter(options?: DemoRouterOptions): Router;
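
The options interface exists so tests can inject limiters with short windows instead of the module-level 60 s and 24 h ones. A sketch of that wiring, assuming the RateLimiter constructor takes a window in milliseconds as it does in demo.js:

// Hypothetical test wiring: replace the default windows with tiny ones.
import express from 'express';
import { createDemoRouter } from './demo.js';
import { RateLimiter } from '../middleware/rate-limit.js';

const app = express();
app.use(createDemoRouter({
    perMinute: new RateLimiter(100),  // 100 ms window instead of 60_000 ms
    perDay: new RateLimiter(500),     // 500 ms window instead of 24 h
}));
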
package/dist/server/routes/demo.js
@@ -0,0 +1,517 @@
+ /**
+  * Demo endpoint — GET /v1/demo?url=<encoded_url>
+  *
+  * Unauthenticated endpoint for the WebPeel landing page hero demo.
+  * Returns a truncated fetch result for allowed domains only.
+  *
+  * Security:
+  * - Domain allowlist (no arbitrary URLs)
+  * - Separate rate limiter: 3 req/min, 30 req/day per IP
+  * - SSRF validation via validateUrl()
+  * - HTTP-only fetch (no Puppeteer/browser rendering)
+  * - 5s timeout, 3000 char content truncation
+  * - CORS: only webpeel.dev + localhost
+  * - In-memory cache: 10 min per URL
+  */
+ import { Router } from 'express';
+ import crypto from 'crypto';
+ import { RateLimiter } from '../middleware/rate-limit.js';
+ import { simpleFetch } from '../../core/http-fetch.js';
+ import { validateUrl } from '../../core/http-fetch.js';
+ import { htmlToMarkdown, detectMainContent, countRemovedElements } from '../../core/markdown.js';
+ import { extractMetadata } from '../../core/metadata.js';
+ // ── Domain allowlist ──────────────────────────────────────────────────────────
+ const ALLOWED_DOMAINS = new Set([
+     'stripe.com',
+     'wikipedia.org',
+     'en.wikipedia.org',
+     'news.ycombinator.com',
+     'github.com',
+     'reddit.com',
+     'www.reddit.com',
+     'bbc.com',
+     'www.bbc.com',
+     'nytimes.com',
+     'www.nytimes.com',
+     'techcrunch.com',
+     'arxiv.org',
+     'stackoverflow.com',
+     'producthunt.com',
+     'www.producthunt.com',
+     'theverge.com',
+     'www.theverge.com',
+     'arstechnica.com',
+     'www.arstechnica.com',
+     'docs.python.org',
+     'developer.mozilla.org',
+ ]);
+ // ── Rate limiters (demo-specific, separate from main API) ─────────────────────
+ // 3 requests per minute per IP
+ const perMinuteLimiter = new RateLimiter(60_000);
+ // 30 requests per day per IP
+ const perDayLimiter = new RateLimiter(24 * 60 * 60 * 1000);
+ // Cleanup every 10 minutes
+ setInterval(() => {
+     perMinuteLimiter.cleanup();
+     perDayLimiter.cleanup();
+ }, 10 * 60 * 1000);
+ const CACHE_TTL_MS = 10 * 60 * 1000; // 10 minutes
+ const demoCache = new Map();
+ const MAX_CONTENT_LENGTH = 3000;
+ const FETCH_TIMEOUT_MS = 5000;
+ const SIGN_UP_URL = 'https://app.webpeel.dev';
+ // ── Wikipedia REST API headers (per Wikimedia User-Agent policy) ──────────────
+ const WIKI_HEADERS = {
+     'User-Agent': 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me) Node.js',
+     'Api-User-Agent': 'WebPeel/0.17.1 (https://webpeel.dev; jake@jakeliu.me)',
+     'Cache-Control': 'no-cache',
+     'If-None-Match': '', // Prevent 304 responses from Wikipedia REST API
+ };
+ // ── Helper: strip HTML tags and decode common entities ────────────────────────
+ // Uses a quote-aware regex to handle `>` inside attribute values (e.g. data-mw='{"type":"..."}')
+ function stripHtmlTags(str) {
+     return str
+         // Remove tags, handling quoted attribute values containing >
+         .replace(/<(?:[^>"']|"[^"]*"|'[^']*')*>/g, '')
+         .replace(/&amp;/g, '&')
+         .replace(/&lt;/g, '<')
+         .replace(/&gt;/g, '>')
+         .replace(/&quot;/g, '"')
+         .replace(/&#39;/g, "'")
+         .replace(/&nbsp;/g, ' ')
+         .trim();
+ }
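
The quote-aware tag regex above is the subtle part: a naive /<[^>]*>/ stops at the first > even when it sits inside a quoted attribute value, which Wikipedia's data-mw JSON attributes hit constantly. A quick illustration on a made-up snippet:

// Hypothetical input: a `>` inside a quoted attribute value.
const sample = '<a data-mw=\'{"type":"a>b"}\' href="/wiki/X">link</a> text';
sample.replace(/<[^>]*>/g, '');                        // leaves 'b"}\' href="/wiki/X">link text' (broken)
sample.replace(/<(?:[^>"']|"[^"]*"|'[^']*')*>/g, '');  // leaves 'link text' (correct)
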
+ // ── Wikipedia-specific content cleaner (mirrors domain-extractors.ts) ─────────
+ function cleanWikipediaContent(content) {
+     return content
+         // Remove [edit] links
+         .replace(/\[edit\]/gi, '')
+         // Remove citation brackets [1], [2], etc.
+         .replace(/\[\d+\]/g, '')
+         // Remove [citation needed], [verification], etc.
+         .replace(/\[(citation needed|verification|improve this article|adding citations[^\]]*|when\?|where\?|who\?|clarification needed|dubious[^\]]*|failed verification[^\]]*|unreliable source[^\]]*)\]/gi, '')
+         // Remove [Learn how and when to remove this message]
+         .replace(/\[Learn how and when to remove this message\]/gi, '')
+         // Clean up excess whitespace
+         .replace(/\n{3,}/g, '\n\n')
+         .trim();
+ }
+ // ── General post-processing for all demo content ──────────────────────────────
+ function cleanDemoContent(content) {
+     return content
+         // Remove empty markdown links: [](/path "tooltip") or [ ](...)
+         .replace(/\[(?:\s*)\]\([^)]*\)/g, '')
+         // Remove Wikipedia boilerplate
+         .replace(/From Wikipedia, the free encyclopedia\s*/gi, '')
+         // Remove redirect notices
+         .replace(/"[^"]*" (?:and "[^"]*" )?redirect(?:s)? here\.\s*(?:For[^.]*\.\s*)?/gi, '')
+         // Remove [edit] links
+         .replace(/\[edit\]/gi, '')
+         // Remove citation brackets
+         .replace(/\[\d+\]/g, '')
+         // Remove stray JSON attribute value artifacts from HTML parsing (e.g. "}"> )
+         .replace(/^["}'>]+\s*$/gm, '')
+         // Clean up excess whitespace
+         .replace(/\n{3,}/g, '\n\n')
+         .trim();
+ }
+ // ── Wikipedia REST API fetcher ────────────────────────────────────────────────
+ async function fetchWikipediaContent(url) {
+     try {
+         const urlObj = new URL(url);
+         const pathParts = urlObj.pathname.split('/').filter(Boolean);
+         // Only handle article pages: /wiki/Article_Title
+         if (pathParts[0] !== 'wiki' || pathParts.length < 2)
+             return null;
+         const articleTitle = decodeURIComponent(pathParts[1]);
+         // Skip special pages (contain a colon, e.g. Special:Random, Talk:Article)
+         if (articleTitle.includes(':'))
+             return null;
+         const lang = urlObj.hostname.split('.')[0] || 'en';
+         // Fetch summary for title/description
+         const summaryUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/summary/${encodeURIComponent(articleTitle)}`;
+         const summaryResult = await simpleFetch(summaryUrl, undefined, 8000, {
+             ...WIKI_HEADERS,
+             'Accept': 'application/json',
+         });
+         let summaryData = null;
+         try {
+             summaryData = JSON.parse(summaryResult.html || '');
+         }
+         catch {
+             summaryData = null;
+         }
+         if (!summaryData || summaryData.type === 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found') {
+             return null;
+         }
+         const articleTitleClean = summaryData.title || articleTitle.replace(/_/g, ' ');
+         const description = summaryData.description || '';
+         // Fetch full content via mobile-html
+         let fullContent = '';
+         try {
+             const mobileUrl = `https://${lang}.wikipedia.org/api/rest_v1/page/mobile-html/${encodeURIComponent(articleTitle)}`;
+             const mobileResult = await simpleFetch(mobileUrl, undefined, 15000, {
+                 ...WIKI_HEADERS,
+                 'Accept': 'text/html',
+             });
+             if (mobileResult?.html) {
+                 const sectionMatches = mobileResult.html.match(/<section[^>]*>([\s\S]*?)<\/section>/gi) || [];
+                 for (const section of sectionMatches) {
+                     // Extract section heading
+                     const headingMatch = section.match(/<h[2-6][^>]*id="([^"]*)"[^>]*class="[^"]*pcs-edit-section-title[^"]*"[^>]*>([\s\S]*?)<\/h[2-6]>/i);
+                     const heading = headingMatch ? stripHtmlTags(headingMatch[2]).trim() : '';
+                     // Extract paragraphs
+                     const paragraphs = section.match(/<p[^>]*>([\s\S]*?)<\/p>/gi) || [];
+                     const sectionText = paragraphs
+                         .map((p) => stripHtmlTags(p).trim())
+                         .filter((t) => t.length > 0)
+                         .join('\n\n');
+                     if (sectionText) {
+                         const prefix = heading ? `## ${heading}\n\n` : '';
+                         fullContent += `\n\n${prefix}${sectionText}`;
+                     }
+                 }
+             }
+         }
+         catch {
+             // mobile-html failed — fall back to summary extract
+             fullContent = summaryData.extract || '';
+         }
+         // Clean Wikipedia noise
+         fullContent = cleanWikipediaContent(fullContent);
+         const cleanContent = `# ${articleTitleClean}\n\n${description ? `*${description}*\n\n` : ''}${fullContent || summaryData.extract || ''}`;
+         return cleanContent;
+     }
+     catch {
+         return null;
+     }
+ }
+ // ── CORS helper ───────────────────────────────────────────────────────────────
+ function setCorsHeaders(req, res) {
+     const origin = req.headers.origin || '';
+     // Allow webpeel.dev or any localhost:* origin
+     if (origin === 'https://webpeel.dev' ||
+         /^http:\/\/localhost(:\d+)?$/.test(origin) ||
+         /^http:\/\/127\.0\.0\.1(:\d+)?$/.test(origin)) {
+         res.setHeader('Access-Control-Allow-Origin', origin);
+         res.setHeader('Vary', 'Origin');
+     }
+     res.setHeader('Access-Control-Allow-Methods', 'GET, OPTIONS');
+     res.setHeader('Access-Control-Allow-Headers', 'Content-Type');
+ }
+ // ── IP extraction ─────────────────────────────────────────────────────────────
+ function getClientIp(req) {
+     const forwardedFor = req.headers['x-forwarded-for'];
+     const firstForwardedIp = typeof forwardedFor === 'string'
+         ? forwardedFor.split(',')[0].trim()
+         : Array.isArray(forwardedFor) ? forwardedFor[0] : undefined;
+     return req.headers['cf-connecting-ip']
+         || firstForwardedIp
+         || req.headers['x-real-ip']
+         || req.ip
+         || 'unknown';
+ }
+ export function createDemoRouter(options = {}) {
+     const minuteLimiter = options.perMinute ?? perMinuteLimiter;
+     const dayLimiter = options.perDay ?? perDayLimiter;
+     const router = Router();
+     // Handle CORS preflight
+     router.options('/v1/demo', (req, res) => {
+         setCorsHeaders(req, res);
+         res.status(204).end();
+     });
+     router.get('/v1/demo', async (req, res) => {
+         // Always set CORS headers
+         setCorsHeaders(req, res);
+         try {
+             // ── 1. Validate URL parameter ────────────────────────────────────────────
+             const { url } = req.query;
+             if (!url || typeof url !== 'string') {
+                 res.status(400).json({
+                     success: false,
+                     error: {
+                         type: 'missing_url',
+                         message: 'Missing required query parameter: url',
+                         hint: 'Pass a URL: GET /v1/demo?url=https://example.com',
+                         docs: 'https://webpeel.dev/docs/errors#missing-url',
+                     },
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             if (url.length > 2048) {
+                 res.status(400).json({
+                     success: false,
+                     error: {
+                         type: 'invalid_url',
+                         message: 'URL too long (max 2048 characters)',
+                         hint: 'Shorten the URL to under 2048 characters.',
+                         docs: 'https://webpeel.dev/docs/errors#invalid-url',
+                     },
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             // Parse URL to extract hostname
+             let parsedUrl;
+             try {
+                 parsedUrl = new URL(url);
+             }
+             catch {
+                 res.status(400).json({
+                     success: false,
+                     error: {
+                         type: 'invalid_url',
+                         message: 'Invalid URL format',
+                         hint: 'Ensure the URL is well-formed: https://example.com',
+                         docs: 'https://webpeel.dev/docs/errors#invalid-url',
+                     },
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             if (!['http:', 'https:'].includes(parsedUrl.protocol)) {
+                 res.status(400).json({
+                     success: false,
+                     error: {
+                         type: 'invalid_url',
+                         message: 'Only HTTP and HTTPS URLs are allowed',
+                         hint: 'Ensure the URL starts with http:// or https://',
+                         docs: 'https://webpeel.dev/docs/errors#invalid-url',
+                     },
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             // ── 2. Domain allowlist check ────────────────────────────────────────────
+             const hostname = parsedUrl.hostname.toLowerCase();
+             if (!ALLOWED_DOMAINS.has(hostname)) {
+                 res.status(403).json({
+                     success: false,
+                     error: {
+                         type: 'domain_not_allowed',
+                         message: 'Domain not allowed for demo. Sign up for full API access.',
+                         hint: `Sign up at ${SIGN_UP_URL} for unrestricted access.`,
+                         docs: 'https://webpeel.dev/docs/errors#domain-not-allowed',
+                     },
+                     signUpUrl: SIGN_UP_URL,
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             // ── 3. SSRF validation ───────────────────────────────────────────────────
+             try {
+                 validateUrl(url);
+             }
+             catch {
+                 res.status(400).json({
+                     success: false,
+                     error: {
+                         type: 'url_blocked',
+                         message: 'URL blocked for security reasons',
+                         hint: 'Internal and private network URLs are not allowed.',
+                         docs: 'https://webpeel.dev/docs/errors#url-blocked',
+                     },
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             // ── 4. Rate limiting ─────────────────────────────────────────────────────
+             const clientIp = getClientIp(req);
+             const minuteResult = minuteLimiter.checkLimit(clientIp, 3);
+             if (!minuteResult.allowed) {
+                 res.setHeader('Retry-After', String(minuteResult.retryAfter || 60));
+                 res.setHeader('X-RateLimit-Limit', '3');
+                 res.setHeader('X-RateLimit-Remaining', '0');
+                 res.status(429).json({
+                     success: false,
+                     error: {
+                         type: 'rate_limit_exceeded',
+                         message: 'Rate limit exceeded. Demo allows 3 requests per minute.',
+                         hint: `Sign up at ${SIGN_UP_URL} for higher rate limits.`,
+                         docs: 'https://webpeel.dev/docs/errors#rate-limit-exceeded',
+                     },
+                     retryAfter: minuteResult.retryAfter,
+                     signUpUrl: SIGN_UP_URL,
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             const dayResult = dayLimiter.checkLimit(clientIp, 30);
+             if (!dayResult.allowed) {
+                 res.setHeader('Retry-After', String(dayResult.retryAfter || 86400));
+                 res.setHeader('X-RateLimit-Limit', '30');
+                 res.setHeader('X-RateLimit-Remaining', '0');
+                 res.status(429).json({
+                     success: false,
+                     error: {
+                         type: 'daily_rate_limit_exceeded',
+                         message: 'Daily rate limit exceeded. Demo allows 30 requests per day.',
+                         hint: `Sign up at ${SIGN_UP_URL} for higher rate limits.`,
+                         docs: 'https://webpeel.dev/docs/errors#daily-rate-limit-exceeded',
+                     },
+                     retryAfter: dayResult.retryAfter,
+                     signUpUrl: SIGN_UP_URL,
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             // ── 5. Cache lookup ──────────────────────────────────────────────────────
+             const cacheKey = url;
+             const cached = demoCache.get(cacheKey);
+             if (cached && Date.now() - cached.timestamp < CACHE_TTL_MS) {
+                 res.setHeader('X-Cache', 'HIT');
+                 res.json(cached.result);
+                 return;
+             }
+             // ── 6. Fetch the page (HTTP-only, 5s timeout) ────────────────────────────
+             const startTime = Date.now();
+             let fetchResult;
+             try {
+                 const abortController = new AbortController();
+                 const timeoutHandle = setTimeout(() => abortController.abort(), FETCH_TIMEOUT_MS);
+                 try {
+                     fetchResult = await simpleFetch(url, undefined, // default user agent
+                         FETCH_TIMEOUT_MS, undefined, // no custom headers
+                         abortController.signal);
+                 }
+                 finally {
+                     clearTimeout(timeoutHandle);
+                 }
+             }
+             catch (err) {
+                 const msg = err?.message || 'Failed to fetch URL';
+                 res.status(502).json({
+                     success: false,
+                     error: {
+                         type: 'fetch_failed',
+                         message: `Fetch failed: ${msg.replace(/[<>"']/g, '')}`,
+                         hint: 'Check that the URL is publicly accessible.',
+                         docs: 'https://webpeel.dev/docs/errors#fetch-failed',
+                     },
+                     requestId: req.requestId || crypto.randomUUID(),
+                 });
+                 return;
+             }
+             const fetchTimeMs = Date.now() - startTime;
+             // ── 7. Extract title and content ─────────────────────────────────────────
+             const html = fetchResult.html || '';
+             // Count what will be removed BEFORE cleaning runs
+             let cleaningStats;
+             try {
+                 cleaningStats = countRemovedElements(html);
+             }
+             catch {
+                 cleaningStats = {
+                     scripts: 0, styles: 0, ads: 0, tracking: 0,
+                     navigation: 0, socialWidgets: 0, popups: 0, totalRemoved: 0,
+                     originalSizeBytes: Buffer.byteLength(html, 'utf8'),
+                     cleanedSizeBytes: 0, reductionPercent: 0,
+                 };
+             }
+             // Extract title from metadata
+             let title = '';
+             try {
+                 const meta = extractMetadata(html, url);
+                 title = meta.title || '';
+             }
+             catch {
+                 title = '';
+             }
+             // Extract main content and convert to markdown
+             // For Wikipedia URLs: use the REST API for clean structured content
+             let markdownContent = '';
+             const isWikipedia = /(?:^|\.)wikipedia\.org$/.test(parsedUrl.hostname.toLowerCase());
+             if (isWikipedia) {
+                 try {
+                     const wikiContent = await fetchWikipediaContent(url);
+                     if (wikiContent) {
+                         markdownContent = wikiContent;
+                     }
+                 }
+                 catch {
+                     markdownContent = '';
+                 }
+             }
+             // Fall back to generic HTML→markdown pipeline if Wikipedia fetch failed/N/A
+             if (!markdownContent) {
+                 try {
+                     const detected = detectMainContent(html);
+                     const contentHtml = detected.html || html;
+                     markdownContent = htmlToMarkdown(contentHtml, { prune: true });
+                 }
+                 catch {
+                     markdownContent = '';
+                 }
+             }
+             // Apply general post-processing to remove common noise artifacts
+             markdownContent = cleanDemoContent(markdownContent);
+             // Finalize cleaning stats now that we have the cleaned content size
+             const cleanedSizeBytes = Buffer.byteLength(markdownContent, 'utf8');
+             const originalSizeBytes = cleaningStats.originalSizeBytes;
+             const reductionPercent = originalSizeBytes > 0
+                 ? Math.round(((originalSizeBytes - cleanedSizeBytes) / originalSizeBytes) * 100)
+                 : 0;
+             const cleaned = {
+                 scripts: cleaningStats.scripts,
+                 styles: cleaningStats.styles,
+                 ads: cleaningStats.ads,
+                 tracking: cleaningStats.tracking,
+                 navigation: cleaningStats.navigation,
+                 socialWidgets: cleaningStats.socialWidgets,
+                 popups: cleaningStats.popups,
+                 totalRemoved: cleaningStats.totalRemoved,
+                 originalSizeKB: Math.round(originalSizeBytes / 1024 * 10) / 10,
+                 cleanedSizeKB: Math.round(cleanedSizeBytes / 1024 * 10) / 10,
+                 reductionPercent: Math.max(0, Math.min(100, reductionPercent)),
+             };
+             // ── 7b. Token savings metrics ────────────────────────────────────────────
+             const rawTokens = Math.round(html.length / 4);
+             const cleanTokens = Math.round(markdownContent.length / 4);
+             const savingsPercent = rawTokens > 0
+                 ? Math.max(0, Math.round((1 - cleanTokens / rawTokens) * 100))
+                 : 0;
+             const tokenEstimate = {
+                 raw: rawTokens,
+                 clean: cleanTokens,
+                 savings: savingsPercent,
+             };
+             // ── 8. Truncate content ──────────────────────────────────────────────────
+             const truncated = markdownContent.length > MAX_CONTENT_LENGTH;
+             const content = truncated
+                 ? markdownContent.slice(0, MAX_CONTENT_LENGTH)
+                 : markdownContent;
+             // Count words in the truncated content
+             const wordCount = content.split(/\s+/).filter(Boolean).length;
+             // ── 9. Build response and cache ──────────────────────────────────────────
+             const response = {
+                 url: fetchResult.url || url,
+                 title,
+                 content,
+                 wordCount,
+                 fetchTimeMs,
+                 truncated,
+                 demo: true,
+                 signUpUrl: SIGN_UP_URL,
+                 cleaned,
+                 tokenEstimate,
+             };
+             demoCache.set(cacheKey, { result: response, timestamp: Date.now() });
+             res.setHeader('X-Cache', 'MISS');
+             res.json(response);
+         }
+         catch (error) {
+             console.error('Demo endpoint error:', error);
+             res.status(500).json({
+                 success: false,
+                 error: {
+                     type: 'internal_error',
+                     message: 'Internal server error',
+                     docs: 'https://webpeel.dev/docs/errors#internal-error',
+                 },
+                 requestId: req.requestId || crypto.randomUUID(),
+             });
+         }
+     });
+     return router;
+ }
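
The demo route needs no API key, only an allowlisted URL and an accepted origin. A sketch of a browser-side call; the api.webpeel.dev host is an assumption, while the query parameter, status codes, headers, and response fields come from the handler above:

// Hypothetical call from the landing page (origin must be webpeel.dev or localhost).
const target = encodeURIComponent('https://en.wikipedia.org/wiki/HTTP');
const res = await fetch(`https://api.webpeel.dev/v1/demo?url=${target}`);
if (res.status === 429) {
    // 3 req/min or 30 req/day exceeded; the handler sets Retry-After.
    console.log('retry after', res.headers.get('Retry-After'), 's');
} else {
    const data = await res.json();
    console.log(res.headers.get('X-Cache'));                  // 'HIT' within the 10 min TTL, else 'MISS'
    console.log(data.title, data.wordCount, data.truncated);  // content capped at 3000 chars
    console.log(data.tokenEstimate);                          // { raw, clean, savings }, a chars/4 heuristic
}
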
package/dist/server/routes/do.d.ts
@@ -0,0 +1,8 @@
+ /**
+  * /v1/do — Intent-based endpoint.
+  * One endpoint that understands natural language and routes internally.
+  * POST /v1/do { task: "find Stripe fees" }
+  * GET /v1/do?task=find+Stripe+fees
+  */
+ import { Router } from 'express';
+ export declare function createDoRouter(): Router;
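
Only the declaration ships in this hunk, but the doc comment fixes both call shapes. A sketch of each, with the base URL assumed and the response shape not shown in this diff:

// Hypothetical calls for the intent endpoint, both forms from the doc comment.
await fetch('https://api.webpeel.dev/v1/do', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ task: 'find Stripe fees' }),
});
await fetch('https://api.webpeel.dev/v1/do?task=' + encodeURIComponent('find Stripe fees'));
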