trustsource 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,455 @@
1
+ import { Router, Request, Response } from "express";
2
+ import dns from "dns/promises";
3
+
4
+ const router = Router();
5
+
6
+ // ─── Constants ────────────────────────────────────────────────────────────────
7
+
8
/**
 * Shape check for DNS hostnames: 3–253 characters, alphanumeric at both ends,
 * with letters, digits, hyphens and dots in between.
 */
const VALID_DOMAIN_RE = /^[a-zA-Z0-9][a-zA-Z0-9\-.]{1,251}[a-zA-Z0-9]$/;

/**
 * Prefix match for IPv4 literals that must never be fetched:
 * 0.0.0.0/8, 10/8, 100.64/10 (carrier-grade NAT), 127/8, 169.254/16 (link-local),
 * 172.16/12 and 192.168/16.
 */
const PRIVATE_IPV4_RE =
  /^(0\.|10\.|100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\.|127\.|169\.254\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.)/;

/**
 * Match for IPv6 literals that must never be fetched. An optional leading "["
 * is tolerated because `URL.hostname` returns IPv6 literals in bracketed form.
 * Covers "::" and "::1", IPv4-mapped addresses (::ffff:…, refused wholesale
 * because they can wrap a private IPv4 address), unique-local fc00::/7 and
 * link-local fe80::/10.
 */
const PRIVATE_IPV6_RE = /^\[?(::1?\]?$|::ffff:|f[cd][0-9a-f]{2}:|fe[89ab][0-9a-f]:)/i;

/** Ports a target URL may use; "" is what `URL.port` reports for the scheme default. */
const ALLOWED_PORTS = new Set(["", "80", "443", "8080", "8443"]);

/** Per-request time budget in milliseconds. */
const FETCH_TIMEOUT = 8000;

/**
 * robots.txt body cap — 100 KB. RFC 9309 asks crawlers to be able to parse at
 * least 500 KiB; this service deliberately reads less to bound memory per request.
 */
const MAX_BODY_BYTES = 100 * 1024;
15
+
16
+ // ─── Known AI/LLM training bots (Spring 2026 list) ────────────────────────────
17
+ // Tracking these gives agents a quick read on whether a site permits AI crawling.
18
+
19
// User-agent tokens of known AI/LLM crawlers, one row per operator.
// Order is preserved because per-bot policies are reported in this order.
const AI_BOTS = [
  // OpenAI: training crawler, live ChatGPT browsing, search index
  "GPTBot", "ChatGPT-User", "OAI-SearchBot",
  // Anthropic: general crawler, legacy bot, Claude web access
  "ClaudeBot", "anthropic-ai", "Claude-Web",
  // Google: AI-training opt-out token (separate from Googlebot)
  "Google-Extended",
  // Meta
  "FacebookBot", "Meta-ExternalAgent",
  // Other major AI crawlers (Bytespider is ByteDance / TikTok)
  "PerplexityBot", "YouBot", "cohere-ai", "Bytespider",
  "Diffbot", "Omgilibot", "Applebot-Extended", "ImagesiftBot", "Amazonbot",
  // Microsoft (also feeds Copilot) and Common Crawl (training data for many models)
  "Bingbot", "CCBot",
  // Aggregators / catch-all flags
  "AI2Bot", "Timpibot", "magpie-crawler", "SemrushBot-OCOB",
];
56
+
57
+ // ─── Cache (12 hour TTL — robots.txt changes infrequently) ────────────────────
58
+
59
/** A cached response payload together with its absolute expiry time (ms since epoch). */
interface CacheEntry {
  data: Record<string, unknown>;
  expiresAt: number;
}

/** Entry lifetime: 12 hours. */
const CACHE_TTL_MS = 12 * 60 * 60 * 1000;

/** Upper bound on the number of entries kept in memory. */
const CACHE_MAX_ENTRIES = 1000;

/** In-memory cache; Map iteration order doubles as the eviction order (oldest first). */
const cache = new Map<string, CacheEntry>();

/**
 * Looks up a cached payload.
 *
 * @param key - Cache key.
 * @returns The stored payload, or `null` when the key is absent or its entry
 *          has expired (expired entries are removed on access).
 */
function getCached(key: string): Record<string, unknown> | null {
  const entry = cache.get(key);
  if (!entry) return null;
  if (Date.now() > entry.expiresAt) {
    cache.delete(key);
    return null;
  }
  return entry.data;
}

/**
 * Stores a payload under `key` with a fresh TTL, evicting the oldest entry
 * when the cache grows past its size limit.
 *
 * @param key - Cache key.
 * @param data - Payload to store.
 */
function setCached(key: string, data: Record<string, unknown>): void {
  // Map.set on an existing key keeps its original insertion position; delete
  // first so a refreshed entry moves to the back of the eviction order instead
  // of being the next one thrown away.
  cache.delete(key);
  cache.set(key, { data, expiresAt: Date.now() + CACHE_TTL_MS });

  if (cache.size > CACHE_MAX_ENTRIES) {
    const oldestKey = cache.keys().next().value;
    // Compare against undefined explicitly so a falsy key ("") can still be evicted.
    if (oldestKey !== undefined) cache.delete(oldestKey);
  }
}
79
+
80
+ // ─── Domain validation + SSRF protection (same pattern as headers.ts) ────────
81
+
82
/**
 * Parses a bare domain or an http(s) URL and returns its normalized hostname,
 * rejecting targets that must not be fetched.
 *
 * Normalization: lowercased, leading "www." removed. IPv6 literals keep their
 * brackets in the returned value so it can be interpolated into a URL.
 *
 * @param input - A domain ("example.com") or URL ("https://example.com/path").
 * @returns `{ domain }` on success, or `{ error }` with a human-readable reason.
 */
function extractAndValidateDomain(input: string): { domain: string } | { error: string } {
  const hasHttpScheme = /^https?:\/\//i.test(input);
  // An explicit non-http scheme would otherwise get "https://" prepended and be
  // parsed as a host named after the scheme (e.g. "ftp").
  if (!hasHttpScheme && /^[a-z][a-z0-9+.-]*:\/\//i.test(input)) {
    return { error: "Only http/https supported" };
  }

  let url: URL;
  try {
    url = new URL(hasHttpScheme ? input : `https://${input}`);
  } catch {
    return { error: "Could not parse domain or URL" };
  }

  if (url.protocol !== "http:" && url.protocol !== "https:") {
    return { error: "Only http/https supported" };
  }
  if (!ALLOWED_PORTS.has(url.port)) {
    return { error: `Port ${url.port} not permitted` };
  }

  const hostname = url.hostname.toLowerCase().replace(/^www\./, "");
  if (!hostname) return { error: "Missing hostname" };
  if (hostname === "localhost" || hostname.endsWith(".localhost")) {
    return { error: "Localhost not permitted" };
  }

  // URL.hostname returns IPv6 literals wrapped in brackets ("[::1]"); the
  // private-range patterns are anchored at the address itself, so test the
  // unbracketed form or every IPv6 literal slips through.
  const bareHost =
    hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname;
  if (PRIVATE_IPV4_RE.test(bareHost) || PRIVATE_IPV6_RE.test(bareHost)) {
    return { error: "Private addresses not permitted" };
  }

  const isIp = /^[\d.]+$/.test(bareHost) || bareHost.includes(":");
  if (!isIp && !VALID_DOMAIN_RE.test(hostname)) {
    return { error: "Invalid hostname" };
  }

  return { domain: hostname };
}
112
+
113
/**
 * Decides whether a host may be fetched without reaching a private address.
 *
 * IP literals (IPv6 with or without brackets) are checked directly. Names are
 * resolved for both A and AAAA records and refused if any returned address is
 * private, if nothing resolves, or if resolution takes longer than 3 seconds.
 *
 * NOTE(review): this is a point-in-time check; the later HTTP request performs
 * its own resolution, so a DNS-rebinding window remains.
 *
 * @param hostname - Hostname or IP literal.
 * @returns `true` when every known address for the host is public.
 */
async function isHostnameSafe(hostname: string): Promise<boolean> {
  // URL-derived IPv6 literals arrive as "[::1]"; the range patterns expect "::1".
  const host =
    hostname.startsWith("[") && hostname.endsWith("]") ? hostname.slice(1, -1) : hostname;
  const isPrivate = (addr: string): boolean =>
    PRIVATE_IPV4_RE.test(addr) || PRIVATE_IPV6_RE.test(addr);

  if (/^[\d.]+$/.test(host) || host.includes(":")) {
    return !isPrivate(host);
  }

  let timer: ReturnType<typeof setTimeout> | undefined;
  try {
    const timeout = new Promise<never>((_, reject) => {
      timer = setTimeout(() => reject(new Error("DNS timeout")), 3000);
    });
    // Query both address families: an A-only lookup would let a name whose
    // AAAA record points at a private address through.
    const lookups = Promise.allSettled([dns.resolve4(host), dns.resolve6(host)]);
    const settled = await Promise.race([lookups, timeout]);

    const addresses = settled.flatMap(result =>
      result.status === "fulfilled" ? result.value : []
    );
    return addresses.length > 0 && !addresses.some(isPrivate);
  } catch {
    return false;
  } finally {
    // Don't leave the timeout timer pending after the lookup has finished.
    clearTimeout(timer);
  }
}
130
+
131
+ // ─── Fetch robots.txt with body size cap ─────────────────────────────────────
132
+
133
/** Outcome of a robots.txt fetch. */
interface FetchResult {
  /** True only when the final response status was 200. */
  exists: boolean;
  /** HTTP status of the final response. */
  status: number;
  /** Decoded body, at most MAX_BODY_BYTES of input. */
  body: string;
  /** True when the body exceeded MAX_BODY_BYTES and reading stopped early. */
  truncated: boolean;
}

/** What `fetch` resolves to. Spelled indirectly because `Response` in this file is Express's type. */
type RobotsFetchResponse = Awaited<ReturnType<typeof fetch>>;

/** Maximum number of redirect hops followed for one robots.txt request. */
const MAX_ROBOTS_REDIRECTS = 5;

/** Status codes treated as redirects when a Location header is present. */
const ROBOTS_REDIRECT_STATUSES = new Set([301, 302, 303, 307, 308]);

/** Removes the brackets `URL.hostname` puts around IPv6 literals. */
function stripIpv6Brackets(host: string): string {
  return host.replace(/^\[|\]$/g, "");
}

/**
 * Issues a GET and follows redirects by hand so that every hop is vetted with
 * `isHostnameSafe` before it is requested. Throws when a hop targets a
 * non-http(s) URL or an unsafe host, or when the hop limit is exceeded.
 */
async function fetchWithVettedRedirects(
  startUrl: string,
  signal: AbortSignal
): Promise<RobotsFetchResponse> {
  let current = startUrl;
  for (let hop = 0; hop <= MAX_ROBOTS_REDIRECTS; hop++) {
    const response = await fetch(current, {
      method: "GET",
      // Automatic following would let a public host bounce the request to a
      // private address without it ever being checked.
      redirect: "manual",
      signal,
      headers: {
        "User-Agent": "TrustSource-RobotsCheck/1.0 (+https://trustsource.cc)",
        "Accept": "text/plain, */*",
        "Accept-Encoding": "identity",
      },
    });

    const location = response.headers.get("location");
    if (!ROBOTS_REDIRECT_STATUSES.has(response.status) || !location) {
      return response;
    }

    try { await response.body?.cancel(); } catch { /* ignore */ }

    const next = new URL(location, current);
    if (next.protocol !== "http:" && next.protocol !== "https:") {
      throw new Error(`Refused: redirect to unsupported protocol ${next.protocol}`);
    }
    if (!(await isHostnameSafe(stripIpv6Brackets(next.hostname)))) {
      throw new Error(`Refused: redirect target ${next.hostname} resolves to a private address`);
    }
    current = next.href;
  }
  throw new Error(`Refused: more than ${MAX_ROBOTS_REDIRECTS} redirects`);
}

/**
 * Streams a response body as UTF-8, stopping once more than MAX_BODY_BYTES
 * have arrived. The chunk that crosses the limit is discarded.
 */
async function readCappedBody(
  response: RobotsFetchResponse
): Promise<{ body: string; truncated: boolean }> {
  const reader = response.body?.getReader();
  if (!reader) return { body: "", truncated: false };

  const decoder = new TextDecoder("utf-8");
  let body = "";
  let totalBytes = 0;
  let truncated = false;

  while (true) {
    const { done, value } = await reader.read();
    if (done) break;
    totalBytes += value.length;
    if (totalBytes > MAX_BODY_BYTES) {
      truncated = true;
      try { await reader.cancel(); } catch { /* ignore */ }
      break;
    }
    body += decoder.decode(value, { stream: true });
  }
  body += decoder.decode(); // flush any buffered partial code point

  return { body, truncated };
}

/**
 * Fetches `/robots.txt` for a host, trying HTTPS first and falling back to HTTP
 * when the HTTPS attempt throws.
 *
 * @param domain - Hostname or IP literal (IPv6 literals in brackets).
 * @returns The fetch outcome; a 404 is reported as `exists: false`, not an error.
 * @throws Error when the host (or a redirect target) is refused as unsafe, or
 *         when both attempts fail; the last underlying error is rethrown.
 */
async function fetchRobotsTxt(domain: string): Promise<FetchResult> {
  const safe = await isHostnameSafe(stripIpv6Brackets(domain));
  if (!safe) throw new Error(`Refused: ${domain} resolves to a private address`);

  // Try HTTPS first, fall back to HTTP (some sites still don't redirect)
  const urls = [`https://${domain}/robots.txt`, `http://${domain}/robots.txt`];

  let lastErr: Error | null = null;
  for (const url of urls) {
    const controller = new AbortController();
    // One time budget per attempt, covering redirects AND body streaming, so a
    // server that sends headers and then stalls cannot hold the request open.
    const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT);
    try {
      const response = await fetchWithVettedRedirects(url, controller.signal);

      // No robots.txt → not an error, just record it
      if (response.status === 404) {
        try { await response.body?.cancel(); } catch { /* ignore */ }
        return { exists: false, status: 404, body: "", truncated: false };
      }

      const { body, truncated } = await readCappedBody(response);
      return { exists: response.status === 200, status: response.status, body, truncated };
    } catch (err) {
      lastErr = err instanceof Error ? err : new Error(String(err));
    } finally {
      clearTimeout(timer);
    }
  }
  throw lastErr ?? new Error("Failed to fetch robots.txt");
}
203
+
204
+ // ─── robots.txt parser ────────────────────────────────────────────────────────
205
+
206
/** Rules recorded for one `User-agent` line. */
interface UserAgentRules {
  userAgent: string;
  allow: string[];
  disallow: string[];
  crawlDelay: number | null;
}

/** Result of parsing a robots.txt body. */
interface ParsedRobots {
  /** One entry per `User-agent` line, in file order. */
  userAgents: UserAgentRules[];
  /** Values of all `Sitemap` lines, in file order. */
  sitemaps: string[];
  /** Number of lines in the input, including blank and comment lines. */
  rawLines: number;
  /** True when at least one non-empty line had no ":" separator. */
  hasErrors: boolean;
}

/**
 * Parses a robots.txt body into per-user-agent rules and sitemap URLs.
 *
 * Consecutive `User-agent` lines form one group: every Allow / Disallow /
 * Crawl-delay line that follows applies to each agent named in that run. A
 * `User-agent` line seen after the group has received a rule starts a new group.
 * Rules that appear before any `User-agent` line are ignored, as are unknown
 * directives.
 *
 * @param body - Raw robots.txt text.
 * @returns Parsed groups, sitemaps, line count and an error flag.
 */
function parseRobotsTxt(body: string): ParsedRobots {
  const lines = body.split(/\r?\n/);
  const rawLines = lines.length;

  const userAgents: UserAgentRules[] = [];
  const sitemaps: string[] = [];
  let hasErrors = false;

  // Agents named by the current run of User-agent lines, and whether that
  // group has received any rule yet.
  let peers: UserAgentRules[] = [];
  let peersHaveRules = false;

  for (const rawLine of lines) {
    // Strip comments and trim
    const hashIdx = rawLine.indexOf("#");
    const line = (hashIdx >= 0 ? rawLine.slice(0, hashIdx) : rawLine).trim();
    if (!line) continue;

    const colonIdx = line.indexOf(":");
    if (colonIdx < 0) { hasErrors = true; continue; }

    const directive = line.slice(0, colonIdx).trim().toLowerCase();
    const value = line.slice(colonIdx + 1).trim();

    switch (directive) {
      case "user-agent": {
        if (peersHaveRules) {
          // The previous group is complete; this line opens a new one.
          peers = [];
          peersHaveRules = false;
        }
        const group: UserAgentRules = { userAgent: value, allow: [], disallow: [], crawlDelay: null };
        peers.push(group);
        userAgents.push(group);
        break;
      }
      case "allow":
        // Each peer gets its own copy so the arrays are never aliased.
        for (const group of peers) group.allow.push(value);
        if (peers.length > 0) peersHaveRules = true;
        break;
      case "disallow":
        for (const group of peers) group.disallow.push(value);
        if (peers.length > 0) peersHaveRules = true;
        break;
      case "crawl-delay": {
        const delay = parseFloat(value);
        if (!isNaN(delay) && peers.length > 0) {
          for (const group of peers) group.crawlDelay = delay;
          peersHaveRules = true;
        }
        break;
      }
      case "sitemap":
        if (value) sitemaps.push(value);
        break;
      default:
        // Unknown directive — ignore (RFC says to skip unknowns gracefully)
        break;
    }
  }

  return { userAgents, sitemaps, rawLines, hasErrors };
}
278
+
279
+ // ─── AI bot policy analysis ───────────────────────────────────────────────────
280
+
281
/** Crawl policy derived for one known AI bot. */
interface AiBotPolicy {
  bot: string;
  /** The bot is disallowed from the whole site. */
  blocked: boolean;
  /** Some paths are disallowed but not all. */
  partial: boolean;
  /** Rules from the groups naming this bot; empty when the bot is not mentioned. */
  rules: { allow: string[]; disallow: string[] };
}

/**
 * Derives a policy for every bot in AI_BOTS from parsed robots.txt groups.
 *
 * A bot named in one or more groups (case-insensitive match) is judged on the
 * combined rules of those groups. A bot that is not named inherits only the
 * root-block status of the "*" group. An empty `Disallow:` value matches
 * nothing and therefore never counts as a block.
 *
 * @param parsed - Output of the robots.txt parser.
 * @returns Per-bot policies plus two flags describing the "*" group.
 */
function analyzeAiBotPolicies(parsed: ParsedRobots): {
  policies: AiBotPolicy[];
  globalBlock: boolean;     // "User-agent: *" disallows "/"
  globalAllow: boolean;     // "User-agent: *" with no disallows or only "Disallow:"
} {
  // "/" (or the equivalent wildcard form "/*") disallows the whole site.
  const blocksRoot = (paths: string[]): boolean => paths.some(p => p === "/" || p === "/*");

  // Combine every "*" group rather than looking at the first one only.
  const globalDisallow = parsed.userAgents
    .filter(g => g.userAgent === "*")
    .flatMap(g => g.disallow);
  const globalBlock = blocksRoot(globalDisallow);
  // Vacuously true when there is no "*" group or it has no Disallow lines.
  const globalAllow = globalDisallow.every(d => d === "");

  const policies: AiBotPolicy[] = [];
  for (const bot of AI_BOTS) {
    const token = bot.toLowerCase();
    const matches = parsed.userAgents.filter(g => g.userAgent.toLowerCase() === token);

    if (matches.length === 0) {
      // Not mentioned → governed by "*" rules
      policies.push({
        bot,
        blocked: globalBlock,
        partial: false,
        rules: { allow: [], disallow: [] },
      });
      continue;
    }

    const allow = matches.flatMap(g => g.allow);
    const disallow = matches.flatMap(g => g.disallow);

    const rootBlocked = blocksRoot(disallow);
    // Empty values match nothing, so they neither open nor close any path.
    const hasAllow = allow.some(a => a !== "");
    const hasDisallow = disallow.some(d => d !== "");

    policies.push({
      bot,
      blocked: rootBlocked && !hasAllow,
      // A root block with Allow exceptions is partial access, as is any
      // non-root Disallow.
      partial: rootBlocked ? hasAllow : hasDisallow,
      rules: { allow, disallow },
    });
  }

  return { policies, globalBlock, globalAllow };
}
332
+
333
+ // ─── Overall tier classification ──────────────────────────────────────────────
334
+
335
/**
 * Collapses the per-bot policies into a single tier label.
 *
 * - NO_ROBOTS_TXT: no robots.txt exists.
 * - BLOCKED_ALL:   the global block flag is set and every bot is blocked.
 * - BLOCKED_AI:    more than half of the bots are blocked.
 * - SELECTIVE:     at least one bot is blocked, or more than a third are partial.
 * - OPEN:          anything else.
 *
 * @param exists - Whether a robots.txt was found.
 * @param globalBlock - Whether the "*" group disallows the site root.
 * @param aiAnalysis - Per-bot policy analysis.
 * @returns The tier and whether it counts as AI-friendly (every tier except the two BLOCKED_* ones).
 */
function classifyTier(
  exists: boolean,
  globalBlock: boolean,
  aiAnalysis: ReturnType<typeof analyzeAiBotPolicies>
): { tier: string; aiFriendly: boolean } {
  if (!exists) return { tier: "NO_ROBOTS_TXT", aiFriendly: true };

  const total = aiAnalysis.policies.length;
  let blocked = 0;
  let partial = 0;
  for (const policy of aiAnalysis.policies) {
    if (policy.blocked) blocked += 1;
    if (policy.partial) partial += 1;
  }

  let tier: string;
  if (globalBlock && blocked === total) {
    tier = "BLOCKED_ALL";
  } else if (blocked > total / 2) {
    tier = "BLOCKED_AI";
  } else if (blocked > 0 || partial > total / 3) {
    tier = "SELECTIVE";
  } else {
    tier = "OPEN";
  }

  return { tier, aiFriendly: tier === "SELECTIVE" || tier === "OPEN" };
}
357
+
358
+ // ─── Route ────────────────────────────────────────────────────────────────────
359
+
360
/**
 * Reads a query parameter as a non-empty string. Express yields an array for a
 * repeated parameter and an object for bracket syntax; the first element of an
 * array is used and anything else that is not a string is treated as absent.
 */
function firstQueryString(value: unknown): string | undefined {
  const candidate: unknown = Array.isArray(value) ? value[0] : value;
  return typeof candidate === "string" && candidate.length > 0 ? candidate : undefined;
}

/**
 * GET /robots?domain=example.com (or ?url=https://example.com)
 *
 * Validates the target, serves a cached result when one exists, otherwise
 * fetches and parses the site's robots.txt and reports its AI-crawler policy.
 *
 * Responses: 400 for a missing or invalid target, 502 when the fetch fails,
 * 200 with the analysis otherwise.
 */
router.get("/robots", async (req: Request, res: Response) => {
  // Narrow the query values at runtime: a blind cast would let arrays/objects
  // reach string methods and throw outside the try block below.
  const raw = firstQueryString(req.query.domain) ?? firstQueryString(req.query.url);

  if (!raw) {
    res.status(400).json({
      error: "Missing parameter",
      message: "Provide ?domain=example.com or ?url=https://example.com",
    });
    return;
  }
  if (raw.length > 253) {
    res.status(400).json({
      error: "Invalid input",
      message: "Domain must be 253 characters or fewer",
    });
    return;
  }

  const validation = extractAndValidateDomain(raw);
  if ("error" in validation) {
    res.status(400).json({ error: "Invalid domain", message: validation.error });
    return;
  }

  const domain = validation.domain;

  // Cache check
  const cached = getCached(domain);
  if (cached) {
    res.json({ ...cached, meta: { ...(cached.meta as object), cached: true } });
    return;
  }

  try {
    const fetchResult = await fetchRobotsTxt(domain);
    // Only a 200 response is parsed; anything else is reported without analysis.
    const parsed = fetchResult.exists ? parseRobotsTxt(fetchResult.body) : null;
    const aiAnalysis = parsed ? analyzeAiBotPolicies(parsed) : null;
    const classify = classifyTier(
      fetchResult.exists,
      aiAnalysis?.globalBlock ?? false,
      aiAnalysis ?? { policies: [], globalBlock: false, globalAllow: true }
    );

    const response = {
      domain,
      exists: fetchResult.exists,
      tier: classify.tier,
      aiFriendly: classify.aiFriendly,

      summary: parsed ? {
        userAgentGroups: parsed.userAgents.length,
        sitemaps: parsed.sitemaps.length,
        rawLines: parsed.rawLines,
        truncated: fetchResult.truncated,
        hasParseErrors: parsed.hasErrors,
      } : null,

      ai: aiAnalysis ? {
        globalBlock: aiAnalysis.globalBlock,
        globalAllow: aiAnalysis.globalAllow,
        knownBotsChecked: AI_BOTS.length,
        knownBotsBlocked: aiAnalysis.policies.filter(p => p.blocked).length,
        knownBotsPartial: aiAnalysis.policies.filter(p => p.partial).length,
        policies: aiAnalysis.policies,
      } : null,

      sitemaps: parsed?.sitemaps ?? [],
      userAgents: parsed?.userAgents ?? [],

      response: {
        status: fetchResult.status,
      },

      meta: {
        checkedAt: new Date().toISOString(),
        apiVersion: "1.0",
        paidWith: "x402/USDC",
        cached: false,
      },
    };

    // Only successful lookups are cached; failures are retried on the next request.
    setCached(domain, response);
    res.json(response);

  } catch (err) {
    const msg = err instanceof Error ? err.message : "Unknown error";
    res.status(502).json({
      error: "robots.txt fetch failed",
      domain,
      message: msg,
      meta: { checkedAt: new Date().toISOString(), apiVersion: "1.0" },
    });
  }
});
454
+
455
+ export default router;