termsearch 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ // URL fetcher + HTML to readable text extraction
2
+ // Used by AI summary to fetch page content
3
+
4
+ import { assertPublicUrl } from './ssrf-guard.js';
5
+
6
const FETCH_MAX_BYTES = 180_000; // hard cap (bytes) on how much of any fetched response body is kept
7
+
8
// Backslash-escape every character that is special inside a RegExp pattern,
// so arbitrary strings can be embedded in `new RegExp(...)` safely.
function escapeRegExp(str) {
  return str.replace(/[\\^$.*+?()[\]{}|]/g, '\\$&');
}
11
+
12
// Convert an HTML document/fragment to plain readable text.
// Drops script/style/noscript wholesale, rewrites external anchors as
// "text [url]" so link targets survive for AI URL extraction, strips the
// remaining tags, then decodes common entities.
// `&amp;` is decoded LAST: decoding it first (as before) double-decoded
// escaped entities, e.g. "&amp;quot;" wrongly became `"` instead of `&quot;`.
export function stripHtml(html) {
  return html
    .replace(/<script\b[^>]*>[\s\S]*?<\/script>/gi, ' ')
    .replace(/<style\b[^>]*>[\s\S]*?<\/style>/gi, ' ')
    .replace(/<noscript\b[^>]*>[\s\S]*?<\/noscript>/gi, ' ')
    // Preserve external links as "text [url]" for AI URL extraction
    .replace(/<a\s[^>]*\bhref="(https?:\/\/[^"#?]{4,})"[^>]*>([\s\S]*?)<\/a>/gi, (_, url, inner) => {
      const text = inner.replace(/<[^>]+>/g, '').trim();
      return text ? `${text} [${url}]` : `[${url}]`;
    })
    .replace(/<[^>]+>/g, ' ')
    .replace(/&nbsp;/gi, ' ')
    .replace(/&quot;/gi, '"')
    .replace(/&#39;/gi, "'")
    .replace(/&lt;/gi, '<')
    .replace(/&gt;/gi, '>')
    .replace(/&amp;/gi, '&')
    .replace(/\s+/g, ' ')
    .trim();
}
30
+
31
// Read the content="" attribute of a <meta> tag matched by attribute
// name/value, e.g. getMetaContent(html, 'name', 'description').
// Fix: real-world meta tags put `content` either before OR after the matched
// attribute; the previous pattern only matched the attr-first order.
export function getMetaContent(html, attr, value) {
  // Escape regex metacharacters in the value (e.g. "og:description" is safe,
  // but be defensive); `attr` is a caller-controlled literal ('name'/'property').
  const v = String(value).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const attrFirst = new RegExp(`<meta\\s+[^>]*${attr}="${v}"[^>]*content="([^"]+)"[^>]*>`, 'i');
  const contentFirst = new RegExp(`<meta\\s+[^>]*content="([^"]+)"[^>]*${attr}="${v}"[^>]*>`, 'i');
  return (html.match(attrFirst) || html.match(contentFirst))?.[1]?.trim() || '';
}
35
+
36
// Truncate long text to at most `limit` characters, keeping both the head and
// the tail (joined by " … ") since page intros and footers both carry signal.
// Fix: the old Math.max(500/400, …) floors made the result EXCEED small
// limits (e.g. limit=100 produced ~900 chars). Now length <= limit always.
export function truncateSmart(text, limit = 12000) {
  const clean = String(text || '').trim();
  if (clean.length <= limit) return clean;
  // The " … " separator costs 3 chars; split the remaining budget ~62/38.
  const budget = Math.max(0, limit - 3);
  const headLen = Math.ceil(budget * 0.62);
  const tailLen = budget - headLen;
  if (tailLen <= 0) return clean.slice(0, limit); // degenerate tiny limit
  return `${clean.slice(0, headLen)} … ${clean.slice(-tailLen)}`.trim();
}
43
+
44
// Best-effort page title: the first <title> element's text (tag-stripped and
// capped at 160 chars), falling back to the page URL when absent/empty.
export function extractTitle(html, fallbackUrl) {
  const found = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
  const rawTitle = found?.[1];
  if (!rawTitle) return fallbackUrl;
  return stripHtml(rawTitle).slice(0, 160);
}
49
+
50
// Build a compact plaintext summary for github.com pages by scraping known
// markup patterns, instead of dumping the whole (mostly chrome) page text.
// Returns the summary string, or null when the URL isn't GitHub or the
// extracted summary is too short (< 80 chars) to be useful.
function extractGithubReadable(parsedUrl, html) {
  if (parsedUrl.hostname !== 'github.com') return null;
  const seg = parsedUrl.pathname.split('/').filter(Boolean);
  if (seg.length === 0) return null;

  const title = extractTitle(html, parsedUrl.toString());
  const metaDesc = getMetaContent(html, 'name', 'description') || getMetaContent(html, 'property', 'og:description');
  const lines = [`GitHub page: ${parsedUrl.toString()}`, `Title: ${title}`];
  if (metaDesc) lines.push(`Summary: ${stripHtml(metaDesc)}`);

  // One path segment => user/org profile: scrape the repository list.
  if (seg.length === 1) {
    const username = seg[0];
    // Matches GitHub's repo-list anchors carrying itemprop="name codeRepository".
    // NOTE(review): this depends on GitHub's current markup — verify if it breaks.
    const repoRe = /<a\s+href="\/([^/"?#]+\/[^/"?#]+)"[^>]*itemprop="name codeRepository"[^>]*>([\s\S]*?)<\/a>/gi;
    const repos = [];
    let match;
    while ((match = repoRe.exec(html)) !== null && repos.length < 12) {
      const ownerRepo = String(match[1] || '').trim();
      // Skip anchors pointing at other owners' repos (forked-from links etc.).
      if (!ownerRepo.toLowerCase().startsWith(`${username.toLowerCase()}/`)) continue;
      const repoName = stripHtml(match[2] || '').trim();
      if (!repoName) continue;
      // Stars/forks/description live in the markup shortly after the repo
      // anchor; scan a bounded window so one repo's data can't bleed into
      // the next entry.
      const chunk = html.slice(match.index, match.index + 2200);
      const descHtml = chunk.match(/itemprop="description"[^>]*>([\s\S]*?)<\/p>/i)?.[1] || '';
      const starsChunk = chunk.match(new RegExp(`href="/${escapeRegExp(ownerRepo)}/stargazers"[\\s\\S]{0,180}<\\/a>`, 'i'))?.[0] || '';
      const forksChunk = chunk.match(new RegExp(`href="/${escapeRegExp(ownerRepo)}/forks"[\\s\\S]{0,180}<\\/a>`, 'i'))?.[0] || '';
      // Counts may be abbreviated like "1.2k"; keep the raw display string.
      const stars = stripHtml(starsChunk).match(/(\d[\d.,kK]*)/)?.[1] || '';
      const forks = stripHtml(forksChunk).match(/(\d[\d.,kK]*)/)?.[1] || '';
      const desc = stripHtml(descHtml).slice(0, 180);
      repos.push({ repo: repoName, url: `https://github.com/${ownerRepo}`, desc, stars, forks });
    }
    if (repos.length > 0) {
      lines.push(`Repositories found: ${repos.length}`);
      for (const r of repos) {
        const meta = [r.stars ? `stars=${r.stars}` : '', r.forks ? `forks=${r.forks}` : ''].filter(Boolean).join(', ');
        lines.push(`- ${r.repo}${r.desc ? ` — ${r.desc}` : ''}${meta ? ` (${meta})` : ''} [${r.url}]`);
      }
    }
  }

  // Two or more segments => a repository (or sub-page of one).
  if (seg.length >= 2) {
    const ownerRepo = `${seg[0]}/${seg[1]}`;
    const repoDesc = getMetaContent(html, 'property', 'og:description') || getMetaContent(html, 'name', 'description');
    lines.push(`Repository: ${ownerRepo} [https://github.com/${ownerRepo}]`);
    if (repoDesc) lines.push(`Repository summary: ${stripHtml(repoDesc)}`);
  }

  const content = truncateSmart(lines.join('\n'), 12000);
  // Below 80 chars the summary is just boilerplate — let the caller fall back
  // to generic stripHtml extraction instead.
  return content.length >= 80 ? content : null;
}
98
+
99
// Fetch a compact plaintext snapshot of a GitHub user via the REST API
// (profile + up to 12 most recently updated repos). Best-effort: returns ''
// on any failure, timeout, or empty input rather than throwing.
async function fetchGithubApiSummary(username, timeoutMs) {
  const handle = String(username || '').trim();
  if (!handle) return '';
  const headers = { 'User-Agent': 'TermSearch/1.0', Accept: 'application/vnd.github+json' };
  const ac = new AbortController();
  // Cap at 9s so this enrichment step can't eat the caller's whole budget.
  const timer = setTimeout(() => ac.abort(), Math.min(timeoutMs, 9000));
  try {
    // Profile and repo list are independent — fetch them concurrently.
    const [userRes, reposRes] = await Promise.all([
      fetch(`https://api.github.com/users/${encodeURIComponent(handle)}`, { headers, signal: ac.signal }),
      fetch(`https://api.github.com/users/${encodeURIComponent(handle)}/repos?sort=updated&per_page=12`, { headers, signal: ac.signal }),
    ]);
    let user = null;
    let repos = [];
    // Either response may fail independently; use whatever succeeded.
    if (userRes.ok) user = await userRes.json();
    if (reposRes.ok) { const d = await reposRes.json(); repos = Array.isArray(d) ? d : []; }
    if (!user && repos.length === 0) return '';
    const lines = ['GitHub API snapshot:'];
    if (user) {
      lines.push(`Profile: ${user.html_url || `https://github.com/${handle}`}`);
      if (user.name) lines.push(`Name: ${user.name}`);
      if (user.bio) lines.push(`Bio: ${String(user.bio).slice(0, 220)}`);
      if (Number.isFinite(user.public_repos)) lines.push(`Public repos: ${user.public_repos}`);
    }
    if (repos.length > 0) {
      lines.push(`Repositories (latest ${repos.length}):`);
      for (const repo of repos) {
        const parts = [];
        if (repo.language) parts.push(`lang=${repo.language}`);
        if (Number.isFinite(repo.stargazers_count)) parts.push(`stars=${repo.stargazers_count}`);
        lines.push(`- ${repo.name}${repo.description ? ` — ${String(repo.description).slice(0, 180)}` : ''}${parts.length ? ` (${parts.join(', ')})` : ''} [${repo.html_url}]`);
      }
    }
    return lines.join('\n');
  } catch {
    // Abort/network/JSON errors all degrade to "no snapshot".
    return '';
  } finally {
    clearTimeout(timer);
  }
}
138
+
139
// Fetch a URL and return readable text content as
// { url, title, content, status: 'ok' }; throws on fetch/extraction failure.
// docCache: optional cache instance to use (injected from engine.js).
// Fix: the abort timer used to be cleared right after the HEADERS arrived,
// so a stalled response BODY could hang forever. The timer now stays armed
// through the body read as well.
export async function fetchReadableDocument(rawUrl, { timeoutMs = 12000, docCache } = {}) {
  const cacheKey = String(rawUrl || '').trim();
  if (docCache) {
    const cached = docCache.get(cacheKey);
    if (cached) return cached;
  }

  const parsed = await assertPublicUrl(rawUrl);
  // Note: AbortSignal.timeout() is broken with HTTPS in Node 24 — use manual AbortController
  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), timeoutMs);
  let html;
  try {
    const response = await fetch(parsed.toString(), {
      headers: {
        'User-Agent': 'TermSearchFetch/1.0',
        Accept: 'text/html, text/plain;q=0.9,*/*;q=0.5',
      },
      signal: ac.signal,
      redirect: 'follow',
    });

    if (!response.ok) throw new Error(`Fetch failed: ${response.status}`);

    const contentType = response.headers.get('content-type') || '';
    if (!/text\/html|text\/plain|application\/xhtml\+xml/i.test(contentType)) {
      throw new Error(`Unsupported content-type: ${contentType || 'unknown'}`);
    }

    // Body read happens inside the try so the abort timeout still applies.
    const buffer = await response.arrayBuffer();
    html = Buffer.from(buffer).subarray(0, FETCH_MAX_BYTES).toString('utf8');
  } finally {
    clearTimeout(timer);
  }

  // Prefer the GitHub-specific extractor; fall back to generic text stripping.
  const githubContent = extractGithubReadable(parsed, html);
  let content = githubContent || truncateSmart(stripHtml(html), 12000);

  // Single-segment github.com paths are user/org profiles — enrich with the
  // REST API snapshot (best-effort; '' on failure).
  if (parsed.hostname === 'github.com') {
    const seg = parsed.pathname.split('/').filter(Boolean);
    if (seg.length === 1) {
      const apiSummary = await fetchGithubApiSummary(seg[0], timeoutMs);
      if (apiSummary) content = truncateSmart(`${content}\n\n${apiSummary}`.trim(), 12000);
    }
  }

  if (!content) throw new Error('No readable content extracted.');

  const result = { url: parsed.toString(), title: extractTitle(html, parsed.toString()), content, status: 'ok' };
  if (docCache) docCache.set(cacheKey, result, 45 * 60 * 1000);
  return result;
}
193
+
194
// Fetch several URLs concurrently. Per-URL failures are folded into
// { status: 'error' } records instead of rejecting the whole batch, so the
// result array always lines up 1:1 with the input URLs.
export async function batchFetch(urls, { timeoutMs = 12000, docCache } = {}) {
  const tasks = urls.map(async (url) => {
    try {
      const doc = await fetchReadableDocument(url, { timeoutMs, docCache });
      return { ...doc, url };
    } catch (e) {
      return { url, status: 'error', error: e.message, content: '', title: url };
    }
  });
  return Promise.all(tasks);
}
204
+
205
// Scan a site homepage + a few relevant internal pages by query keywords.
// Returns an array of readable-document objects (homepage first); returns []
// when the base URL is invalid/private or the homepage fetch fails.
export async function scanSitePages(baseUrl, query, maxPages = 4, { timeoutMs = 12000, docCache } = {}) {
  // Never fetch more than 8 pages total, whatever the caller asks for.
  const clampedMax = Math.min(Number(maxPages) || 4, 8);
  let base;
  try {
    base = await assertPublicUrl(baseUrl);
  } catch {
    return [];
  }

  // Fetch the homepage HTML (bounded by timeout and FETCH_MAX_BYTES).
  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), timeoutMs);
  let html = '';
  try {
    const response = await fetch(base.toString(), {
      headers: {
        'User-Agent': 'TermSearchFetch/1.0',
        Accept: 'text/html,*/*;q=0.5',
      },
      signal: ac.signal,
      redirect: 'follow',
    });
    if (!response.ok) return [];
    const buffer = await response.arrayBuffer();
    html = Buffer.from(buffer).subarray(0, FETCH_MAX_BYTES).toString('utf8');
  } catch {
    return [];
  } finally {
    clearTimeout(timer);
  }

  const homepageDoc = {
    url: base.toString(),
    title: extractTitle(html, base.toString()),
    content: truncateSmart(stripHtml(html), 12000),
    status: 'ok',
  };

  // Collect up to 40 same-host candidate links, skipping auth/commerce pages
  // and static assets. URLs are normalized (origin+path, no trailing slash)
  // for de-duplication.
  const seen = new Set([base.toString().replace(/\/+$/, '')]);
  const candidateUrls = [];
  const linkRe = /href="(\/[^"#?]{2,}|https?:\/\/[^"#?]+)"/gi;
  let match;
  while ((match = linkRe.exec(html)) !== null && candidateUrls.length < 40) {
    try {
      const full = new URL(match[1], base.toString());
      if (full.hostname !== base.hostname) continue;
      if (/(login|signin|logout|register|account|cart|checkout|\.pdf|\.zip|\.exe|\.jpg|\.jpeg|\.png|\.gif|\.css|\.js)/i.test(full.pathname)) continue;
      const normalized = `${full.origin}${full.pathname}`.replace(/\/+$/, '');
      if (!normalized || seen.has(normalized)) continue;
      seen.add(normalized);
      candidateUrls.push(normalized);
    } catch {
      // ignore invalid links
    }
  }

  // Rank candidates by how many query keywords (>2 chars) appear in the path.
  const queryWords = String(query || '')
    .toLowerCase()
    .split(/\s+/)
    .filter((w) => w.length > 2);

  // Keep the best (clampedMax - 1) internal pages; slot 1 is the homepage.
  const toFetch = candidateUrls
    .map((url) => {
      let score = 0;
      try {
        const parsed = new URL(url);
        score = queryWords.filter((w) => parsed.pathname.toLowerCase().includes(w)).length;
      } catch {
        // ignore parse failures
      }
      return { url, score };
    })
    .sort((a, b) => b.score - a.score)
    .slice(0, Math.max(0, clampedMax - 1))
    .map((entry) => entry.url);

  // Fetch the selected pages in parallel; any failure just yields null.
  const settled = await Promise.allSettled(
    toFetch.map(async (url) => {
      try {
        const doc = await fetchReadableDocument(url, { timeoutMs, docCache });
        // Drop near-empty pages (<= 100 chars of content).
        return doc?.status === 'ok' && String(doc.content || '').length > 100 ? doc : null;
      } catch {
        return null;
      }
    })
  );

  const docs = settled
    .filter((result) => result.status === 'fulfilled' && result.value)
    .map((result) => result.value);

  return [homepageDoc, ...docs];
}
@@ -0,0 +1,40 @@
1
+ // SSRF protection — validates URLs before fetching, blocks private/internal IPs
2
+
3
+ import dns from 'dns/promises';
4
+ import net from 'net';
5
+
6
// IPv4 prefixes considered private/internal: RFC 1918 ranges, loopback,
// link-local, and the 0.0.0.0/8 "this network" block.
const PRIVATE_V4_PREFIXES = [
  '10.', '172.16.', '172.17.', '172.18.', '172.19.', '172.20.', '172.21.',
  '172.22.', '172.23.', '172.24.', '172.25.', '172.26.', '172.27.',
  '172.28.', '172.29.', '172.30.', '172.31.',
  '192.168.', '127.', '169.254.', '0.',
];

// True when `ip` must not be fetched (private, loopback, link-local, or
// empty/unparseable — fail closed).
// Fix: IPv4-mapped IPv6 addresses such as "::ffff:127.0.0.1" previously
// passed the IPv6 branch untouched and bypassed every IPv4 check (SSRF
// bypass); the embedded dotted-quad is now validated recursively.
export function isPrivateIp(ip) {
  if (!ip) return true;
  if (net.isIPv6(ip)) {
    const v6 = ip.toLowerCase();
    // IPv4-mapped / IPv4-compatible forms end in a dotted quad — check it
    // against the IPv4 rules instead of the IPv6 ones.
    const embedded = v6.match(/(\d{1,3}(?:\.\d{1,3}){3})$/)?.[1];
    if (embedded) return isPrivateIp(embedded);
    // ::1 loopback, :: unspecified, fc00::/7 ULA, fe80:: link-local.
    return v6 === '::1' || v6 === '::' || v6.startsWith('fc') || v6.startsWith('fd') || v6.startsWith('fe80');
  }
  return PRIVATE_V4_PREFIXES.some((prefix) => ip.startsWith(prefix));
}
20
+
21
// Validate that `rawUrl` is an http(s) URL targeting only public addresses.
// Throws on invalid, localhost, or private/internal targets; returns the
// parsed URL on success.
// NOTE(review): DNS is resolved once here, then the caller fetches by
// hostname — a rebinding attacker could change the record between the two
// resolutions. A complete fix would pin the resolved IP at connect time.
export async function assertPublicUrl(rawUrl) {
  let parsed;
  try {
    parsed = new URL(rawUrl);
  } catch {
    throw new Error('Invalid URL.');
  }
  if (!['http:', 'https:'].includes(parsed.protocol)) {
    throw new Error('Only http/https URLs are allowed.');
  }
  const hostname = parsed.hostname; // already lowercased by the URL parser
  // Also reject "*.localhost" names, which resolve to loopback (RFC 6761).
  if (!hostname || hostname === 'localhost' || hostname.endsWith('.localhost')) {
    throw new Error('Local addresses are not allowed.');
  }
  // IP-literal hostnames need no DNS; note the URL parser keeps IPv6
  // literals in brackets ("[::1]"), which dns.lookup would reject with a raw
  // getaddrinfo error instead of our message.
  const bare = hostname.replace(/^\[|\]$/g, '');
  if (net.isIP(bare)) {
    if (isPrivateIp(bare)) throw new Error('Private or internal targets are not allowed.');
    return parsed;
  }
  const records = await dns.lookup(hostname, { all: true });
  // Reject if ANY resolved address is private — multi-record answers must be
  // entirely public.
  if (!records.length || records.some((record) => isPrivateIp(record.address))) {
    throw new Error('Private or internal targets are not allowed.');
  }
  return parsed;
}
@@ -0,0 +1,212 @@
1
+ // Profile scanner — ported from MmmSearch
2
+ // Supports: GitHub, Bluesky, Reddit, Twitter/X, Instagram, YouTube, LinkedIn, TikTok, Telegram, Facebook
3
+
4
+ import { tryNitterInstances, fetchInstagramProfile, fetchYouTubeProfile, fetchFacebookPage, fetchLinkedInProfile, fetchTikTokProfile, fetchTelegramProfile } from '../social/scrapers.js';
5
+ import { fetchBlueskyActors } from '../social/search.js';
6
+
7
// Platforms the profiler accepts; 'auto' means a bare "@handle" whose
// platform is probed in order by scanProfile().
export const PROFILER_PLATFORMS = new Set(
  'github bluesky reddit twitter instagram linkedin telegram youtube facebook tiktok auto'.split(' '),
);
11
+
12
+ // ─── URL/handle detection ─────────────────────────────────────────────────────
13
+
14
// Ordered detection rules: [pattern whose first capture is the handle,
// platform label, canonical profile-URL builder]. First match wins, so the
// order mirrors the original if/else chain exactly.
const PROFILE_TARGET_RULES = [
  [/github\.com\/([A-Za-z0-9_-]+)/i, 'github', (h) => `https://github.com/${h}`],
  [/bsky\.app\/profile\/([A-Za-z0-9._:-]+)/i, 'bluesky', (h) => `https://bsky.app/profile/${h}`],
  [/reddit\.com\/u(?:ser)?\/([A-Za-z0-9_-]+)/i, 'reddit', (h) => `https://reddit.com/u/${h}`],
  [/(?:t\.me|telegram\.me)\/([A-Za-z0-9_]{3,})/i, 'telegram', (h) => `https://t.me/${h}`],
  [/(?:twitter|x)\.com\/([A-Za-z0-9_]+)(?:\/|$)/i, 'twitter', (h) => `https://x.com/${h}`],
  [/youtube\.com\/@([A-Za-z0-9._-]+)(?:\/|$|\?|#)/i, 'youtube', (h) => `https://www.youtube.com/@${h}`],
  [/youtube\.com\/(?:channel|c|user)\/([A-Za-z0-9._-]+)(?:\/|$)/i, 'youtube', (h) => `https://www.youtube.com/channel/${h}`],
  [/instagram\.com\/([A-Za-z0-9_.]+)(?:\/|$)/i, 'instagram', (h) => `https://instagram.com/${h}`],
  [/linkedin\.com\/in\/([A-Za-z0-9_-]+)(?:\/|$)/i, 'linkedin', (h) => `https://linkedin.com/in/${h}`],
  [/(?:facebook|fb)\.com\/([A-Za-z0-9_.]+)(?:\/|$)/i, 'facebook', (h) => `https://www.facebook.com/${h}`],
  [/tiktok\.com\/@([A-Za-z0-9_.]+)(?:\/|$)/i, 'tiktok', (h) => `https://www.tiktok.com/@${h}`],
];

// Detect a platform + handle from a profile URL or a bare "@handle" string.
// Returns { platform, handle, url } (url is null for bare handles, platform
// 'auto') or null when nothing matches.
export function detectProfileTarget(raw) {
  const q = (raw || '').trim();
  for (const [re, platform, buildUrl] of PROFILE_TARGET_RULES) {
    const m = q.match(re);
    if (m) return { platform, handle: m[1], url: buildUrl(m[1]) };
  }
  const bare = q.match(/^@([A-Za-z0-9_.][A-Za-z0-9_.-]{0,58})$/);
  if (bare) return { platform: 'auto', handle: bare[1], url: null };
  return null;
}
31
+
32
+ // ─── GitHub ───────────────────────────────────────────────────────────────────
33
+
34
// Minimal GitHub REST helper: GET https://api.github.com<path> with an 8s
// abort timeout and optional token auth (TERMSEARCH_GITHUB_TOKEN).
// Contract: returns parsed JSON, or null on ANY failure — HTTP error,
// timeout, network error, or invalid JSON.
async function ghFetch(path) {
  const ac = new AbortController();
  const t = setTimeout(() => ac.abort(), 8000);
  try {
    const hdrs = { 'User-Agent': 'TermSearch/1.0', Accept: 'application/vnd.github.v3+json' };
    const token = process.env.TERMSEARCH_GITHUB_TOKEN;
    if (token) hdrs['Authorization'] = `token ${token}`;
    const r = await fetch(`https://api.github.com${path}`, { headers: hdrs, signal: ac.signal });
    if (!r.ok) return null;
    // `await` is required here: `return r.json()` would let a JSON parse
    // rejection escape this try/catch as an unhandled rejection instead of
    // resolving to null.
    return await r.json();
  } catch {
    return null;
  } finally {
    clearTimeout(t);
  }
}
47
+
48
// Build a normalized GitHub profile object: user info plus the top 8 repos
// ranked by stars, then forks, then recency. Returns null when the user
// lookup fails or GitHub returns an error payload.
export async function fetchGitHubProfileData(handle) {
  const enc = encodeURIComponent(handle);
  const [user, repos] = await Promise.all([
    ghFetch(`/users/${enc}`),
    ghFetch(`/users/${enc}/repos?per_page=100&sort=updated`),
  ]);
  // GitHub error payloads carry a `message` field (e.g. "Not Found").
  if (!user || user.message) return null;

  const byPopularity = (a, b) => {
    const starDelta = Number(b.stargazers_count || 0) - Number(a.stargazers_count || 0);
    if (starDelta !== 0) return starDelta;
    const forkDelta = Number(b.forks_count || 0) - Number(a.forks_count || 0);
    if (forkDelta !== 0) return forkDelta;
    return new Date(b.updated_at || 0) - new Date(a.updated_at || 0);
  };

  const topRepos = (Array.isArray(repos) ? repos : [])
    .sort(byPopularity)
    .slice(0, 8)
    .map((r) => ({
      name: r.name, stars: r.stargazers_count, forks: r.forks_count,
      lang: r.language || null, description: (r.description || '').slice(0, 180),
      url: r.html_url,
    }));

  return {
    platform: 'github', handle: user.login, name: user.name || user.login,
    bio: user.bio || null, avatar: user.avatar_url, url: user.html_url,
    followers: user.followers, following: user.following,
    publicRepos: user.public_repos, company: user.company || null,
    location: user.location || null, blog: user.blog || null,
    createdAt: user.created_at,
    topRepos,
  };
}
77
+
78
+ // ─── Bluesky ──────────────────────────────────────────────────────────────────
79
+
80
// Fetch a Bluesky actor's profile via the public AppView XRPC API.
// Returns a normalized profile object, or null on any failure (8s timeout).
export async function fetchBlueskyProfileData(handle) {
  const ac = new AbortController();
  const t = setTimeout(() => ac.abort(), 8000);
  try {
    const endpoint = `https://api.bsky.app/xrpc/app.bsky.actor.getProfile?actor=${encodeURIComponent(handle)}`;
    const r = await fetch(endpoint, {
      headers: { 'User-Agent': 'TermSearch/1.0' }, signal: ac.signal,
    });
    if (!r.ok) return null;
    const d = await r.json();
    if (!d.handle) return null;
    return {
      platform: 'bluesky',
      handle: d.handle,
      name: d.displayName || d.handle,
      bio: d.description || null,
      avatar: d.avatar || null,
      url: `https://bsky.app/profile/${d.handle}`,
      followers: d.followersCount,
      following: d.followsCount,
      postsCount: d.postsCount,
      createdAt: d.indexedAt || null,
    };
  } catch {
    return null;
  } finally {
    clearTimeout(t);
  }
}
100
+
101
+ // ─── Reddit ───────────────────────────────────────────────────────────────────
102
+
103
// Fetch a Reddit account profile via the public about.json endpoint.
// Returns a normalized profile object, or null on failure or when the
// account is suspended (8s timeout).
export async function fetchRedditProfileData(handle) {
  const ac = new AbortController();
  const t = setTimeout(() => ac.abort(), 8000);
  try {
    const endpoint = `https://www.reddit.com/user/${encodeURIComponent(handle)}/about.json`;
    const r = await fetch(endpoint, {
      headers: { 'User-Agent': 'TermSearch/1.0 (by /u/termsearch)', Accept: 'application/json' },
      signal: ac.signal,
    });
    if (!r.ok) return null;
    const payload = await r.json();
    const u = payload?.data;
    if (!u || u.is_suspended) return null;
    return {
      platform: 'reddit',
      handle: u.name,
      name: u.name,
      bio: u.subreddit?.public_description || null,
      // icon_img carries a cache-busting query string — keep the base URL only.
      avatar: u.icon_img ? u.icon_img.split('?')[0] : null,
      url: `https://reddit.com/u/${u.name}`,
      karma: u.total_karma,
      linkKarma: u.link_karma,
      commentKarma: u.comment_karma,
      createdAt: new Date(u.created_utc * 1000).toISOString(),
    };
  } catch {
    return null;
  } finally {
    clearTimeout(t);
  }
}
126
+
127
+ // ─── Similar profiles ─────────────────────────────────────────────────────────
128
+
129
// Suggest similar GitHub users: search for accounts sharing the profile's
// primary repo language (when known), with a follower floor scaled to the
// source profile's reach. Excludes the profile itself; returns up to 6.
async function findSimilarGitHub(profile) {
  const languages = [...new Set((profile.topRepos || []).map((r) => r.lang).filter(Boolean))].slice(0, 1);
  const minFollowers = Math.max(5, Math.floor((profile.followers || 100) * 0.15));
  const query = languages.length > 0
    ? `language:${languages[0]} followers:>${minFollowers} repos:>2`
    : `followers:>${minFollowers} repos:>3`;
  const data = await ghFetch(`/search/users?q=${encodeURIComponent(query)}&per_page=8&sort=followers`);
  const candidates = data?.items || [];
  return candidates
    .filter((u) => u.login.toLowerCase() !== profile.handle.toLowerCase())
    .slice(0, 6)
    .map((u) => ({ platform: 'github', handle: u.login, name: u.login, avatar: u.avatar_url, url: u.html_url }));
}
139
+
140
// Suggest similar Bluesky actors by searching with the first few words of
// the source profile's bio (falling back to its display name).
// Excludes the profile itself; returns up to 6 normalized entries.
async function findSimilarBluesky(profile) {
  const searchTerms = (profile.bio || profile.name || '').split(/\s+/).slice(0, 4).join(' ');
  if (!searchTerms.trim()) return [];
  const actors = await fetchBlueskyActors(searchTerms, 8);
  const others = actors.filter((a) => a.handle !== profile.handle).slice(0, 6);
  return others.map((a) => ({
    platform: 'bluesky',
    handle: a.handle,
    // Titles come back as "Name (@handle)" — drop the trailing handle part.
    name: (a.title || '').replace(/ \(@.*\)$/, '') || a.handle,
    bio: a.snippet || null,
    url: a.url,
  }));
}
153
+
154
+ // ─── Main scan ────────────────────────────────────────────────────────────────
155
+
156
// Scan one social profile: fetch normalized profile data for the detected
// platform and, for GitHub/Bluesky, a best-effort list of similar accounts.
// `target` is the { platform, handle, url } shape from detectProfileTarget().
// Returns { target, profile, similar }; profile is null when nothing resolved.
export async function scanProfile(target) {
  const { platform, handle, url } = target;
  let profile = null;
  let similar = [];

  switch (platform) {
    case 'github':
      profile = await fetchGitHubProfileData(handle);
      // Similar-account lookup is best-effort: failures degrade to [].
      if (profile) similar = await findSimilarGitHub(profile).catch(() => []);
      break;
    case 'bluesky':
      profile = await fetchBlueskyProfileData(handle);
      if (profile) similar = await findSimilarBluesky(profile).catch(() => []);
      break;
    case 'reddit':
      profile = await fetchRedditProfileData(handle);
      break;
    case 'twitter':
      profile = await tryNitterInstances(handle);
      break;
    case 'instagram':
      profile = await fetchInstagramProfile(handle);
      break;
    case 'youtube':
      profile = await fetchYouTubeProfile(handle, url);
      break;
    case 'facebook':
      // The Facebook scraper needs the full page URL, not just a handle.
      profile = url ? await fetchFacebookPage(url, handle) : null;
      break;
    case 'linkedin':
      profile = await fetchLinkedInProfile(handle);
      break;
    case 'tiktok':
      profile = await fetchTikTokProfile(handle);
      break;
    case 'telegram':
      profile = await fetchTelegramProfile(handle);
      break;
    case 'auto': {
      // Try platforms in order for @handle
      const attempts = [
        () => fetchBlueskyProfileData(handle),
        () => fetchGitHubProfileData(handle),
        () => fetchRedditProfileData(handle),
        () => tryNitterInstances(handle),
        () => fetchTelegramProfile(handle),
      ];
      // First platform that yields a profile wins; thrown errors count as misses.
      for (const attempt of attempts) {
        profile = await attempt().catch(() => null);
        if (profile) break;
      }
      break;
    }
  }

  return { target, profile: profile || null, similar };
}
@@ -0,0 +1,119 @@
1
+ // Tiered cache: L1 in-process RAM + L2 disk (persistent across restarts)
2
+ // L1: hot-set in RAM, bounded, lost on restart (fast Map with LRU eviction)
3
+ // L2: disk JSON files, larger budget, survives restarts
4
+ // Read path: L1 hit → return; L2 hit → promote to L1 (remaining TTL) → return
5
+ // Write path: write to both L1 and L2 (async disk write, non-blocking)
6
+
7
+ import crypto from 'crypto';
8
+ import fs from 'fs';
9
+ import path from 'path';
10
+
11
// L1: in-process Map cache with per-entry TTL and true LRU eviction.
// Map iteration order is insertion order, so re-inserting a key on every hit
// keeps the Map's first key the least-recently-USED one. (Previously `get`
// never refreshed recency, making eviction FIFO despite the LRU claim.)
export function makeCache(maxSize) {
  const store = new Map();
  return {
    // Returns the cached value, or undefined when missing or expired.
    get(key) {
      const entry = store.get(key);
      if (!entry) return undefined;
      if (Date.now() > entry.expires) { store.delete(key); return undefined; }
      // Refresh recency: delete + re-set moves the key to the back of the Map.
      store.delete(key);
      store.set(key, entry);
      return entry.value;
    },
    // Stores `value` for `ttl` ms, evicting the LRU entry when at capacity.
    set(key, value, ttl) {
      // Re-setting an existing key must refresh its recency, not evict others.
      store.delete(key);
      if (store.size >= maxSize) store.delete(store.keys().next().value);
      store.set(key, { value, expires: Date.now() + ttl });
    },
  };
}
27
+
28
// Stable, filename-safe digest of a cache key (40-char hex SHA-1).
function _hashKey(key) {
  const hasher = crypto.createHash('sha1');
  hasher.update(key);
  return hasher.digest('hex');
}
31
+
32
// Sweep the disk-cache directory: delete expired/corrupt entries, then evict
// oldest-by-mtime entries until both maxEntries and maxBytes budgets hold.
// Fully fire-and-forget: every fs error is ignored (another process may be
// racing us on the same directory).
function _diskEvict(dir, maxEntries, maxBytes) {
  fs.readdir(dir, (err, files) => {
    if (err) return;
    const jsonFiles = files.filter((f) => f.endsWith('.json'));
    if (!jsonFiles.length) return;
    const stats = [];
    // Countdown latch: the aggregation body below runs exactly once, on the
    // LAST stat callback to complete (failed stats still decrement).
    let pending = jsonFiles.length;
    for (const f of jsonFiles) {
      const fp = path.join(dir, f);
      fs.stat(fp, (statErr, st) => {
        if (!statErr) stats.push({ fp, mtime: st.mtimeMs, size: st.size });
        if (--pending > 0) return;
        stats.sort((a, b) => a.mtime - b.mtime); // oldest first
        let totalBytes = stats.reduce((s, e) => s + e.size, 0);
        const now = Date.now();
        const alive = [];
        // Pass 1: drop expired or unparseable entries, keeping totals in sync.
        for (const s of stats) {
          try {
            const { expires } = JSON.parse(fs.readFileSync(s.fp, 'utf8'));
            if (now > expires) { fs.unlink(s.fp, () => {}); totalBytes -= s.size; }
            else alive.push(s);
          } catch { fs.unlink(s.fp, () => {}); }
        }
        // Pass 2: evict oldest survivors until both budgets are satisfied.
        while (alive.length > maxEntries || totalBytes > maxBytes) {
          const oldest = alive.shift();
          if (!oldest) break;
          totalBytes -= oldest.size;
          fs.unlink(oldest.fp, () => {});
        }
      });
    }
  });
}
65
+
66
// L2: disk cache — sync read, async fire-and-forget write. Each entry lives
// in its own JSON file named by the SHA-1 of its key.
export function makeDiskCache(dir, maxEntries, maxBytes) {
  try { fs.mkdirSync(dir, { recursive: true }); } catch { /* ignore */ }

  const fileFor = (key) => path.join(dir, `${_hashKey(key)}.json`);
  let evictTimer = null;

  // Debounced eviction: at most one sweep per 15s burst of writes; unref'd
  // so the timer never keeps the process alive.
  const scheduleEviction = () => {
    if (evictTimer) return;
    evictTimer = setTimeout(() => {
      evictTimer = null;
      _diskEvict(dir, maxEntries, maxBytes);
    }, 15_000);
    evictTimer.unref?.();
  };

  return {
    // Returns { value, remainingTtl } or undefined (missing/expired/corrupt).
    get(key) {
      const fp = fileFor(key);
      try {
        const { value, expires } = JSON.parse(fs.readFileSync(fp, 'utf8'));
        if (Date.now() > expires) {
          fs.unlink(fp, () => {});
          return undefined;
        }
        return { value, remainingTtl: expires - Date.now() };
      } catch {
        return undefined;
      }
    },
    // Writes asynchronously (never blocks the caller) and schedules a sweep.
    set(key, value, ttl) {
      const payload = JSON.stringify({ value, expires: Date.now() + ttl });
      fs.writeFile(fileFor(key), payload, (err) => {
        if (err) console.warn('[disk-cache] write error:', err.message);
      });
      scheduleEviction();
    },
  };
}
91
+
92
// Tiered cache: L1 (RAM) + L2 (disk). Same get/set API as makeCache.
// Reads promote disk hits into RAM with their remaining TTL, capped at
// 15 minutes so the hot set doesn't pin stale disk entries for hours.
export function makeTieredCache(l1Max, diskDir, diskMaxEntries, diskMaxBytes) {
  const ram = makeCache(l1Max);
  const disk = makeDiskCache(diskDir, diskMaxEntries, diskMaxBytes);
  const L1_PROMO_CAP = 15 * 60 * 1000;
  return {
    get(key) {
      const hot = ram.get(key);
      if (hot !== undefined) return hot;
      const cold = disk.get(key);
      if (cold === undefined) return undefined;
      ram.set(key, cold.value, Math.min(cold.remainingTtl, L1_PROMO_CAP));
      return cold.value;
    },
    // Writes go to both tiers; the disk write is async and non-blocking.
    set(key, value, ttl) {
      ram.set(key, value, ttl);
      disk.set(key, value, ttl);
    },
  };
}
114
+
115
// Deterministic cache key for a search request. Provider order is normalized
// by sorting (a copy — the caller's array is untouched) and the query by
// lowercasing + trimming, so equivalent requests share one key.
export function searchCacheKey(query, lang, safe, providerList, tier, category = 'web', page = 1) {
  const providers = [...providerList].sort().join(',');
  const normalizedQuery = query.toLowerCase().trim();
  return [tier, lang, safe, providers, category, `p${page}`, normalizedQuery].join(':');
}