termsearch 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,155 @@
1
+ // Result ranking and deduplication for TermSearch
2
+
3
// Source quality weights — higher = results from this source ranked first.
// Frozen: shared module-level constant, never mutated.
const SOURCE_ENGINE_WEIGHTS = Object.freeze({
  'wikipedia': 1.8,
  'wikipedia-api': 1.8,
  'brave-api': 1.5,
  'mojeek-api': 1.4,
  'duckduckgo': 1.2,
  'searxng': 1.1,
  // engines from SearXNG
  'startpage': 1.3,
  'qwant': 1.2,
  'bing': 1.1,
  'google': 1.1,
  'yahoo': 1.0,
});

/**
 * Look up the ranking weight for a search engine identifier.
 * @param {string|null|undefined} engine - engine name, case-insensitive
 * @returns {number} configured weight, or 1.0 for unknown engines
 */
function getSourceWeight(engine) {
  const key = String(engine || '').toLowerCase();
  // Object.hasOwn guards against inherited prototype keys (e.g. an engine
  // named "constructor" would otherwise return a function, not a number).
  return Object.hasOwn(SOURCE_ENGINE_WEIGHTS, key) ? SOURCE_ENGINE_WEIGHTS[key] : 1.0;
}
22
+
23
/**
 * Extract the hostname of a URL, lowercased and without a leading "www.".
 * @param {string|null|undefined} url
 * @returns {string} hostname, or '' when the URL cannot be parsed
 */
export function safeHostname(url) {
  try {
    const { hostname } = new URL(String(url || ''));
    return hostname.replace(/^www\./, '').toLowerCase();
  } catch {
    return '';
  }
}
30
+
31
/**
 * Canonicalize a URL for deduplication: strip the fragment, tracking query
 * parameters, default ports and trailing slashes.
 * @param {string|null|undefined} url
 * @returns {string} normalized URL string, or '' when unparseable
 */
export function normalizeComparableUrl(url) {
  try {
    const u = new URL(String(url || '').trim());
    u.hash = '';
    const trackingKeys = ['utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content', 'gclid', 'fbclid'];
    for (const key of trackingKeys) {
      u.searchParams.delete(key);
    }
    const isDefaultPort =
      (u.protocol === 'https:' && u.port === '443') ||
      (u.protocol === 'http:' && u.port === '80');
    if (isDefaultPort) {
      u.port = '';
    }
    const trimmedPath = u.pathname.replace(/\/+$/, '');
    u.pathname = trimmedPath === '' ? '/' : trimmedPath;
    return u.toString();
  } catch {
    return '';
  }
}
45
+
46
/**
 * Merge two result arrays, deduplicating by normalized URL.
 * Every primary result is kept; secondary results are appended only when
 * their normalized URL has not been seen yet.
 * @param {Array<{url: string}>} primary
 * @param {Array<{url: string}>} secondary
 * @returns {Array} merged results, primary order preserved
 */
export function mergeSearchResultSets(primary, secondary) {
  const merged = [...primary];
  const seen = new Set();
  for (const result of primary) {
    const key = normalizeComparableUrl(result.url);
    if (key) seen.add(key);
  }
  for (const result of secondary) {
    const key = normalizeComparableUrl(result.url);
    if (!key || seen.has(key)) continue;
    seen.add(key);
    merged.push(result);
  }
  return merged;
}
61
+
62
/**
 * Rank results by source diversity, penalizing repeated engines/hosts.
 * The first hit from an engine or host earns a bonus; later hits from the
 * same engine/host are increasingly penalized. The temporary diversityScore
 * is stripped from the returned items.
 * @param {Array<{engine?: string, url?: string, score?: number}>} results
 * @returns {Array} results sorted by descending diversity score
 */
export function rankResultsBySourceDiversity(results) {
  const engineHits = new Map();
  const hostHits = new Map();

  const scored = (results || []).map((item, index) => {
    const engine = String(item.engine || '').toLowerCase();
    const host = safeHostname(item.url);
    const priorEngine = engineHits.get(engine) || 0;
    const priorHost = hostHits.get(host) || 0;
    engineHits.set(engine, priorEngine + 1);
    if (host) hostHits.set(host, priorHost + 1);

    const engineDiversity = priorEngine === 0 ? 2.0 : Math.max(0.1, 1.2 - priorEngine * 0.25);
    const hostDiversity = priorHost === 0 ? 1.4 : Math.max(-1.0, 0.3 - priorHost * 0.6);
    const diversityScore =
      getSourceWeight(engine) * 2.0 +
      Math.min(Number(item.score || 0), 2.0) +
      engineDiversity +
      hostDiversity -
      index * 0.03; // slight preference for the original ordering
    return { ...item, diversityScore };
  });

  scored.sort((a, b) => b.diversityScore - a.diversityScore);
  return scored.map(({ diversityScore, ...rest }) => rest);
}
86
+
87
// Build per-engine fetch plan for AI document fetching
/**
 * Select up to `maxTotal` URLs from search results, round-robin across
 * engines (highest-weighted engines visited first), aiming for
 * `minPerEngine` URLs from each engine and at most `maxPerDomain` URLs
 * per hostname.
 * @param {Array<{url?: string, engine?: string}>} results
 * @param {object} [options]
 * @param {number} [options.minPerEngine=3] - target picks per engine (phase 1)
 * @param {number} [options.maxTotal=15] - hard cap on total picks
 * @param {number} [options.maxPerDomain=2] - hard cap per hostname
 * @returns {string[]} ordered list of normalized URLs to fetch
 */
export function buildPerEngineFetchPlan(results, {
  minPerEngine = 3,
  maxTotal = 15,
  maxPerDomain = 2,
} = {}) {
  // Bucket candidate URLs by engine, dropping non-http(s) URLs and
  // likely-unfetchable ones (login pages, binary downloads).
  const perEngine = new Map();
  for (const result of results || []) {
    const url = normalizeComparableUrl(result?.url);
    if (!url || !/^https?:\/\//.test(url) || /(login|signin|\.pdf|\.zip|\.exe)/i.test(url)) continue;
    const engine = String(result?.engine || 'unknown').toLowerCase();
    if (!perEngine.has(engine)) perEngine.set(engine, []);
    perEngine.get(engine).push(url);
  }

  // Visit engines from highest source weight to lowest.
  const engineOrder = Array.from(perEngine.keys())
    .sort((a, b) => getSourceWeight(b) - getSourceWeight(a));

  const picked = [];
  const seen = new Set();             // URLs already selected
  const perEngineCount = new Map();   // picks made per engine
  const perDomainCount = new Map();   // picks made per hostname
  const nextIndex = new Map(engineOrder.map((e) => [e, 0])); // per-engine pool cursor

  // Phase 1: round-robin — take at most one URL per engine per pass until
  // every engine reaches minPerEngine (or its pool is exhausted) or the
  // maxTotal cap is hit. `progressed` detects a full pass with no pick.
  let progressed = true;
  while (progressed && picked.length < maxTotal) {
    progressed = false;
    for (const engine of engineOrder) {
      if (picked.length >= maxTotal) break;
      const have = perEngineCount.get(engine) || 0;
      if (have >= minPerEngine) continue;
      const pool = perEngine.get(engine) || [];
      let idx = nextIndex.get(engine) || 0;
      // Advance this engine's cursor to the next acceptable URL, if any.
      while (idx < pool.length) {
        const candidate = pool[idx++];
        const host = safeHostname(candidate);
        if (seen.has(candidate)) continue;
        if (host && (perDomainCount.get(host) || 0) >= maxPerDomain) continue;
        seen.add(candidate);
        picked.push(candidate);
        perEngineCount.set(engine, have + 1);
        if (host) perDomainCount.set(host, (perDomainCount.get(host) || 0) + 1);
        progressed = true;
        break; // one pick per engine per pass
      }
      nextIndex.set(engine, idx);
    }
  }

  // Phase 2: fill remaining slots from all engines without the minPerEngine
  // constraint, still honoring maxPerDomain and URL deduplication.
  if (picked.length < maxTotal) {
    for (const engine of engineOrder) {
      const pool = perEngine.get(engine) || [];
      let idx = nextIndex.get(engine) || 0;
      while (idx < pool.length && picked.length < maxTotal) {
        const candidate = pool[idx++];
        const host = safeHostname(candidate);
        if (seen.has(candidate)) continue;
        if (host && (perDomainCount.get(host) || 0) >= maxPerDomain) continue;
        seen.add(candidate);
        picked.push(candidate);
        if (host) perDomainCount.set(host, (perDomainCount.get(host) || 0) + 1);
      }
      nextIndex.set(engine, idx);
    }
  }

  return picked;
}
package/src/server.js ADDED
@@ -0,0 +1,68 @@
1
// TermSearch — personal search engine server
// Replaces the 4290-line monolith with a clean modular setup

import express from 'express';
import path from 'path';
import { fileURLToPath } from 'url';
import config from './config/manager.js';
import { initCaches } from './search/engine.js';
import { createRouter } from './api/routes.js';
import { createRateLimiters, ipMiddleware, applySecurityHeaders } from './api/middleware.js';

// ES modules have no __dirname; reconstruct it from import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
const FRONTEND_DIST = path.join(__dirname, '../frontend/dist');

// Initialize config and caches before any routes are wired up.
const cfg = config.getConfig();
const dataDir = config.getDataDir();
initCaches(dataDir, cfg);

// Express app setup
const app = express();
app.set('trust proxy', 1); // trust one reverse-proxy hop for client IPs
app.disable('x-powered-by'); // don't advertise Express in response headers

// Middleware
app.use(ipMiddleware);
app.use(express.json({ limit: '256kb' })); // cap JSON request bodies

// Rate limiters
const rateLimiters = createRateLimiters(cfg);

// API routes
const router = createRouter(config, rateLimiters);
app.use(router);

// Serve frontend static files
app.use(express.static(FRONTEND_DIST, {
  maxAge: '1h',
  etag: true,
  index: 'index.html',
}));

// SPA fallback — serve index.html for any non-API route.
// NOTE(review): the '*' wildcard assumes Express 4 path syntax; Express 5
// changed wildcard handling — confirm the installed major version.
app.get('*', (req, res) => {
  applySecurityHeaders(res);
  res.sendFile(path.join(FRONTEND_DIST, 'index.html'));
});

// Start server
// NOTE(review): `||` makes a configured port of 0 (ephemeral) fall back to
// 3000 — confirm that is intended.
const port = cfg.port || 3000;
const host = cfg.host || '127.0.0.1'; // loopback-only by default

const server = app.listen(port, host, () => {
  // Server ready — bin/termsearch.js prints the startup banner
});

// Graceful shutdown: stop accepting new connections, then force-exit after
// 3s in case lingering connections keep server.close() from completing.
function shutdown(signal) {
  console.log(`\n[termsearch] ${signal} received, shutting down...`);
  server.close(() => process.exit(0));
  setTimeout(() => process.exit(0), 3000);
}
process.on('SIGINT', () => shutdown('SIGINT'));
process.on('SIGTERM', () => shutdown('SIGTERM'));

export default app;
export { port, host };
@@ -0,0 +1,356 @@
1
+ // Social platform scrapers — ported from MmmSearch
2
+ // Twitter/Nitter, Instagram, YouTube, Facebook, LinkedIn, TikTok, Telegram
3
+
4
// Desktop Chrome user agent — sent by default on every scrape request.
const BROWSER_UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';
// iPhone Safari user agent — used where the mobile site is scraped (e.g. m.facebook.com).
const MOBILE_UA = 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Mobile/15E148 Safari/604.1';

// Public Nitter mirrors, tried in this order. Frozen: shared constant.
const NITTER_INSTANCES = Object.freeze([
  'https://nitter.net',
  'https://nitter.privacydev.net',
  'https://nitter.poast.org',
  'https://xcancel.com',
  'https://nitter.cz',
  'https://nitter.space',
  'https://lightbrd.com',
]);

// ─── Shared helpers ───────────────────────────────────────────────────────────

/**
 * GET `url` and return the response body as text, or null on any failure
 * (non-2xx status, network error, or timeout). The whole request — headers
 * and body — is bounded by `timeoutMs` via AbortController.
 */
async function fetchWith(url, { headers = {}, timeoutMs = 8000 } = {}) {
  const ac = new AbortController();
  const t = setTimeout(() => ac.abort(), timeoutMs);
  try {
    const r = await fetch(url, { headers: { 'User-Agent': BROWSER_UA, ...headers }, signal: ac.signal });
    if (!r.ok) return null;
    // `await` (not a bare `return r.text()`): without it a body-read failure
    // bypasses this catch and rejects in the caller, and the timeout would
    // only cover the headers because finally clears the timer on return.
    return await r.text();
  } catch { return null; }
  finally { clearTimeout(t); }
}
29
+
30
/**
 * Extract Open Graph <meta> tags into a plain { property: content } map,
 * handling both attribute orders and decoding common HTML entities.
 */
function parseOgTags(html) {
  const decodeEntities = (s) => s
    .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"').replace(/&#39;/g, "'").replace(/&#x27;/g, "'");
  const og = {};
  // property="og:x" ... content="y"
  const propFirst = /<meta\s[^>]*property=["']og:([^"']+)["'][^>]*content=["']([^"']*)/gi;
  for (let m = propFirst.exec(html); m !== null; m = propFirst.exec(html)) {
    og[m[1]] = decodeEntities(m[2]);
  }
  // content="y" ... property="og:x" — reversed attribute order, first regex wins ties
  const contentFirst = /<meta\s[^>]*content=["']([^"']*)[^>]*property=["']og:([^"']+)["']/gi;
  for (let m = contentFirst.exec(html); m !== null; m = contentFirst.exec(html)) {
    if (!og[m[2]]) og[m[2]] = decodeEntities(m[1]);
  }
  return og;
}
42
+
43
/**
 * Read the content attribute of a <meta name="..."> tag (either attribute
 * order), with common HTML entities decoded. Returns null when absent/empty.
 */
function parseMetaContent(html, name) {
  const quoted = String(name || '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const nameFirst = new RegExp(`<meta\\s[^>]*name=["']${quoted}["'][^>]*content=["']([^"']*)`, 'i');
  const contentFirst = new RegExp(`<meta\\s[^>]*content=["']([^"']*)[^>]*name=["']${quoted}["']`, 'i');
  const m = html.match(nameFirst) || html.match(contentFirst);
  if (!m) return null;
  const decoded = m[1]
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .trim();
  return decoded || null;
}
50
+
51
// First <title> tag's text with whitespace collapsed, or null when missing/empty.
function parseTitleTag(html) {
  const m = html.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
  if (!m) return null;
  const collapsed = m[1].replace(/\s+/g, ' ').trim();
  return collapsed || null;
}
54
+
55
/**
 * Parse a loosely formatted count like "1.2K", "3M", "12,345" into an integer.
 * @param {string|null|undefined} raw
 * @returns {number|null} rounded count, or null when unparseable
 */
export function parseLooseCount(raw) {
  const value = String(raw || '').trim();
  if (!value) return null;
  const compact = value.match(/([\d.,]+)\s*([KMB])?/i);
  if (!compact) return null;
  let digits = compact[1];
  // "12,345" / "1,234,567" use commas as thousands separators — strip them.
  // (Previously every comma became a dot, so "12,345" parsed as 12.)
  // Otherwise treat a comma as a European decimal point ("1,2K" → 1.2K).
  if (/^\d{1,3}(?:,\d{3})+$/.test(digits)) {
    digits = digits.replace(/,/g, '');
  } else {
    digits = digits.replace(/,/g, '.');
  }
  const num = Number.parseFloat(digits);
  if (!Number.isFinite(num)) return null;
  const suffix = (compact[2] || '').toUpperCase();
  const multiplier = suffix === 'K' ? 1_000 : suffix === 'M' ? 1_000_000 : suffix === 'B' ? 1_000_000_000 : 1;
  return Math.round(num * multiplier);
}
66
+
67
// Extract up to five posts ({ text, url, date }) from a Nitter RSS feed.
function parseNitterRSS(xml) {
  const posts = [];
  const itemRe = /<item>([\s\S]*?)<\/item>/gi;
  let item;
  while ((item = itemRe.exec(xml)) !== null) {
    const body = item[1];
    // Titles are usually CDATA-wrapped; fall back to a plain <title>.
    const title = body.match(/<title><!\[CDATA\[([\s\S]*?)\]\]><\/title>/i) || body.match(/<title>([\s\S]*?)<\/title>/i);
    if (title) {
      const link = body.match(/<link>([\s\S]*?)<\/link>/i);
      const date = body.match(/<pubDate>([\s\S]*?)<\/pubDate>/i);
      posts.push({
        text: title[1].trim(),
        url: link?.[1]?.trim() || null,
        date: date?.[1]?.trim() || null,
      });
    }
    if (posts.length >= 5) break;
  }
  return posts;
}
80
+
81
+ // ─── SocialBlade fallback ─────────────────────────────────────────────────────
82
+
83
/**
 * Last-resort profile lookup via the public SocialBlade stats page.
 * Returns a minimal profile (name / raw follower string / avatar) or null
 * when the platform is unsupported, the fetch fails, or OG tags are missing.
 */
async function trySocialBlade(platform, handle) {
  // SocialBlade URL path per supported platform.
  const pathMap = {
    instagram: `instagram/user/${handle}`,
    youtube: `youtube/channel/${handle}`,
    tiktok: `tiktok/user/${handle}`,
    twitter: `twitter/user/${handle}`,
    facebook: `facebook/user/${handle}`,
  };
  const sbPath = pathMap[platform];
  if (!sbPath) return null;
  const html = await fetchWith(`https://socialblade.com/${sbPath}`, { timeoutMs: 10000 });
  if (!html) return null;
  const og = parseOgTags(html);
  if (!og.title) return null;
  // Split the OG title on "<Platform> Stats" to recover just the name.
  const name = og.title.split(/\s+(?:Instagram|YouTube|Twitch|TikTok|Twitter|Facebook)\s+Stats/i)[0].trim() || handle;
  // Follower/subscriber count is pulled from the OG description text.
  const followersM = og.description?.match(/([\d,\.]+[KkMm]?)\s*(?:follower|subscriber)/i);
  return {
    platform, handle, name, bio: null,
    followers: followersM?.[1] || null, // raw string like "1.2M", not parsed
    avatar: og.image || null,
    url: og.url || `https://socialblade.com/${sbPath}`,
    scraped: true, source: 'socialblade',
  };
}
107
+
108
+ // ─── Twitter/X (via Nitter RSS) ───────────────────────────────────────────────
109
+
110
/**
 * Fetch a Twitter/X profile via public Nitter mirrors' RSS feeds.
 * Tries each NITTER_INSTANCES host in order and returns the first usable
 * profile (name, bio, avatar, counts, up to 5 recent posts), or null.
 */
export async function tryNitterInstances(handle) {
  for (const base of NITTER_INSTANCES) {
    const ac = new AbortController();
    const t = setTimeout(() => ac.abort(), 6000); // per-instance timeout
    try {
      const rssUrl = `${base}/${encodeURIComponent(handle)}/rss`;
      const r = await fetch(rssUrl, {
        headers: { 'User-Agent': BROWSER_UA, Accept: 'application/rss+xml, application/xml, text/xml, */*' },
        signal: ac.signal,
      });
      clearTimeout(t);
      // NOTE(review): the timer is cleared before the body is read, so the
      // 6s timeout only bounds connection + headers, not the body download.
      if (!r.ok) continue;
      const xml = await r.text();
      if (!xml.includes('<rss')) continue; // HTML error page, not a feed
      const posts = parseNitterRSS(xml);
      // Feed title is "<display name> / Twitter"; description carries the bio.
      const nameM = xml.match(/<title><!\[CDATA\[(.*?) \/ Twitter\]\]><\/title>/i) || xml.match(/<title>(.*?) \/ Twitter<\/title>/i);
      const descM = xml.match(/<description><!\[CDATA\[([\s\S]*?)\]\]><\/description>/i);
      const imgM = xml.match(/<url>(https?:\/\/[^<]+)<\/url>/i); // first <url> is used as the avatar
      const followersM = xml.match(/(\d[\d,\.]+)\s*Followers/i);
      const followingM = xml.match(/(\d[\d,\.]+)\s*Following/i);
      return {
        platform: 'twitter', handle,
        name: nameM?.[1]?.trim() || handle,
        bio: descM?.[1]?.trim().slice(0, 300) || null,
        avatar: imgM?.[1]?.trim() || null,
        // Separators are stripped before parsing the counts.
        followers: followersM ? parseInt(followersM[1].replace(/[,\.]/g, '')) : null,
        following: followingM ? parseInt(followingM[1].replace(/[,\.]/g, '')) : null,
        url: `https://x.com/${handle}`,
        recentPosts: posts, scraped: true, source: base,
      };
    } catch { clearTimeout(t); continue; } // this mirror failed — try the next
  }
  return null;
}
144
+
145
+ // ─── Instagram ────────────────────────────────────────────────────────────────
146
+
147
/**
 * Query Instagram's internal web-profile JSON endpoint.
 * A session cookie string may be supplied via TERMSEARCH_INSTAGRAM_SESSION.
 * Returns a profile object, or null on failure or a blocked response.
 */
async function tryInstagramApi(handle) {
  // App id sent by the instagram.com web client; required by this endpoint.
  // NOTE(review): unofficial value — may break without notice; confirm it.
  const IG_APP_ID = '936619743392459';
  const IG_SESSION = process.env.TERMSEARCH_INSTAGRAM_SESSION || '';
  const url = `https://i.instagram.com/api/v1/users/web_profile_info/?username=${encodeURIComponent(handle)}`;
  const ac = new AbortController();
  const t = setTimeout(() => ac.abort(), 10000);
  try {
    const hdrs = {
      'x-ig-app-id': IG_APP_ID,
      'x-requested-with': 'XMLHttpRequest',
      'Referer': 'https://www.instagram.com/',
      'Accept': '*/*',
      'User-Agent': BROWSER_UA,
    };
    if (IG_SESSION) hdrs['Cookie'] = IG_SESSION; // optional authenticated request
    const r = await fetch(url, { headers: hdrs, signal: ac.signal });
    if (!r.ok) return null;
    const data = await r.json();
    if (data?.status === 'fail') return null; // API-level rejection
    const user = data?.data?.user;
    // A payload without is_private is treated as a stub/blocked response.
    if (!user || user.is_private === undefined) return null;
    return {
      platform: 'instagram', handle,
      name: user.full_name || handle,
      bio: user.biography || null,
      avatar: user.profile_pic_url || null,
      followers: user.edge_followed_by?.count ?? null,
      following: user.edge_follow?.count ?? null,
      posts: user.edge_owner_to_timeline_media?.count ?? null,
      isPrivate: user.is_private || false,
      externalUrl: user.external_url || null,
      url: `https://www.instagram.com/${handle}/`,
      scraped: true, source: 'instagram-api',
    };
  } catch { return null; }
  finally { clearTimeout(t); }
}
184
+
185
/**
 * Scrape an Instagram profile via the dumpor.io mirror.
 * Rejects placeholder pages (generic titles, dumpor's own OG image) and
 * extracts counts, bio text, and the first plausible external bio link.
 */
async function tryDumpor(handle) {
  const html = await fetchWith(`https://dumpor.io/v/${encodeURIComponent(handle)}`);
  if (!html) return null;
  const og = parseOgTags(html);
  if (!og.title) return null;
  // A generic site title means the profile wasn't found.
  if (og.title.toLowerCase().includes('dumpor') || og.title.toLowerCase().includes('watch instagram')) return null;
  // Dumpor's own placeholder image also signals a missing profile.
  if (og.image && og.image.includes('dumpor.io/images')) return null;
  const name = og.title.split('(')[0].split('•')[0].trim() || handle;
  const followersM = og.description?.match(/([\d,]+)\s*Followers/i);
  const followingM = og.description?.match(/([\d,]+)\s*Following/i);
  const postsM = og.description?.match(/([\d,]+)\s*Posts/i);
  const bioM = html.match(/class="[^"]*bio[^"]*"[^>]*>([\s\S]{1,300}?)<\/(?:p|div|span)>/i);
  const bioText = bioM ? bioM[1].replace(/<[^>]+>/g, '').trim() : null;
  // First outbound anchor that isn't Instagram/Dumpor, an asset, or a known
  // CDN — best guess at the profile's external bio link.
  let externalUrl = null;
  const linkRe = /<a[^>]+href=["'](https?:\/\/(?!(?:www\.)?(?:instagram\.com|dumpor\.io))[^"'\s>]+)["']/gi;
  let lm;
  while ((lm = linkRe.exec(html)) !== null) {
    const u = lm[1];
    if (/\.(png|jpg|gif|woff|css|js)(\?|$)/i.test(u)) continue;
    if (/fonts\.googleapis|cdn\.|static\.|analytics|fbcdn|cdninstagram/i.test(u)) continue;
    externalUrl = u; break;
  }
  return {
    platform: 'instagram', handle, name, bio: bioText,
    avatar: og.image || null,
    followers: followersM ? parseInt(followersM[1].replace(/,/g, '')) : null,
    following: followingM ? parseInt(followingM[1].replace(/,/g, '')) : null,
    posts: postsM ? parseInt(postsM[1].replace(/,/g, '')) : null,
    externalUrl, url: `https://www.instagram.com/${handle}/`,
    scraped: true, source: 'dumpor',
  };
}
217
+
218
/**
 * Fetch an Instagram profile, trying sources in order of data quality:
 * the web API endpoint, then the Dumpor mirror, then SocialBlade.
 * @returns profile object from the first source that succeeds, or null
 */
export async function fetchInstagramProfile(handle) {
  const viaApi = await tryInstagramApi(handle);
  if (viaApi) return viaApi;
  const viaDumpor = await tryDumpor(handle);
  if (viaDumpor) return viaDumpor;
  return (await trySocialBlade('instagram', handle)) || null;
}
221
+
222
+ // ─── YouTube ──────────────────────────────────────────────────────────────────
223
+
224
/**
 * Scrape a YouTube channel page for name, description and counts.
 * Tries several URL shapes (@handle, /channel/, /c/, /user/) plus an
 * optional explicit profileUrl; falls back to SocialBlade when all fail.
 * @param {string} handle - channel handle or id
 * @param {string|null} [profileUrl] - known channel URL, tried first if given
 */
export async function fetchYouTubeProfile(handle, profileUrl = null) {
  const candidates = [
    profileUrl,
    `https://www.youtube.com/@${encodeURIComponent(handle)}`,
    `https://www.youtube.com/channel/${encodeURIComponent(handle)}`,
    `https://www.youtube.com/c/${encodeURIComponent(handle)}`,
    `https://www.youtube.com/user/${encodeURIComponent(handle)}`,
  ].filter(Boolean);
  const seen = new Set(); // skip duplicate candidate URLs
  for (const candidate of candidates) {
    if (seen.has(candidate)) continue;
    seen.add(candidate);
    const html = await fetchWith(candidate, { headers: { 'Accept-Language': 'en-US,en;q=0.9' }, timeoutMs: 10000 });
    if (!html) continue;
    const og = parseOgTags(html);
    const metaDesc = parseMetaContent(html, 'description');
    const titleTag = parseTitleTag(html);
    const title = og.title || titleTag || handle;
    // Strip the " - YouTube" suffix from <title>-derived names.
    const name = title.replace(/\s*-\s*YouTube.*$/i, '').trim() || handle;
    const description = [og.description, metaDesc].map((s) => String(s || '').replace(/\s+/g, ' ').trim()).find(Boolean) || null;
    // Subscriber/video counts come from JSON embedded in the page markup.
    const subM = html.match(/"subscriberCountText"\s*:\s*\{"simpleText":"([^"]+)"/) || html.match(/"subscriberCountText"\s*:\s*\{"runs":\[\{"text":"([^"]+)"/);
    const vidM = html.match(/"videosCountText"\s*:\s*\{"runs":\[\{"text":"([^"]+)"/) || description?.match(/([\d.,KMB]+)\s+videos?/i);
    // NOTE(review): `name` always falls back to `handle` above, so this
    // guard can never trigger — probably meant to check og.title/titleTag.
    if (!name && !description) continue;
    return {
      platform: 'youtube', handle, name,
      bio: description && description !== name ? description : null,
      avatar: og.image || null,
      followers: parseLooseCount(subM?.[1]),
      posts: parseLooseCount(Array.isArray(vidM) ? vidM[1] : vidM),
      url: og.url || candidate,
      scraped: true, source: 'youtube-page',
    };
  }
  return await trySocialBlade('youtube', handle) || null;
}
259
+
260
+ // ─── Facebook ─────────────────────────────────────────────────────────────────
261
+
262
/**
 * Scrape a Facebook page's OG tags — the mobile site (with a mobile UA) is
 * tried first, then the desktop site. Returns a profile object or null.
 */
export async function fetchFacebookPage(url, handle) {
  const mobileUrl = url.replace('www.facebook.com', 'm.facebook.com').replace('fb.com', 'm.facebook.com');
  const attempts = [
    [mobileUrl, MOBILE_UA],
    [url, BROWSER_UA],
  ];
  for (const [tryUrl, ua] of attempts) {
    const html = await fetchWith(tryUrl, { headers: { 'User-Agent': ua } });
    if (!html) continue;
    const og = parseOgTags(html);
    if (!og.title && !og.description) continue;
    // "Mi piace" = Italian "likes"; the description string may be localized.
    const followersM = og.description?.match(/([\d,\.]+[KkMm]?)\s*(?:follower|like|Mi piace)/i);
    return {
      platform: 'facebook',
      handle,
      name: og.title || handle,
      bio: og.description?.split('·')[0]?.split('|')[0]?.trim() || null,
      avatar: og.image || null,
      followers: followersM?.[1] || null,
      url: og.url || url,
      scraped: true,
      source: tryUrl.includes('m.facebook') ? 'facebook-mobile' : 'facebook',
    };
  }
  return null;
}
282
+
283
+ // ─── LinkedIn ─────────────────────────────────────────────────────────────────
284
+
285
/**
 * Scrape a public LinkedIn profile's OG tags.
 * Returns a stub object ({ webOnly: true }) when the page can't be fetched
 * or only yields LinkedIn's generic title (e.g. a login wall).
 */
export async function fetchLinkedInProfile(handle) {
  const fallback = { platform: 'linkedin', handle, name: handle, url: `https://linkedin.com/in/${handle}`, webOnly: true };
  const html = await fetchWith(`https://www.linkedin.com/in/${encodeURIComponent(handle)}/`, {
    headers: { 'Accept-Language': 'en-US,en;q=0.9', 'Cache-Control': 'no-cache' },
    timeoutMs: 10000,
  });
  if (!html) return fallback;
  const og = parseOgTags(html);
  const title = og.title;
  // A title containing "linkedin" means we got the generic landing page.
  if (!title || title.toLowerCase().includes('linkedin') || title === 'LinkedIn') return fallback;
  const headline = og.description?.split('|')[0]?.split('–')[0]?.split('-')[0]?.trim() || null;
  return {
    platform: 'linkedin',
    handle,
    name: title.split('|')[0].split('-')[0].trim() || handle,
    bio: headline,
    avatar: og.image || null,
    url: og.url || `https://linkedin.com/in/${handle}`,
    scraped: true,
    source: 'linkedin-og',
  };
}
305
+
306
+ // ─── TikTok ───────────────────────────────────────────────────────────────────
307
+
308
/**
 * Scrape a TikTok profile page. Counts are pulled from JSON embedded in the
 * page markup; falls back to SocialBlade when the page yields no OG title.
 */
export async function fetchTikTokProfile(handle) {
  const html = await fetchWith(`https://www.tiktok.com/@${encodeURIComponent(handle)}`);
  if (html) {
    const og = parseOgTags(html);
    if (og.title) {
      const name = og.title.split('(')[0].split('-')[0].split('|')[0].trim() || handle;
      const followersMatch = html.match(/"followerCount"\s*:\s*(\d+)/);
      const followingMatch = html.match(/"followingCount"\s*:\s*(\d+)/);
      const likesMatch = html.match(/"heartCount"\s*:\s*(\d+)/);
      const bioMatch = html.match(/"signature"\s*:\s*"([^"]{1,300})"/);
      return {
        platform: 'tiktok',
        handle,
        name,
        bio: bioMatch?.[1] || og.description?.split('·')[0]?.trim() || null,
        avatar: og.image || null,
        followers: followersMatch ? Number.parseInt(followersMatch[1], 10) : null,
        following: followingMatch ? Number.parseInt(followingMatch[1], 10) : null,
        likes: likesMatch ? Number.parseInt(likesMatch[1], 10) : null,
        url: `https://www.tiktok.com/@${handle}`,
        scraped: true,
        source: 'tiktok-og',
      };
    }
  }
  return (await trySocialBlade('tiktok', handle)) || null;
}
332
+
333
+ // ─── Telegram ─────────────────────────────────────────────────────────────────
334
+
335
/**
 * Scrape a public t.me channel/profile preview page.
 * Returns name, bio, avatar and subscriber/member count when shown, or null
 * when the page can't be fetched.
 */
export async function fetchTelegramProfile(handle) {
  const username = handle.replace(/^@/, '');
  const html = await fetchWith(`https://t.me/${encodeURIComponent(username)}`);
  if (!html) return null;
  const og = parseOgTags(html);
  const nameBlock = html.match(/<div class="tgme_page_title"[^>]*>([\s\S]*?)<\/div>/i);
  const descBlock = html.match(/<div class="tgme_page_description"[^>]*>([\s\S]*?)<\/div>/i);
  const extraBlock = html.match(/<div class="tgme_page_extra"[^>]*>([\s\S]*?)<\/div>/i);
  const avatarBlock = html.match(/<img class="tgme_page_photo_image"[^>]*src="([^"]+)"/i);
  const name = nameBlock?.[1]?.replace(/<[^>]+>/g, '').trim() || og.title || username;
  const bio = descBlock?.[1]?.replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim() || og.description || null;
  // The "extra" block carries counts like "12 345 subscribers".
  const extraText = (extraBlock?.[1] || '').replace(/<[^>]+>/g, '').replace(/\s+/g, ' ').trim();
  const members = extraText.match(/([\d\s]+)\s*(?:subscriber|member)/i);
  if (!name && !bio) return null;
  return {
    platform: 'telegram',
    handle: username,
    name,
    bio,
    avatar: avatarBlock?.[1] || og.image || null,
    followers: members ? parseInt(members[1].replace(/\s/g, ''), 10) : null,
    url: `https://t.me/${username}`,
    scraped: true,
    source: 'telegram-page',
  };
}
@@ -0,0 +1,77 @@
1
+ // Social search — Bluesky (AT Protocol) + GDELT news
2
+ // Ported from MmmSearch
3
+
4
+ // ─── Bluesky ──────────────────────────────────────────────────────────────────
5
+
6
/**
 * Search Bluesky posts via the public AT Protocol endpoint.
 * @param {string} query
 * @param {number} [limit=25] - capped at 100 by the API
 * @returns {Promise<Array>} search-result objects; [] on any error
 */
export async function fetchBlueskyPosts(query, limit = 25) {
  const url = `https://api.bsky.app/xrpc/app.bsky.feed.searchPosts?q=${encodeURIComponent(query)}&limit=${Math.min(limit, 100)}`;
  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), 10_000);
  try {
    const r = await fetch(url, { headers: { 'User-Agent': 'TermSearch/1.0' }, signal: ac.signal });
    if (!r.ok) return [];
    const data = await r.json();
    const posts = data.posts || [];
    return posts.map((post) => {
      // The at:// URI ends with the record key, needed for the web URL.
      const uriParts = (post.uri || '').split('/');
      const rkey = uriParts.at(-1);
      const handle = post.author?.handle || 'unknown';
      const text = post.record?.text || '';
      return {
        title: `${post.author?.displayName || handle}: ${text.slice(0, 100)}`,
        url: `https://bsky.app/profile/${handle}/post/${rkey}`,
        snippet: text,
        engine: 'bluesky',
        author: handle,
        likeCount: post.likeCount || 0,
        repostCount: post.repostCount || 0,
        publishedDate: post.record?.createdAt || null,
      };
    });
  } catch {
    return [];
  } finally {
    clearTimeout(timer);
  }
}
32
+
33
/**
 * Search Bluesky user profiles via the public AT Protocol endpoint.
 * @param {string} query
 * @param {number} [limit=20] - capped at 100 by the API
 * @returns {Promise<Array>} search-result objects; [] on any error
 */
export async function fetchBlueskyActors(query, limit = 20) {
  const url = `https://api.bsky.app/xrpc/app.bsky.actor.searchActors?q=${encodeURIComponent(query)}&limit=${Math.min(limit, 100)}`;
  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), 10_000);
  try {
    const r = await fetch(url, { headers: { 'User-Agent': 'TermSearch/1.0' }, signal: ac.signal });
    if (!r.ok) return [];
    const data = await r.json();
    const actors = data.actors || [];
    return actors.map((actor) => ({
      title: `${actor.displayName || actor.handle} (@${actor.handle})`,
      url: `https://bsky.app/profile/${actor.handle}`,
      snippet: actor.description || '',
      engine: 'bluesky users',
      handle: actor.handle,
      followersCount: actor.followersCount || 0,
      publishedDate: null,
    }));
  } catch {
    return [];
  } finally {
    clearTimeout(timer);
  }
}
53
+
54
+ // ─── GDELT ────────────────────────────────────────────────────────────────────
55
+
56
/**
 * Search news articles via the GDELT DOC 2.0 API.
 * @param {string} query
 * @param {number} [limit=25] - capped at 250 records
 * @returns {Promise<Array>} article result objects (URL-less hits dropped);
 *   [] on any error
 */
export async function fetchGdeltArticles(query, limit = 25) {
  const url = `https://api.gdeltproject.org/api/v2/doc/doc?query=${encodeURIComponent(query)}&mode=artlist&maxrecords=${Math.min(limit, 250)}&format=json`;
  const ac = new AbortController();
  const timer = setTimeout(() => ac.abort(), 12_000);
  try {
    const r = await fetch(url, { headers: { 'User-Agent': 'TermSearch/1.0' }, signal: ac.signal });
    if (!r.ok) return [];
    const data = await r.json();
    const articles = data.articles || [];
    const mapped = articles.map((article) => {
      // Assumes seendate starts with YYYYMMDD; emit an ISO date when present.
      const raw = article.seendate || '';
      const publishedDate = raw.length >= 8
        ? `${raw.slice(0, 4)}-${raw.slice(4, 6)}-${raw.slice(6, 8)}`
        : null;
      return {
        title: article.title || '',
        url: article.url || '',
        snippet: article.title || '',
        engine: 'gdelt',
        publishedDate,
        source: article.domain || null,
      };
    });
    return mapped.filter((a) => a.url);
  } catch {
    return [];
  } finally {
    clearTimeout(timer);
  }
}