@0xkobold/pi-web 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/search.ts ADDED
@@ -0,0 +1,393 @@
1
+ /**
2
+ * pi-web - Search and Content Extraction
3
+ *
4
+ * Internal search and fetch utilities. No framework dependencies.
5
+ */
6
+
7
+ // ═════════════════════════════════════════════════════════════════════════════
8
+ // Types
9
+ // ═════════════════════════════════════════════════════════════════════════════
10
+
11
/**
 * Outcome of a successful content extraction.
 */
export interface ScrapingResult {
  /** Extracted plain text, truncated to the caller's maxLength. */
  content: string;
  /** Page title, or "Untitled" when none could be found. */
  title: string;
  /** Extraction strategy that produced the content — one of
   * 'fast', 'readability', 'playwright-new', 'playwright-pooled'. */
  method: string;
  /** The URL that was fetched. */
  url: string;
}
17
+
18
/**
 * One entry returned by a web search engine.
 */
export interface WebSearchResult {
  /** Result title as shown by the engine (may fall back to the hostname). */
  title: string;
  /** Target URL of the result. */
  url: string;
  /** Engine-provided summary; empty when the engine supplies none. */
  snippet: string;
}
23
+
24
+ // ═════════════════════════════════════════════════════════════════════════════
25
+ // Content Extraction (Cascade)
26
+ // ═════════════════════════════════════════════════════════════════════════════
27
+
28
+ /**
29
+ * Fast fetch for simple HTML sites
30
+ */
31
+ async function fastFetch(url: string, maxLength: number): Promise<ScrapingResult | null> {
32
+ try {
33
+ const controller = new AbortController();
34
+ const timeout = setTimeout(() => controller.abort(), 10000);
35
+
36
+ const response = await fetch(url, {
37
+ headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0)' },
38
+ signal: controller.signal
39
+ });
40
+
41
+ clearTimeout(timeout);
42
+ if (!response.ok) return null;
43
+
44
+ const html = await response.text();
45
+ const title = html.match(/<title[^>]*>(.*?)<\/title>/i)?.[1]?.trim() || "Untitled";
46
+
47
+ const content = html
48
+ .replace(/<script[^>]*>.*?<\/script>/gi, '')
49
+ .replace(/<style[^>]*>.*?<\/style>/gi, '')
50
+ .replace(/<[^>]*>/g, ' ')
51
+ .replace(/\s+/g, ' ')
52
+ .trim()
53
+ .slice(0, maxLength);
54
+
55
+ if (content.length < 200) return null;
56
+ return { content, title, method: 'fast', url };
57
+ } catch {
58
+ return null;
59
+ }
60
+ }
61
+
62
+ /**
63
+ * Readability-style extraction using regex
64
+ */
65
+ async function readabilityFetch(url: string, maxLength: number): Promise<ScrapingResult | null> {
66
+ try {
67
+ const response = await fetch(url, {
68
+ headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0)' }
69
+ });
70
+
71
+ if (!response.ok) return null;
72
+ const html = await response.text();
73
+
74
+ const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
75
+ const mainMatch = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
76
+ const contentDiv = html.match(/<div[^>]*class="[^"]*(?:content|article|post)[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
77
+
78
+ const rawContent = articleMatch?.[1] || mainMatch?.[1] || contentDiv?.[1];
79
+ if (!rawContent) return null;
80
+
81
+ const content = rawContent
82
+ .replace(/<script[^>]*>.*?<\/script>/gi, '')
83
+ .replace(/<style[^>]*>.*?<\/style>/gi, '')
84
+ .replace(/<[^>]*>/g, ' ')
85
+ .replace(/\s+/g, ' ')
86
+ .trim()
87
+ .slice(0, maxLength);
88
+
89
+ const title = html.match(/<title[^>]*>(.*?)<\/title>/i)?.[1]?.trim() || "Untitled";
90
+ if (content.length < 200) return null;
91
+
92
+ return { content, title, method: 'readability', url };
93
+ } catch {
94
+ return null;
95
+ }
96
+ }
97
+
98
+ // ═════════════════════════════════════════════════════════════════════════════
99
+ // Playwright Browser Manager
100
+ // ═════════════════════════════════════════════════════════════════════════════
101
+
102
+ class BrowserManager {
103
+ private browser: any = null;
104
+ private context: any = null;
105
+ private lastUsed: number = 0;
106
+ private readonly POOL_TTL_MS = 120000;
107
+
108
+ async getBrowser() {
109
+ const { chromium } = await import('playwright');
110
+
111
+ if (this.browser && Date.now() - this.lastUsed < this.POOL_TTL_MS) {
112
+ try {
113
+ await this.browser.contexts();
114
+ this.lastUsed = Date.now();
115
+ return { browser: this.browser, context: this.context, newBrowser: false };
116
+ } catch {
117
+ await this.close();
118
+ }
119
+ }
120
+
121
+ this.browser = await chromium.launch({
122
+ headless: true,
123
+ args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--no-first-run']
124
+ });
125
+
126
+ this.context = await this.browser.newContext({
127
+ viewport: { width: 1280, height: 720 },
128
+ userAgent: 'Mozilla/5.0 (compatible; 0xKobold/0.1)'
129
+ });
130
+
131
+ this.lastUsed = Date.now();
132
+ return { browser: this.browser, context: this.context, newBrowser: true };
133
+ }
134
+
135
+ async close() {
136
+ if (this.browser) {
137
+ try { await this.browser.close(); } catch { /* ignore */ }
138
+ this.browser = null;
139
+ this.context = null;
140
+ }
141
+ }
142
+ }
143
+
144
// Singleton pool shared by all Playwright fetches in this module.
const browserManager = new BrowserManager();

// Request queue for Playwright. Each queued item carries its own
// resolve callback so playwrightFetch can hand out a promise that the
// queue worker settles later.
type QueuedRequest = {
  url: string;
  maxLength: number;
  timeoutMs: number;
  resolve: (value: ScrapingResult | null) => void;
};

// Pending scrape requests, drained by processQueue().
const requestQueue: QueuedRequest[] = [];
// True while a processQueue() drain loop is running (prevents re-entry).
let isProcessing = false;
// Pages rendered in parallel per batch.
const MAX_CONCURRENT = 2;
// Attempts per request before resolving null.
const MAX_RETRIES = 3;
158
+
159
+ async function processQueue(): Promise<void> {
160
+ if (isProcessing) return;
161
+ isProcessing = true;
162
+
163
+ while (requestQueue.length > 0) {
164
+ const batch = requestQueue.splice(0, MAX_CONCURRENT);
165
+ await Promise.all(batch.map(req => processRequest(req)));
166
+ if (requestQueue.length > 0) {
167
+ await new Promise(r => setTimeout(r, 500));
168
+ }
169
+ }
170
+
171
+ isProcessing = false;
172
+ }
173
+
174
+ async function processRequest(req: QueuedRequest, attempt = 1): Promise<void> {
175
+ try {
176
+ const result = await playwrightFetchWithTimeout(req.url, req.maxLength, req.timeoutMs);
177
+ req.resolve(result);
178
+ } catch {
179
+ if (attempt < MAX_RETRIES) {
180
+ const delay = Math.min(2000 * Math.pow(2, attempt - 1), 30000);
181
+ await new Promise(r => setTimeout(r, delay));
182
+ await processRequest(req, attempt + 1);
183
+ } else {
184
+ req.resolve(null);
185
+ }
186
+ }
187
+ }
188
+
189
+ async function playwrightFetchWithTimeout(
190
+ url: string,
191
+ maxLength: number,
192
+ timeoutMs: number = 15000
193
+ ): Promise<ScrapingResult | null> {
194
+ const controller = new AbortController();
195
+ const timeoutId = setTimeout(() => controller.abort(), timeoutMs + 2000);
196
+
197
+ try {
198
+ const { context, newBrowser } = await browserManager.getBrowser();
199
+ const page = await context.newPage();
200
+
201
+ page.setDefaultTimeout(Math.min(timeoutMs, 10000));
202
+ page.setDefaultNavigationTimeout(Math.min(timeoutMs, 10000));
203
+
204
+ controller.signal.addEventListener('abort', () => {
205
+ page.close().catch(() => {});
206
+ });
207
+
208
+ try {
209
+ const response = await page.goto(url, {
210
+ waitUntil: 'commit',
211
+ timeout: Math.min(timeoutMs, 10000)
212
+ });
213
+
214
+ if (!response) throw new Error("No response");
215
+ await page.waitForTimeout(500);
216
+
217
+ const extracted = await page.evaluate((maxLen: number) => {
218
+ const doc = (globalThis as any).document;
219
+ const main = doc.querySelector('main, article, .content, [role="main"]');
220
+ if (main?.innerText?.trim().length > 100) {
221
+ return main.innerText.slice(0, maxLen);
222
+ }
223
+ return doc.body?.innerText?.slice(0, maxLen) || '';
224
+ }, maxLength);
225
+
226
+ const title = await page.title().catch(() => 'Untitled');
227
+
228
+ if (!extracted || extracted.length < 50) throw new Error("Insufficient content");
229
+
230
+ return { content: extracted, title, url, method: newBrowser ? 'playwright-new' : 'playwright-pooled' };
231
+ } finally {
232
+ await page.close();
233
+ clearTimeout(timeoutId);
234
+ }
235
+ } catch (error) {
236
+ throw error;
237
+ }
238
+ }
239
+
240
+ /**
241
+ * Playwright fetch with queue-based concurrency
242
+ */
243
+ export async function playwrightFetch(
244
+ url: string,
245
+ maxLength: number,
246
+ timeoutMs: number = 15000
247
+ ): Promise<ScrapingResult | null> {
248
+ return new Promise((resolve) => {
249
+ requestQueue.push({ url, maxLength, timeoutMs, resolve });
250
+ processQueue();
251
+ });
252
+ }
253
+
254
+ // ═════════════════════════════════════════════════════════════════════════════
255
+ // Cascade Fetch
256
+ // ═════════════════════════════════════════════════════════════════════════════
257
+
258
+ /**
259
+ * CASCADE: Try all methods in order of speed → quality
260
+ */
261
+ export async function cascadeFetch(
262
+ url: string,
263
+ maxLength: number = 5000,
264
+ usePlaywright: boolean = false,
265
+ timeoutMs: number = 15000
266
+ ): Promise<ScrapingResult | null> {
267
+ // Level 1: Fast HTML fetch
268
+ if (!usePlaywright) {
269
+ const fast = await fastFetch(url, maxLength);
270
+ if (fast && fast.content.length > 1000) return fast;
271
+ }
272
+
273
+ // Level 2: Readability extraction
274
+ if (!usePlaywright) {
275
+ const readability = await readabilityFetch(url, maxLength);
276
+ if (readability) return readability;
277
+ }
278
+
279
+ // Level 3: JavaScript rendering with Playwright
280
+ const pw = await playwrightFetch(url, maxLength, timeoutMs);
281
+ if (pw) return pw;
282
+
283
+ return null;
284
+ }
285
+
286
+ // ═════════════════════════════════════════════════════════════════════════════
287
+ // Search
288
+ // ═════════════════════════════════════════════════════════════════════════════
289
+
290
+ export async function searchDuckDuckGo(query: string, limit: number): Promise<WebSearchResult[]> {
291
+ const results: WebSearchResult[] = [];
292
+
293
+ try {
294
+ const liteUrl = `https://lite.duckduckgo.com/lite/?q=${encodeURIComponent(query)}`;
295
+ const response = await fetch(liteUrl, {
296
+ headers: {
297
+ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
298
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
299
+ 'Accept-Language': 'en-US,en;q=0.5',
300
+ },
301
+ signal: AbortSignal.timeout(10000),
302
+ });
303
+
304
+ if (!response.ok) return results;
305
+
306
+ const html = await response.text();
307
+
308
+ const linkRegex = /href="\/\/duckduckgo\.com\/l\/\?uddg=([^"&]+)/gi;
309
+ const urls: string[] = [];
310
+ let match;
311
+
312
+ while ((match = linkRegex.exec(html)) && urls.length < limit * 2) {
313
+ try {
314
+ const decoded = decodeURIComponent(match[1]);
315
+ const cleanUrl = decoded.split('&')[0].split('?rut=')[0];
316
+ if (cleanUrl.startsWith('http') && !urls.includes(cleanUrl)) {
317
+ urls.push(cleanUrl);
318
+ }
319
+ } catch { /* skip */ }
320
+ }
321
+
322
+ const anchorRegex = /<a[^>]*href="[^"]*uddg=[^"]*"[^>]*>([^<]+)<\/a>/gi;
323
+ const titles: string[] = [];
324
+ while ((match = anchorRegex.exec(html)) && titles.length < urls.length) {
325
+ const title = match[1].replace(/<[^>]*>/g, '').trim();
326
+ if (title && title.length > 2 && title.length < 200) {
327
+ titles.push(title);
328
+ }
329
+ }
330
+
331
+ for (let i = 0; i < Math.min(urls.length, titles.length, limit); i++) {
332
+ results.push({ title: titles[i] || new URL(urls[i]).hostname, url: urls[i], snippet: '' });
333
+ }
334
+
335
+ for (let i = results.length; i < Math.min(urls.length, limit); i++) {
336
+ try {
337
+ results.push({ title: new URL(urls[i]).hostname, url: urls[i], snippet: '' });
338
+ } catch { /* skip */ }
339
+ }
340
+ } catch { /* skip */ }
341
+
342
+ return results;
343
+ }
344
+
345
+ export async function searchSearX(query: string, limit: number, instance?: string): Promise<WebSearchResult[]> {
346
+ const results: WebSearchResult[] = [];
347
+
348
+ const searxInstances = instance ? [instance] : [
349
+ "https://search.bus-hit.me",
350
+ "https://search.projectsegfau.ltd",
351
+ "https://searx.foss.family",
352
+ ];
353
+
354
+ for (const baseUrl of searxInstances) {
355
+ try {
356
+ const searchUrl = `${baseUrl}/search?q=${encodeURIComponent(query)}&format=json`;
357
+ const response = await fetch(searchUrl, {
358
+ headers: { 'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json' },
359
+ signal: AbortSignal.timeout(8000)
360
+ });
361
+
362
+ if (!response.ok) continue;
363
+
364
+ const data: any = await response.json();
365
+ if (data.results && data.results.length > 0) {
366
+ for (const r of data.results.slice(0, limit)) {
367
+ results.push({
368
+ title: r.title || "Untitled",
369
+ url: r.url,
370
+ snippet: r.content || r.snippet || ""
371
+ });
372
+ }
373
+ if (results.length >= limit) break;
374
+ }
375
+ } catch {
376
+ continue;
377
+ }
378
+ }
379
+
380
+ return results;
381
+ }
382
+
383
+ /**
384
+ * Combined search across multiple engines
385
+ */
386
+ export async function webSearch(query: string, limit: number = 5): Promise<WebSearchResult[]> {
387
+ let results = await searchDuckDuckGo(query, Math.min(limit, 10));
388
+ if (results.length < limit) {
389
+ const searxResults = await searchSearX(query, Math.min(limit, 10));
390
+ results = [...results, ...searxResults].slice(0, limit);
391
+ }
392
+ return results;
393
+ }