@0xkobold/pi-web 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -0
- package/dist/index.d.ts +22 -0
- package/dist/index.js +223 -0
- package/dist/index.js.map +1 -0
- package/dist/search.d.ts +30 -0
- package/dist/search.js +322 -0
- package/dist/search.js.map +1 -0
- package/package.json +47 -0
- package/src/index.ts +270 -0
- package/src/search.ts +393 -0
package/src/search.ts
ADDED
|
@@ -0,0 +1,393 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* pi-web - Search and Content Extraction
|
|
3
|
+
*
|
|
4
|
+
* Internal search and fetch utilities. No framework dependencies.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
8
|
+
// Types
|
|
9
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
10
|
+
|
|
11
|
+
/** Outcome of a successful page-content extraction. */
export interface ScrapingResult {
  /** Extracted plain text, truncated to the caller's maxLength. */
  content: string;
  /** Page title, or "Untitled" when none could be found. */
  title: string;
  /** Strategy that produced this result: 'fast', 'readability', 'playwright-new' or 'playwright-pooled'. */
  method: string;
  /** The URL that was fetched. */
  url: string;
}
|
|
17
|
+
|
|
18
|
+
/** One hit returned by a web search engine. */
export interface WebSearchResult {
  /** Result title as reported by the engine. */
  title: string;
  /** Target page URL. */
  url: string;
  /** Engine-provided summary; empty string when unavailable. */
  snippet: string;
}
|
|
23
|
+
|
|
24
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
25
|
+
// Content Extraction (Cascade)
|
|
26
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
27
|
+
|
|
28
|
+
/**
|
|
29
|
+
* Fast fetch for simple HTML sites
|
|
30
|
+
*/
|
|
31
|
+
async function fastFetch(url: string, maxLength: number): Promise<ScrapingResult | null> {
|
|
32
|
+
try {
|
|
33
|
+
const controller = new AbortController();
|
|
34
|
+
const timeout = setTimeout(() => controller.abort(), 10000);
|
|
35
|
+
|
|
36
|
+
const response = await fetch(url, {
|
|
37
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0)' },
|
|
38
|
+
signal: controller.signal
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
clearTimeout(timeout);
|
|
42
|
+
if (!response.ok) return null;
|
|
43
|
+
|
|
44
|
+
const html = await response.text();
|
|
45
|
+
const title = html.match(/<title[^>]*>(.*?)<\/title>/i)?.[1]?.trim() || "Untitled";
|
|
46
|
+
|
|
47
|
+
const content = html
|
|
48
|
+
.replace(/<script[^>]*>.*?<\/script>/gi, '')
|
|
49
|
+
.replace(/<style[^>]*>.*?<\/style>/gi, '')
|
|
50
|
+
.replace(/<[^>]*>/g, ' ')
|
|
51
|
+
.replace(/\s+/g, ' ')
|
|
52
|
+
.trim()
|
|
53
|
+
.slice(0, maxLength);
|
|
54
|
+
|
|
55
|
+
if (content.length < 200) return null;
|
|
56
|
+
return { content, title, method: 'fast', url };
|
|
57
|
+
} catch {
|
|
58
|
+
return null;
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
/**
|
|
63
|
+
* Readability-style extraction using regex
|
|
64
|
+
*/
|
|
65
|
+
async function readabilityFetch(url: string, maxLength: number): Promise<ScrapingResult | null> {
|
|
66
|
+
try {
|
|
67
|
+
const response = await fetch(url, {
|
|
68
|
+
headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Bot/1.0)' }
|
|
69
|
+
});
|
|
70
|
+
|
|
71
|
+
if (!response.ok) return null;
|
|
72
|
+
const html = await response.text();
|
|
73
|
+
|
|
74
|
+
const articleMatch = html.match(/<article[^>]*>([\s\S]*?)<\/article>/i);
|
|
75
|
+
const mainMatch = html.match(/<main[^>]*>([\s\S]*?)<\/main>/i);
|
|
76
|
+
const contentDiv = html.match(/<div[^>]*class="[^"]*(?:content|article|post)[^"]*"[^>]*>([\s\S]*?)<\/div>/i);
|
|
77
|
+
|
|
78
|
+
const rawContent = articleMatch?.[1] || mainMatch?.[1] || contentDiv?.[1];
|
|
79
|
+
if (!rawContent) return null;
|
|
80
|
+
|
|
81
|
+
const content = rawContent
|
|
82
|
+
.replace(/<script[^>]*>.*?<\/script>/gi, '')
|
|
83
|
+
.replace(/<style[^>]*>.*?<\/style>/gi, '')
|
|
84
|
+
.replace(/<[^>]*>/g, ' ')
|
|
85
|
+
.replace(/\s+/g, ' ')
|
|
86
|
+
.trim()
|
|
87
|
+
.slice(0, maxLength);
|
|
88
|
+
|
|
89
|
+
const title = html.match(/<title[^>]*>(.*?)<\/title>/i)?.[1]?.trim() || "Untitled";
|
|
90
|
+
if (content.length < 200) return null;
|
|
91
|
+
|
|
92
|
+
return { content, title, method: 'readability', url };
|
|
93
|
+
} catch {
|
|
94
|
+
return null;
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
|
|
98
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
99
|
+
// Playwright Browser Manager
|
|
100
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
101
|
+
|
|
102
|
+
class BrowserManager {
|
|
103
|
+
private browser: any = null;
|
|
104
|
+
private context: any = null;
|
|
105
|
+
private lastUsed: number = 0;
|
|
106
|
+
private readonly POOL_TTL_MS = 120000;
|
|
107
|
+
|
|
108
|
+
async getBrowser() {
|
|
109
|
+
const { chromium } = await import('playwright');
|
|
110
|
+
|
|
111
|
+
if (this.browser && Date.now() - this.lastUsed < this.POOL_TTL_MS) {
|
|
112
|
+
try {
|
|
113
|
+
await this.browser.contexts();
|
|
114
|
+
this.lastUsed = Date.now();
|
|
115
|
+
return { browser: this.browser, context: this.context, newBrowser: false };
|
|
116
|
+
} catch {
|
|
117
|
+
await this.close();
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
this.browser = await chromium.launch({
|
|
122
|
+
headless: true,
|
|
123
|
+
args: ['--no-sandbox', '--disable-setuid-sandbox', '--disable-dev-shm-usage', '--disable-gpu', '--no-first-run']
|
|
124
|
+
});
|
|
125
|
+
|
|
126
|
+
this.context = await this.browser.newContext({
|
|
127
|
+
viewport: { width: 1280, height: 720 },
|
|
128
|
+
userAgent: 'Mozilla/5.0 (compatible; 0xKobold/0.1)'
|
|
129
|
+
});
|
|
130
|
+
|
|
131
|
+
this.lastUsed = Date.now();
|
|
132
|
+
return { browser: this.browser, context: this.context, newBrowser: true };
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
async close() {
|
|
136
|
+
if (this.browser) {
|
|
137
|
+
try { await this.browser.close(); } catch { /* ignore */ }
|
|
138
|
+
this.browser = null;
|
|
139
|
+
this.context = null;
|
|
140
|
+
}
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Module-level singleton so all callers share the pooled browser.
const browserManager = new BrowserManager();
|
|
145
|
+
|
|
146
|
+
// Request queue for Playwright — serializes page renders so only a bounded
// number of pages are open at once.
type QueuedRequest = {
  url: string;
  maxLength: number;
  timeoutMs: number;
  // Settles the caller's playwrightFetch() promise (null on final failure).
  resolve: (value: ScrapingResult | null) => void;
};

const requestQueue: QueuedRequest[] = [];
// True while processQueue() is draining; prevents concurrent drain loops.
let isProcessing = false;
// Pages rendered in parallel per batch.
const MAX_CONCURRENT = 2;
// Attempts per request before resolving null.
const MAX_RETRIES = 3;
|
|
158
|
+
|
|
159
|
+
async function processQueue(): Promise<void> {
|
|
160
|
+
if (isProcessing) return;
|
|
161
|
+
isProcessing = true;
|
|
162
|
+
|
|
163
|
+
while (requestQueue.length > 0) {
|
|
164
|
+
const batch = requestQueue.splice(0, MAX_CONCURRENT);
|
|
165
|
+
await Promise.all(batch.map(req => processRequest(req)));
|
|
166
|
+
if (requestQueue.length > 0) {
|
|
167
|
+
await new Promise(r => setTimeout(r, 500));
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
isProcessing = false;
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
async function processRequest(req: QueuedRequest, attempt = 1): Promise<void> {
|
|
175
|
+
try {
|
|
176
|
+
const result = await playwrightFetchWithTimeout(req.url, req.maxLength, req.timeoutMs);
|
|
177
|
+
req.resolve(result);
|
|
178
|
+
} catch {
|
|
179
|
+
if (attempt < MAX_RETRIES) {
|
|
180
|
+
const delay = Math.min(2000 * Math.pow(2, attempt - 1), 30000);
|
|
181
|
+
await new Promise(r => setTimeout(r, delay));
|
|
182
|
+
await processRequest(req, attempt + 1);
|
|
183
|
+
} else {
|
|
184
|
+
req.resolve(null);
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
188
|
+
|
|
189
|
+
async function playwrightFetchWithTimeout(
|
|
190
|
+
url: string,
|
|
191
|
+
maxLength: number,
|
|
192
|
+
timeoutMs: number = 15000
|
|
193
|
+
): Promise<ScrapingResult | null> {
|
|
194
|
+
const controller = new AbortController();
|
|
195
|
+
const timeoutId = setTimeout(() => controller.abort(), timeoutMs + 2000);
|
|
196
|
+
|
|
197
|
+
try {
|
|
198
|
+
const { context, newBrowser } = await browserManager.getBrowser();
|
|
199
|
+
const page = await context.newPage();
|
|
200
|
+
|
|
201
|
+
page.setDefaultTimeout(Math.min(timeoutMs, 10000));
|
|
202
|
+
page.setDefaultNavigationTimeout(Math.min(timeoutMs, 10000));
|
|
203
|
+
|
|
204
|
+
controller.signal.addEventListener('abort', () => {
|
|
205
|
+
page.close().catch(() => {});
|
|
206
|
+
});
|
|
207
|
+
|
|
208
|
+
try {
|
|
209
|
+
const response = await page.goto(url, {
|
|
210
|
+
waitUntil: 'commit',
|
|
211
|
+
timeout: Math.min(timeoutMs, 10000)
|
|
212
|
+
});
|
|
213
|
+
|
|
214
|
+
if (!response) throw new Error("No response");
|
|
215
|
+
await page.waitForTimeout(500);
|
|
216
|
+
|
|
217
|
+
const extracted = await page.evaluate((maxLen: number) => {
|
|
218
|
+
const doc = (globalThis as any).document;
|
|
219
|
+
const main = doc.querySelector('main, article, .content, [role="main"]');
|
|
220
|
+
if (main?.innerText?.trim().length > 100) {
|
|
221
|
+
return main.innerText.slice(0, maxLen);
|
|
222
|
+
}
|
|
223
|
+
return doc.body?.innerText?.slice(0, maxLen) || '';
|
|
224
|
+
}, maxLength);
|
|
225
|
+
|
|
226
|
+
const title = await page.title().catch(() => 'Untitled');
|
|
227
|
+
|
|
228
|
+
if (!extracted || extracted.length < 50) throw new Error("Insufficient content");
|
|
229
|
+
|
|
230
|
+
return { content: extracted, title, url, method: newBrowser ? 'playwright-new' : 'playwright-pooled' };
|
|
231
|
+
} finally {
|
|
232
|
+
await page.close();
|
|
233
|
+
clearTimeout(timeoutId);
|
|
234
|
+
}
|
|
235
|
+
} catch (error) {
|
|
236
|
+
throw error;
|
|
237
|
+
}
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
/**
|
|
241
|
+
* Playwright fetch with queue-based concurrency
|
|
242
|
+
*/
|
|
243
|
+
export async function playwrightFetch(
|
|
244
|
+
url: string,
|
|
245
|
+
maxLength: number,
|
|
246
|
+
timeoutMs: number = 15000
|
|
247
|
+
): Promise<ScrapingResult | null> {
|
|
248
|
+
return new Promise((resolve) => {
|
|
249
|
+
requestQueue.push({ url, maxLength, timeoutMs, resolve });
|
|
250
|
+
processQueue();
|
|
251
|
+
});
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
255
|
+
// Cascade Fetch
|
|
256
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
257
|
+
|
|
258
|
+
/**
|
|
259
|
+
* CASCADE: Try all methods in order of speed → quality
|
|
260
|
+
*/
|
|
261
|
+
export async function cascadeFetch(
|
|
262
|
+
url: string,
|
|
263
|
+
maxLength: number = 5000,
|
|
264
|
+
usePlaywright: boolean = false,
|
|
265
|
+
timeoutMs: number = 15000
|
|
266
|
+
): Promise<ScrapingResult | null> {
|
|
267
|
+
// Level 1: Fast HTML fetch
|
|
268
|
+
if (!usePlaywright) {
|
|
269
|
+
const fast = await fastFetch(url, maxLength);
|
|
270
|
+
if (fast && fast.content.length > 1000) return fast;
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
// Level 2: Readability extraction
|
|
274
|
+
if (!usePlaywright) {
|
|
275
|
+
const readability = await readabilityFetch(url, maxLength);
|
|
276
|
+
if (readability) return readability;
|
|
277
|
+
}
|
|
278
|
+
|
|
279
|
+
// Level 3: JavaScript rendering with Playwright
|
|
280
|
+
const pw = await playwrightFetch(url, maxLength, timeoutMs);
|
|
281
|
+
if (pw) return pw;
|
|
282
|
+
|
|
283
|
+
return null;
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
287
|
+
// Search
|
|
288
|
+
// ═════════════════════════════════════════════════════════════════════════════
|
|
289
|
+
|
|
290
|
+
/**
 * Searches DuckDuckGo's HTML "lite" endpoint and scrapes result links.
 *
 * URLs and titles are harvested with two independent regex passes and then
 * paired by index. NOTE(review): this pairing assumes the i-th matched title
 * corresponds to the i-th matched URL — if the page interleaves non-result
 * anchors, titles can misalign with URLs; verify against live markup.
 *
 * @param query - Search terms (URL-encoded internally).
 * @param limit - Maximum number of results to return.
 * @returns Up to `limit` results with empty snippets; empty array on any
 *   failure (network errors are swallowed by design — best-effort search).
 */
export async function searchDuckDuckGo(query: string, limit: number): Promise<WebSearchResult[]> {
  const results: WebSearchResult[] = [];

  try {
    const liteUrl = `https://lite.duckduckgo.com/lite/?q=${encodeURIComponent(query)}`;
    const response = await fetch(liteUrl, {
      headers: {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
      signal: AbortSignal.timeout(10000),
    });

    if (!response.ok) return results;

    const html = await response.text();

    // DDG wraps every result in a redirect link whose uddg= param carries the
    // percent-encoded destination URL.
    const linkRegex = /href="\/\/duckduckgo\.com\/l\/\?uddg=([^"&]+)/gi;
    const urls: string[] = [];
    let match;

    // Collect up to limit*2 candidate URLs (extras absorb dedup losses).
    while ((match = linkRegex.exec(html)) && urls.length < limit * 2) {
      try {
        const decoded = decodeURIComponent(match[1]);
        // Strip tracking suffixes appended after the real destination.
        const cleanUrl = decoded.split('&')[0].split('?rut=')[0];
        if (cleanUrl.startsWith('http') && !urls.includes(cleanUrl)) {
          urls.push(cleanUrl);
        }
      } catch { /* skip */ }
    }

    // Second pass: anchor text for the same redirect links, used as titles.
    const anchorRegex = /<a[^>]*href="[^"]*uddg=[^"]*"[^>]*>([^<]+)<\/a>/gi;
    const titles: string[] = [];
    while ((match = anchorRegex.exec(html)) && titles.length < urls.length) {
      const title = match[1].replace(/<[^>]*>/g, '').trim();
      // Length bounds filter out icons/ads masquerading as titles.
      if (title && title.length > 2 && title.length < 200) {
        titles.push(title);
      }
    }

    // Pair titles with URLs by index (see NOTE above about misalignment risk).
    for (let i = 0; i < Math.min(urls.length, titles.length, limit); i++) {
      results.push({ title: titles[i] || new URL(urls[i]).hostname, url: urls[i], snippet: '' });
    }

    // Any leftover URLs without a matched title fall back to the hostname.
    for (let i = results.length; i < Math.min(urls.length, limit); i++) {
      try {
        results.push({ title: new URL(urls[i]).hostname, url: urls[i], snippet: '' });
      } catch { /* skip */ }
    }
  } catch { /* skip */ }

  return results;
}
|
|
344
|
+
|
|
345
|
+
export async function searchSearX(query: string, limit: number, instance?: string): Promise<WebSearchResult[]> {
|
|
346
|
+
const results: WebSearchResult[] = [];
|
|
347
|
+
|
|
348
|
+
const searxInstances = instance ? [instance] : [
|
|
349
|
+
"https://search.bus-hit.me",
|
|
350
|
+
"https://search.projectsegfau.ltd",
|
|
351
|
+
"https://searx.foss.family",
|
|
352
|
+
];
|
|
353
|
+
|
|
354
|
+
for (const baseUrl of searxInstances) {
|
|
355
|
+
try {
|
|
356
|
+
const searchUrl = `${baseUrl}/search?q=${encodeURIComponent(query)}&format=json`;
|
|
357
|
+
const response = await fetch(searchUrl, {
|
|
358
|
+
headers: { 'User-Agent': 'Mozilla/5.0', 'Accept': 'application/json' },
|
|
359
|
+
signal: AbortSignal.timeout(8000)
|
|
360
|
+
});
|
|
361
|
+
|
|
362
|
+
if (!response.ok) continue;
|
|
363
|
+
|
|
364
|
+
const data: any = await response.json();
|
|
365
|
+
if (data.results && data.results.length > 0) {
|
|
366
|
+
for (const r of data.results.slice(0, limit)) {
|
|
367
|
+
results.push({
|
|
368
|
+
title: r.title || "Untitled",
|
|
369
|
+
url: r.url,
|
|
370
|
+
snippet: r.content || r.snippet || ""
|
|
371
|
+
});
|
|
372
|
+
}
|
|
373
|
+
if (results.length >= limit) break;
|
|
374
|
+
}
|
|
375
|
+
} catch {
|
|
376
|
+
continue;
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
return results;
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
/**
|
|
384
|
+
* Combined search across multiple engines
|
|
385
|
+
*/
|
|
386
|
+
export async function webSearch(query: string, limit: number = 5): Promise<WebSearchResult[]> {
|
|
387
|
+
let results = await searchDuckDuckGo(query, Math.min(limit, 10));
|
|
388
|
+
if (results.length < limit) {
|
|
389
|
+
const searxResults = await searchSearX(query, Math.min(limit, 10));
|
|
390
|
+
results = [...results, ...searxResults].slice(0, limit);
|
|
391
|
+
}
|
|
392
|
+
return results;
|
|
393
|
+
}
|