webpeel 0.21.36 → 0.21.37
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -153,7 +153,12 @@ async function fetchJson(url, customHeaders) {
|
|
|
153
153
|
Accept: 'application/json',
|
|
154
154
|
...customHeaders,
|
|
155
155
|
});
|
|
156
|
-
|
|
156
|
+
const parsed = tryParseJson(result.html);
|
|
157
|
+
if (parsed === null && result.html.length > 0) {
|
|
158
|
+
// Log when we get non-JSON back (likely an HTML error page)
|
|
159
|
+
console.warn(`[webpeel:fetchJson] Non-JSON response from ${url} (${result.html.length} bytes, status: ${result.statusCode}): ${result.html.slice(0, 120)}`);
|
|
160
|
+
}
|
|
161
|
+
return parsed;
|
|
157
162
|
}
|
|
158
163
|
/** Fetch JSON with exponential backoff retry on 429 / rate-limit errors. */
|
|
159
164
|
async function fetchJsonWithRetry(url, headers, retries = 2, baseDelayMs = 1000) {
|
|
@@ -910,11 +915,17 @@ ${commentsMd || '*No comments.*'}`;
|
|
|
910
915
|
if (pathParts.length >= 2) {
|
|
911
916
|
// Sequential fetches to avoid secondary rate limits on popular repos
|
|
912
917
|
const repoData = await fetchJsonWithRetry(`https://api.github.com/repos/${owner}/${repo}`, ghHeaders, 2, 1000);
|
|
913
|
-
if (!repoData
|
|
914
|
-
|
|
915
|
-
// Secondary rate limit check
|
|
916
|
-
if (repoData.message?.includes('secondary rate limit') || repoData.message?.includes('abuse'))
|
|
918
|
+
if (!repoData) {
|
|
919
|
+
console.warn(`[webpeel:github] repo API returned null for ${owner}/${repo}`);
|
|
917
920
|
return null;
|
|
921
|
+
}
|
|
922
|
+
if (repoData.message) {
|
|
923
|
+
console.warn(`[webpeel:github] repo API error for ${owner}/${repo}: ${repoData.message}`);
|
|
924
|
+
if (repoData.message === 'Not Found')
|
|
925
|
+
return null;
|
|
926
|
+
if (repoData.message.includes('secondary rate limit') || repoData.message.includes('abuse'))
|
|
927
|
+
return null;
|
|
928
|
+
}
|
|
918
929
|
const readmeData = await fetchJsonWithRetry(`https://api.github.com/repos/${owner}/${repo}/readme`, ghHeaders, 1, 500).catch(() => null);
|
|
919
930
|
// README content is base64 encoded
|
|
920
931
|
let readmeText = '';
|
package/dist/core/pipeline.js
CHANGED
|
@@ -307,7 +307,7 @@ export async function fetchContent(ctx) {
|
|
|
307
307
|
}
|
|
308
308
|
catch (e) {
|
|
309
309
|
// Domain API failed — fall through to normal fetch
|
|
310
|
-
log.
|
|
310
|
+
log.warn('domain API first-pass failed, falling back to fetch:', e instanceof Error ? e.message : e);
|
|
311
311
|
}
|
|
312
312
|
}
|
|
313
313
|
ctx.timer.mark('fetch');
|
|
@@ -959,7 +959,7 @@ export async function postProcess(ctx) {
|
|
|
959
959
|
}
|
|
960
960
|
catch (e) {
|
|
961
961
|
// Domain extraction failure is non-fatal; continue with normal content
|
|
962
|
-
log.
|
|
962
|
+
log.warn('domain extraction (second pass) failed:', e instanceof Error ? e.message : e);
|
|
963
963
|
}
|
|
964
964
|
}
|
|
965
965
|
// === Challenge / bot-protection page detection ===
|
|
@@ -153,7 +153,7 @@ const VALID_LLM_PROVIDERS = [
|
|
|
153
153
|
'cloudflare',
|
|
154
154
|
];
|
|
155
155
|
const MAX_SOURCES_HARD_LIMIT = 8;
|
|
156
|
-
const PER_URL_TIMEOUT_MS =
|
|
156
|
+
const PER_URL_TIMEOUT_MS = 8_000;
|
|
157
157
|
const TOTAL_TIMEOUT_MS = 60_000;
|
|
158
158
|
export function createResearchRouter() {
|
|
159
159
|
const router = Router();
|
|
@@ -318,9 +318,14 @@ export function createResearchRouter() {
|
|
|
318
318
|
wordCount,
|
|
319
319
|
fetchTime,
|
|
320
320
|
});
|
|
321
|
-
if (
|
|
321
|
+
if (wordCount >= 50) {
|
|
322
322
|
fetchedContents.push({ url, content });
|
|
323
323
|
}
|
|
324
|
+
else if (snippet.length > 20) {
|
|
325
|
+
// Content too thin — use search snippet + title as surrogate
|
|
326
|
+
const surrogateContent = `${pageTitle}\n\n${snippet}`;
|
|
327
|
+
fetchedContents.push({ url, content: surrogateContent });
|
|
328
|
+
}
|
|
324
329
|
}
|
|
325
330
|
catch {
|
|
326
331
|
// Skip failed URLs, continue to next
|
|
@@ -349,10 +354,15 @@ export function createResearchRouter() {
|
|
|
349
354
|
const effectiveLLMConfig = llmConfig ?? (process.env.OLLAMA_URL
|
|
350
355
|
? { provider: 'ollama', apiKey: process.env.OLLAMA_SECRET || '' }
|
|
351
356
|
: undefined);
|
|
352
|
-
if (effectiveLLMConfig && fetchedContents.length > 0 && Date.now() < overallDeadline -
|
|
357
|
+
if (effectiveLLMConfig && fetchedContents.length > 0 && Date.now() < overallDeadline - 1_000) {
|
|
353
358
|
try {
|
|
359
|
+
// Filter to sources with 30+ words; fall back to all if none pass the threshold
|
|
360
|
+
const contentsForLLM = (() => {
|
|
361
|
+
const filtered = fetchedContents.filter(fc => fc.content.split(/\s+/).filter(Boolean).length >= 30);
|
|
362
|
+
return filtered.length > 0 ? filtered : fetchedContents;
|
|
363
|
+
})();
|
|
354
364
|
// Sanitize web content before sending to LLM (prompt injection defense layer 1)
|
|
355
|
-
const sourcesText =
|
|
365
|
+
const sourcesText = contentsForLLM
|
|
356
366
|
.map((fc, i) => {
|
|
357
367
|
const sanitized = sanitizeForLLM(fc.content.slice(0, 1200));
|
|
358
368
|
if (sanitized.injectionDetected) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.
|
|
3
|
+
"version": "0.21.37",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|