webpeel 0.21.29 → 0.21.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -44,18 +44,21 @@ declare class ProviderStatsTracker {
|
|
|
44
44
|
private readonly windowSize;
|
|
45
45
|
private readonly failThreshold;
|
|
46
46
|
private readonly minSamples;
|
|
47
|
-
|
|
47
|
+
private readonly decayMs;
|
|
48
|
+
constructor(windowSize?: number, failThreshold?: number, minSamples?: number, decayMs?: number);
|
|
48
49
|
/** Record the outcome of a single attempt for the given source. */
|
|
49
50
|
record(sourceId: string, success: boolean): void;
|
|
50
51
|
/**
|
|
51
52
|
* Returns the failure rate (0–1) for the given source based on
|
|
52
53
|
* the sliding window of recorded attempts. Returns 0 if fewer
|
|
53
|
-
* than minSamples have been recorded
|
|
54
|
+
* than minSamples have been recorded, or if all samples are older
|
|
55
|
+
* than decayMs (failures expire so cold-start blips don't permanently
|
|
56
|
+
* lock out a provider).
|
|
54
57
|
*/
|
|
55
58
|
getFailureRate(sourceId: string): number;
|
|
56
59
|
/**
|
|
57
60
|
* Returns true when the source should be skipped (failure rate >=
|
|
58
|
-
* failThreshold with at least minSamples recorded).
|
|
61
|
+
* failThreshold with at least minSamples recent samples recorded).
|
|
59
62
|
*/
|
|
60
63
|
shouldSkip(sourceId: string): boolean;
|
|
61
64
|
/** Debug snapshot for a source. */
|
|
@@ -114,15 +114,17 @@ class ProviderStatsTracker {
|
|
|
114
114
|
windowSize;
|
|
115
115
|
failThreshold;
|
|
116
116
|
minSamples;
|
|
117
|
-
|
|
117
|
+
decayMs; // failures older than this are ignored
|
|
118
|
+
constructor(windowSize = 10, failThreshold = 0.8, minSamples = 5, decayMs = 5 * 60 * 1000) {
|
|
118
119
|
this.windowSize = windowSize;
|
|
119
120
|
this.failThreshold = failThreshold;
|
|
120
121
|
this.minSamples = minSamples;
|
|
122
|
+
this.decayMs = decayMs; // default 5 minutes: old failures don't permanently lock a provider
|
|
121
123
|
}
|
|
122
124
|
/** Record the outcome of a single attempt for the given source. */
|
|
123
125
|
record(sourceId, success) {
|
|
124
126
|
const arr = this.history.get(sourceId) ?? [];
|
|
125
|
-
arr.push({ success });
|
|
127
|
+
arr.push({ success, ts: Date.now() });
|
|
126
128
|
if (arr.length > this.windowSize)
|
|
127
129
|
arr.splice(0, arr.length - this.windowSize);
|
|
128
130
|
this.history.set(sourceId, arr);
|
|
@@ -130,18 +132,24 @@ class ProviderStatsTracker {
|
|
|
130
132
|
/**
|
|
131
133
|
* Returns the failure rate (0–1) for the given source based on
|
|
132
134
|
* the sliding window of recorded attempts. Returns 0 if fewer
|
|
133
|
-
* than minSamples have been recorded
|
|
135
|
+
* than minSamples have been recorded, or if all samples are older
|
|
136
|
+
* than decayMs (failures expire so cold-start blips don't permanently
|
|
137
|
+
* lock out a provider).
|
|
134
138
|
*/
|
|
135
139
|
getFailureRate(sourceId) {
|
|
136
140
|
const arr = this.history.get(sourceId);
|
|
137
141
|
if (!arr || arr.length < this.minSamples)
|
|
138
142
|
return 0;
|
|
139
|
-
const
|
|
140
|
-
|
|
143
|
+
const cutoff = Date.now() - this.decayMs;
|
|
144
|
+
const recent = arr.filter(a => a.ts >= cutoff);
|
|
145
|
+
if (recent.length < this.minSamples)
|
|
146
|
+
return 0; // not enough recent samples
|
|
147
|
+
const failures = recent.filter(a => !a.success).length;
|
|
148
|
+
return failures / recent.length;
|
|
141
149
|
}
|
|
142
150
|
/**
|
|
143
151
|
* Returns true when the source should be skipped (failure rate >=
|
|
144
|
-
* failThreshold with at least minSamples recorded).
|
|
152
|
+
* failThreshold with at least minSamples recent samples recorded).
|
|
145
153
|
*/
|
|
146
154
|
shouldSkip(sourceId) {
|
|
147
155
|
return this.getFailureRate(sourceId) >= this.failThreshold;
|
|
@@ -676,25 +684,37 @@ export class DuckDuckGoProvider {
|
|
|
676
684
|
'Upgrade-Insecure-Requests': '1',
|
|
677
685
|
'Referer': 'https://duckduckgo.com/',
|
|
678
686
|
};
|
|
679
|
-
// Try
|
|
687
|
+
// Try direct first, then proxy as fallback.
|
|
688
|
+
// Webshare backbone IPs are blocked by DDG (returns empty results).
|
|
689
|
+
// Render datacenter IPs work intermittently — direct has better odds.
|
|
680
690
|
let response;
|
|
681
|
-
|
|
691
|
+
let html;
|
|
692
|
+
// let usedProxy = false;
|
|
693
|
+
// Attempt 1: Direct fetch (no proxy)
|
|
694
|
+
try {
|
|
695
|
+
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
696
|
+
html = response.ok ? await response.text() : '';
|
|
697
|
+
}
|
|
698
|
+
catch (directErr) {
|
|
699
|
+
log.debug('DDG direct fetch failed:', directErr instanceof Error ? directErr.message : directErr);
|
|
700
|
+
html = '';
|
|
701
|
+
}
|
|
702
|
+
// Check if direct returned actual results (not empty/CAPTCHA)
|
|
703
|
+
const hasResults = html.includes('class="result"') || html.includes('class="result ');
|
|
704
|
+
if (!hasResults && proxyUrl) {
|
|
705
|
+
// Attempt 2: Proxy fallback
|
|
706
|
+
log.debug('DDG direct returned no results, trying proxy...');
|
|
682
707
|
try {
|
|
708
|
+
// usedProxy = true;
|
|
683
709
|
const dispatcher = new ProxyAgent(proxyUrl);
|
|
684
710
|
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal, dispatcher });
|
|
711
|
+
if (response.ok)
|
|
712
|
+
html = await response.text();
|
|
685
713
|
}
|
|
686
714
|
catch (proxyErr) {
|
|
687
|
-
log.debug('DDG proxy
|
|
688
|
-
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
715
|
+
log.debug('DDG proxy also failed:', proxyErr instanceof Error ? proxyErr.message : proxyErr);
|
|
689
716
|
}
|
|
690
717
|
}
|
|
691
|
-
else {
|
|
692
|
-
response = await undiciFetch(searchUrl, { headers: baseHeaders, signal });
|
|
693
|
-
}
|
|
694
|
-
if (!response.ok) {
|
|
695
|
-
throw new Error(`Search failed: HTTP ${response.status}`);
|
|
696
|
-
}
|
|
697
|
-
const html = await response.text();
|
|
698
718
|
const $ = load(html);
|
|
699
719
|
const results = [];
|
|
700
720
|
const seen = new Set();
|
|
@@ -758,22 +778,25 @@ export class DuckDuckGoProvider {
|
|
|
758
778
|
'Referer': 'https://lite.duckduckgo.com/',
|
|
759
779
|
};
|
|
760
780
|
const liteUrl = `https://lite.duckduckgo.com/lite/?${params.toString()}`;
|
|
761
|
-
|
|
762
|
-
|
|
781
|
+
// Direct first, proxy fallback (same reasoning as searchOnce — Webshare IPs blocked by DDG)
|
|
782
|
+
let html = '';
|
|
783
|
+
try {
|
|
784
|
+
const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
785
|
+
if (resp.ok)
|
|
786
|
+
html = await resp.text();
|
|
787
|
+
}
|
|
788
|
+
catch { /* direct failed */ }
|
|
789
|
+
if (!html.includes('result-link') && liteProxyUrl) {
|
|
763
790
|
try {
|
|
764
791
|
const dispatcher = new ProxyAgent(liteProxyUrl);
|
|
765
|
-
|
|
792
|
+
const resp = await undiciFetch(liteUrl, { headers: liteHeaders, signal, dispatcher });
|
|
793
|
+
if (resp.ok)
|
|
794
|
+
html = await resp.text();
|
|
766
795
|
}
|
|
767
|
-
catch {
|
|
768
|
-
response = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
769
|
-
}
|
|
770
|
-
}
|
|
771
|
-
else {
|
|
772
|
-
response = await undiciFetch(liteUrl, { headers: liteHeaders, signal });
|
|
796
|
+
catch { /* proxy also failed */ }
|
|
773
797
|
}
|
|
774
|
-
if (!
|
|
798
|
+
if (!html)
|
|
775
799
|
return [];
|
|
776
|
-
const html = await response.text();
|
|
777
800
|
const $ = load(html);
|
|
778
801
|
const results = [];
|
|
779
802
|
const seen = new Set();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "webpeel",
|
|
3
|
-
"version": "0.21.29",
|
|
3
|
+
"version": "0.21.31",
|
|
4
4
|
"description": "Fast web fetcher for AI agents - stealth mode, crawl mode, page actions, structured extraction, PDF parsing, smart escalation from simple HTTP to headless browser",
|
|
5
5
|
"author": "Jake Liu",
|
|
6
6
|
"license": "AGPL-3.0-only",
|