scrapex 1.0.0-alpha.1 → 1.0.0-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +164 -5
- package/dist/embeddings/index.cjs +52 -0
- package/dist/embeddings/index.d.cts +3 -0
- package/dist/embeddings/index.d.mts +3 -0
- package/dist/embeddings/index.mjs +4 -0
- package/dist/embeddings-BjNTQSG9.cjs +1455 -0
- package/dist/embeddings-BjNTQSG9.cjs.map +1 -0
- package/dist/embeddings-Bsymy_jA.mjs +1215 -0
- package/dist/embeddings-Bsymy_jA.mjs.map +1 -0
- package/dist/{enhancer-oM4BhYYS.cjs → enhancer-Cs_WyWtJ.cjs} +2 -51
- package/dist/enhancer-Cs_WyWtJ.cjs.map +1 -0
- package/dist/{enhancer-Q6CSc1gA.mjs → enhancer-INx5NlgO.mjs} +2 -45
- package/dist/enhancer-INx5NlgO.mjs.map +1 -0
- package/dist/http-base-CHLf-Tco.cjs +684 -0
- package/dist/http-base-CHLf-Tco.cjs.map +1 -0
- package/dist/http-base-DM7YNo6X.mjs +618 -0
- package/dist/http-base-DM7YNo6X.mjs.map +1 -0
- package/dist/index-Bvseqli-.d.cts +268 -0
- package/dist/index-Bvseqli-.d.cts.map +1 -0
- package/dist/index-CIFjNySr.d.mts +268 -0
- package/dist/index-CIFjNySr.d.mts.map +1 -0
- package/dist/index-D6qfjmZQ.d.mts +401 -0
- package/dist/index-D6qfjmZQ.d.mts.map +1 -0
- package/dist/index-RFSpP5g8.d.cts +401 -0
- package/dist/index-RFSpP5g8.d.cts.map +1 -0
- package/dist/index.cjs +171 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -2
- package/dist/index.d.cts.map +1 -1
- package/dist/index.d.mts +61 -2
- package/dist/index.d.mts.map +1 -1
- package/dist/index.mjs +129 -6
- package/dist/index.mjs.map +1 -1
- package/dist/llm/index.cjs +252 -233
- package/dist/llm/index.cjs.map +1 -1
- package/dist/llm/index.d.cts +132 -85
- package/dist/llm/index.d.cts.map +1 -1
- package/dist/llm/index.d.mts +132 -85
- package/dist/llm/index.d.mts.map +1 -1
- package/dist/llm/index.mjs +244 -236
- package/dist/llm/index.mjs.map +1 -1
- package/dist/parsers/index.cjs +10 -199
- package/dist/parsers/index.d.cts +2 -133
- package/dist/parsers/index.d.mts +2 -133
- package/dist/parsers/index.mjs +2 -191
- package/dist/parsers-Bneuws8x.cjs +569 -0
- package/dist/parsers-Bneuws8x.cjs.map +1 -0
- package/dist/parsers-DsawHeo0.mjs +482 -0
- package/dist/parsers-DsawHeo0.mjs.map +1 -0
- package/dist/types-BOcHQU9s.d.mts +831 -0
- package/dist/types-BOcHQU9s.d.mts.map +1 -0
- package/dist/types-DutdBpqd.d.cts +831 -0
- package/dist/types-DutdBpqd.d.cts.map +1 -0
- package/package.json +15 -16
- package/dist/enhancer-Q6CSc1gA.mjs.map +0 -1
- package/dist/enhancer-oM4BhYYS.cjs.map +0 -1
- package/dist/parsers/index.cjs.map +0 -1
- package/dist/parsers/index.d.cts.map +0 -1
- package/dist/parsers/index.d.mts.map +0 -1
- package/dist/parsers/index.mjs.map +0 -1
- package/dist/types-CNQZVW36.d.mts +0 -150
- package/dist/types-CNQZVW36.d.mts.map +0 -1
- package/dist/types-D0HYR95H.d.cts +0 -150
- package/dist/types-D0HYR95H.d.cts.map +0 -1
|
@@ -0,0 +1,684 @@
|
|
|
1
|
+
const require_parsers = require('./parsers-Bneuws8x.cjs');
|
|
2
|
+
let node_dns = require("node:dns");
|
|
3
|
+
let node_net = require("node:net");
|
|
4
|
+
|
|
5
|
+
//#region src/core/errors.ts
|
|
6
|
+
/**
 * Error type raised for scraping failures.
 *
 * Carries a machine-readable `code` and, when one is known, the HTTP
 * `statusCode`. The wrapped error (if any) is attached through the standard
 * `cause` option of `Error`.
 */
var ScrapeError = class ScrapeError extends Error {
  code;
  statusCode;

  /**
   * @param {string} message - Human-readable description of the failure.
   * @param {string} code - Structured error code (e.g. "FETCH_FAILED").
   * @param {number} [statusCode] - HTTP status associated with the failure.
   * @param {unknown} [cause] - Underlying error being wrapped.
   */
  constructor(message, code, statusCode, cause) {
    super(message, { cause });
    Object.assign(this, { name: "ScrapeError", code, statusCode });
    // V8-only API: drops the constructor frame from the captured stack.
    if (Error.captureStackTrace) {
      Error.captureStackTrace(this, ScrapeError);
    }
  }

  /**
   * Coerce any thrown value into a ScrapeError.
   *
   * @param {unknown} error - Value that was thrown.
   * @param {string} [code="FETCH_FAILED"] - Code used when wrapping is needed.
   * @returns {ScrapeError} `error` itself when it already is a ScrapeError,
   *   otherwise a new instance (with `error` as cause when it is an Error).
   */
  static from(error, code = "FETCH_FAILED") {
    if (error instanceof ScrapeError) {
      return error;
    }
    return error instanceof Error
      ? new ScrapeError(error.message, code, undefined, error)
      : new ScrapeError(String(error), code);
  }

  /**
   * Whether this failure is of a kind worth retrying.
   *
   * @returns {boolean} True for the "FETCH_FAILED" and "TIMEOUT" codes.
   */
  isRetryable() {
    return ["FETCH_FAILED", "TIMEOUT"].includes(this.code);
  }

  /**
   * Plain-object snapshot suitable for serialization.
   *
   * @returns {{name: string, message: string, code: string, statusCode: (number|undefined), stack: (string|undefined)}}
   */
  toJSON() {
    const { name, message, code, statusCode, stack } = this;
    return { name, message, code, statusCode, stack };
  }
};
|
|
46
|
+
|
|
47
|
+
//#endregion
|
|
48
|
+
//#region src/common/errors.ts
|
|
49
|
+
/**
|
|
50
|
+
* Error normalization utilities for HTTP providers.
|
|
51
|
+
* Maps HTTP status codes to consistent ScrapeError codes.
|
|
52
|
+
*/
|
|
53
|
+
/**
 * Translate an HTTP status into the ScrapeError code used by providers.
 *
 * @param {number} status - HTTP response status.
 * @returns {string} "BLOCKED" for 401/403/429, "NOT_FOUND" for 404,
 *   "TIMEOUT" for 408, "LLM_ERROR" for any status >= 500 and
 *   "FETCH_FAILED" for everything else.
 */
function getErrorCodeFromStatus(status) {
  switch (status) {
    case 401:
    case 403:
    case 429:
      return "BLOCKED";
    case 404:
      return "NOT_FOUND";
    case 408:
      return "TIMEOUT";
    default:
      return status >= 500 ? "LLM_ERROR" : "FETCH_FAILED";
  }
}
|
|
64
|
+
/**
 * Extract the most useful error message from an API error response.
 *
 * The body is read as text and, when it is JSON, probed in this order:
 * `error` as an object (`message`, then `msg`, then the object serialized),
 * `error` as a string, `message`, `detail`. JSON with none of these yields
 * the raw text; non-JSON yields the raw text, or the status line if empty.
 *
 * @param {Response} response - Response whose body has not been consumed.
 * @returns {Promise<string>} Extracted message; falls back to
 *   `HTTP <status> <statusText>` when the body cannot be read.
 */
async function parseErrorBody(response) {
  const statusLine = () => `HTTP ${response.status} ${response.statusText}`;

  let text;
  try {
    text = await response.text();
  } catch {
    return statusLine();
  }

  try {
    const payload = JSON.parse(text);
    const nested = payload.error;
    if (typeof nested === "object" && nested !== null) {
      return String(nested.message ?? nested.msg ?? JSON.stringify(nested));
    }
    for (const key of ["error", "message", "detail"]) {
      if (typeof payload[key] === "string") {
        return payload[key];
      }
    }
    return text;
  } catch {
    // Not JSON, or a JSON `null` that cannot be probed.
    return text || statusLine();
  }
}
|
|
87
|
+
/**
 * Build a ScrapeError describing a failed HTTP response.
 *
 * When `errorMapper` is supplied it receives the parsed JSON body and its
 * return value is used as the message. If the body is not JSON, the mapper
 * throws, or the mapper returns nothing, the generic body parser is used.
 *
 * A response body can be read only once, so an unread clone is kept for the
 * fallback path. Without it the fallback would find the body already
 * consumed by `response.json()` and the upstream error text would be lost.
 *
 * @param {Response} response - Non-OK response from the provider.
 * @param {string} providerName - Name prefixed to the error message.
 * @param {(body: unknown) => string} [errorMapper] - Provider-specific
 *   extractor for the error message.
 * @returns {Promise<ScrapeError>} Error carrying the mapped code and status.
 */
async function createHttpError(response, providerName, errorMapper) {
  const code = getErrorCodeFromStatus(response.status);
  let message;
  if (errorMapper) {
    let spare;
    try {
      spare = response.clone();
    } catch {
      // Body already consumed or locked: nothing left to preserve.
      spare = undefined;
    }

    let mapped;
    try {
      mapped = errorMapper(await response.json());
    } catch {
      mapped = undefined;
    }

    if (mapped != null && mapped !== "") {
      message = mapped;
      // Best effort: release the unread copy of the body.
      spare?.body?.cancel().catch(() => {});
    } else {
      message = await parseErrorBody(spare ?? response);
    }
  } else {
    message = await parseErrorBody(response);
  }
  return new ScrapeError(`${providerName} API error (${response.status}): ${message}`, code, response.status);
}
|
|
101
|
+
|
|
102
|
+
//#endregion
|
|
103
|
+
//#region src/common/resilience.ts
|
|
104
|
+
/**
 * Default retry configuration.
 */
const DEFAULT_RETRY = {
  maxAttempts: 3,
  backoffMs: 1e3,
  backoffMultiplier: 2,
  retryableStatuses: [408, 429, 500, 502, 503, 504]
};

/**
 * System error codes treated as transient failures worth retrying.
 */
const RETRYABLE_ERROR_CODES = [
  "ECONNRESET",
  "ETIMEDOUT",
  "ECONNREFUSED",
  "EPIPE",
  "ENOTFOUND",
  "ENETUNREACH",
  "EAI_AGAIN"
];

/**
 * Decide whether a failure is transient and worth retrying.
 *
 * Checks, in order: a transient system `code`; a numeric `statusCode` or
 * `status` (which decides the outcome on its own); the "TIMEOUT" and
 * "FETCH_FAILED" codes; well-known phrases in the message; and finally a
 * transient system code anywhere in the `cause` chain.
 *
 * The `cause` walk is needed because some runtimes (Node's `fetch` among
 * them) surface network failures as a generic error whose real code lives on
 * `error.cause`.
 *
 * @param {unknown} error - The thrown value.
 * @param {number[]} [retryableStatuses] - HTTP statuses considered retryable.
 * @returns {boolean} True when the error should be retried. Non-Error values
 *   are never retryable.
 */
function isRetryableError(error, retryableStatuses = DEFAULT_RETRY.retryableStatuses) {
  if (!(error instanceof Error)) {
    return false;
  }

  const code = error.code;
  if (code && RETRYABLE_ERROR_CODES.includes(code)) {
    return true;
  }
  if (typeof error.statusCode === "number") {
    return retryableStatuses.includes(error.statusCode);
  }
  if (typeof error.status === "number") {
    return retryableStatuses.includes(error.status);
  }
  if (code === "TIMEOUT" || code === "FETCH_FAILED") {
    return true;
  }

  const message = typeof error.message === "string" ? error.message.toLowerCase() : "";
  const transientPhrases = ["timeout", "rate limit", "too many requests", "temporarily unavailable"];
  if (transientPhrases.some((phrase) => message.includes(phrase))) {
    return true;
  }

  // Depth-limited so a cyclic cause chain cannot loop forever.
  let cause = error.cause;
  for (let depth = 0; depth < 5 && cause !== null && typeof cause === "object"; depth++) {
    if (RETRYABLE_ERROR_CODES.includes(cause.code)) {
      return true;
    }
    cause = cause.cause;
  }
  return false;
}
|
|
150
|
+
/**
 * Pause for a given duration.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
156
|
+
/**
 * Run `fn`, retrying transient failures with jittered exponential backoff.
 *
 * `fn` always runs at least once: a `maxAttempts` below 1 (or NaN) is treated
 * as 1. The delay before retry number `k` is
 * `backoffMs * backoffMultiplier^(k-1)`, scaled by a random factor in
 * [0.9, 1.1).
 *
 * @param {() => Promise<unknown>} fn - Operation to execute.
 * @param {{maxAttempts?: number, backoffMs?: number, backoffMultiplier?: number, retryableStatuses?: number[]}} [config]
 *   Overrides for the default retry settings.
 * @param {(attempt: number, error: Error, delayMs: number) => void} [onRetry]
 *   Called before each wait with the failed attempt number, the error and
 *   the upcoming delay.
 * @returns {Promise<{result: unknown, attempts: number}>} The value produced
 *   by `fn` and the number of attempts it took.
 * @throws {Error} The last failure, once attempts are exhausted or the error
 *   is not retryable. Non-Error throwables are wrapped in an Error.
 */
async function withRetry(fn, config, onRetry) {
  const requestedAttempts = config?.maxAttempts ?? DEFAULT_RETRY.maxAttempts;
  // `>= 1` is false for 0, negatives and NaN, all of which fall back to one attempt.
  const maxAttempts = requestedAttempts >= 1 ? requestedAttempts : 1;
  const backoffMs = config?.backoffMs ?? DEFAULT_RETRY.backoffMs;
  const multiplier = config?.backoffMultiplier ?? DEFAULT_RETRY.backoffMultiplier;
  const retryableStatuses = config?.retryableStatuses ?? DEFAULT_RETRY.retryableStatuses;

  let lastError;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    try {
      return {
        result: await fn(),
        attempts: attempt
      };
    } catch (error) {
      lastError = error instanceof Error ? error : new Error(String(error));
      // Stop when no further attempt would run, so we never sleep for nothing.
      const attemptsExhausted = attempt + 1 > maxAttempts;
      if (attemptsExhausted || !isRetryableError(error, retryableStatuses)) {
        throw lastError;
      }
      const delayMs = backoffMs * multiplier ** (attempt - 1) * (0.9 + Math.random() * 0.2);
      onRetry?.(attempt, lastError, delayMs);
      await sleep(delayMs);
    }
  }
  throw lastError ?? new Error("Retry failed");
}
|
|
179
|
+
/**
 * Run `fn` with an AbortSignal that is aborted after `timeoutMs`.
 *
 * The timeout is cooperative: `fn` is handed the signal and is responsible
 * for honouring it. The timer is always cleared once `fn` settles.
 *
 * @param {(signal: AbortSignal) => Promise<unknown>} fn - Operation to run.
 * @param {number} timeoutMs - Milliseconds before the signal is aborted.
 * @returns {Promise<unknown>} Whatever `fn` resolves to.
 */
async function withTimeout(fn, timeoutMs) {
  const abortController = new AbortController();
  const timer = setTimeout(() => {
    abortController.abort();
  }, timeoutMs);
  try {
    const outcome = await fn(abortController.signal);
    return outcome;
  } finally {
    clearTimeout(timer);
  }
}
|
|
191
|
+
/**
 * Create an AbortSignal that aborts after `timeoutMs`, or as soon as
 * `parentSignal` aborts, whichever happens first.
 *
 * When the parent triggers the abort, its `reason` is forwarded. Once the
 * returned signal has aborted, the timer is cleared and the listener on the
 * parent is removed, so a long-lived parent does not accumulate one stale
 * listener per call.
 *
 * @param {number} timeoutMs - Milliseconds before the signal aborts.
 * @param {AbortSignal} [parentSignal] - Optional signal to chain from.
 * @returns {AbortSignal} The derived signal.
 */
function createTimeoutSignal(timeoutMs, parentSignal) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
  // Do not let a pending timeout keep the process alive (no-op outside Node).
  timeoutId.unref?.();

  if (parentSignal?.aborted) {
    clearTimeout(timeoutId);
    controller.abort(parentSignal.reason);
    return controller.signal;
  }

  const onParentAbort = () => controller.abort(parentSignal.reason);
  parentSignal?.addEventListener("abort", onParentAbort, { once: true });

  controller.signal.addEventListener(
    "abort",
    () => {
      clearTimeout(timeoutId);
      parentSignal?.removeEventListener("abort", onParentAbort);
    },
    { once: true }
  );
  return controller.signal;
}
|
|
214
|
+
/**
 * Circuit breaker settings used when the caller supplies none:
 * open after 5 recorded failures, allow a new attempt 30 seconds later.
 */
const DEFAULT_CIRCUIT_BREAKER = {
  failureThreshold: 5,
  resetTimeoutMs: 30_000
};
|
|
221
|
+
/**
 * Raised when a call is rejected because the circuit breaker is open.
 * Instances carry `isCircuitOpen === true` so they can be recognized without
 * an `instanceof` check.
 */
var CircuitOpenError = class extends Error {
  isCircuitOpen = true;
  name = "CircuitOpenError";

  /**
   * @param {string} message - Description of why the call was rejected.
   */
  constructor(message) {
    super(message);
  }
};
|
|
231
|
+
/**
 * Circuit breaker.
 *
 * Counts failures; once `failureThreshold` is reached the circuit opens and
 * `execute` rejects calls without running them. After `resetTimeoutMs` the
 * circuit reports "half-open" and calls are let through again: a success
 * closes it, a failure re-opens it.
 */
var CircuitBreaker = class {
  state;
  failureThreshold;
  resetTimeoutMs;

  /**
   * @param {{failureThreshold?: number, resetTimeoutMs?: number}} [config]
   *   Overrides for the default threshold and reset delay.
   */
  constructor(config) {
    this.failureThreshold = config?.failureThreshold ?? DEFAULT_CIRCUIT_BREAKER.failureThreshold;
    this.resetTimeoutMs = config?.resetTimeoutMs ?? DEFAULT_CIRCUIT_BREAKER.resetTimeoutMs;
    this.state = { state: "closed", failures: 0 };
  }

  /**
   * Whether calls are currently being rejected.
   *
   * @returns {boolean} True only in the "open" state.
   */
  isOpen() {
    this.updateState();
    return this.state.state === "open";
  }

  /**
   * Current state after applying any time-based transition.
   *
   * @returns {"closed"|"open"|"half-open"}
   */
  getState() {
    this.updateState();
    return this.state.state;
  }

  /**
   * Note a successful call: closes the circuit and clears the failure history.
   */
  recordSuccess() {
    Object.assign(this.state, {
      failures: 0,
      state: "closed",
      lastFailureTime: undefined,
      nextAttemptTime: undefined
    });
  }

  /**
   * Note a failed call; opens the circuit once the threshold is reached.
   */
  recordFailure() {
    const tracker = this.state;
    tracker.failures += 1;
    tracker.lastFailureTime = Date.now();
    if (tracker.failures < this.failureThreshold) {
      return;
    }
    tracker.state = "open";
    tracker.nextAttemptTime = Date.now() + this.resetTimeoutMs;
  }

  /**
   * Run `fn` under circuit breaker protection, recording its outcome.
   *
   * @param {() => Promise<unknown>} fn - Operation to guard.
   * @returns {Promise<unknown>} The value produced by `fn`.
   * @throws {CircuitOpenError} When the circuit is open; `fn` is not called.
   */
  async execute(fn) {
    if (this.isOpen()) {
      const { nextAttemptTime } = this.state;
      const when = nextAttemptTime ? new Date(nextAttemptTime).toISOString() : "unknown";
      throw new CircuitOpenError(`Circuit breaker is open. Next attempt at ${when}`);
    }
    try {
      const outcome = await fn();
      this.recordSuccess();
      return outcome;
    } catch (error) {
      this.recordFailure();
      throw error;
    }
  }

  /**
   * Force the breaker back to a pristine closed state.
   */
  reset() {
    this.state = { state: "closed", failures: 0 };
  }

  /**
   * Apply the time-based open -> half-open transition when it is due.
   */
  updateState() {
    const { state, nextAttemptTime } = this.state;
    const cooledDown = state === "open" && nextAttemptTime && Date.now() >= nextAttemptTime;
    if (cooledDown) {
      this.state.state = "half-open";
    }
  }
};
|
|
311
|
+
/**
 * Token bucket rate limiter.
 *
 * The bucket refills continuously at `requestsPerMinute / 60` tokens per
 * second and holds at most ten seconds' worth of tokens (never fewer than
 * one), which allows short bursts. It starts full.
 */
var RateLimiter = class {
  tokens;
  lastRefill;
  maxTokens;
  refillRate;

  /**
   * @param {{requestsPerMinute?: number}} [config] - Defaults to 60 requests
   *   per minute when omitted.
   */
  constructor(config) {
    const requestsPerSecond = (config?.requestsPerMinute ?? 60) / 60;
    this.maxTokens = Math.max(1, Math.ceil(requestsPerSecond * 10));
    this.refillRate = requestsPerSecond;
    this.tokens = this.maxTokens;
    this.lastRefill = Date.now();
  }

  /**
   * Check whether a single request could proceed, without consuming a token.
   *
   * @returns {boolean}
   */
  canProceed() {
    this.refill();
    return this.tokens >= 1;
  }

  /**
   * Try to take `tokens` from the bucket without waiting.
   *
   * @param {number} [tokens=1] - Number of tokens to take.
   * @returns {boolean} True if the tokens were taken, false if rate limited.
   */
  tryAcquire(tokens = 1) {
    this.refill();
    if (this.tokens < tokens) {
      return false;
    }
    this.tokens -= tokens;
    return true;
  }

  /**
   * Wait until `tokens` are available, then take them.
   *
   * @param {number} [tokens=1] - Number of tokens to take.
   * @returns {Promise<void>}
   * @throws {RangeError} When the request can never be satisfied: it exceeds
   *   the bucket capacity, or the bucket does not refill.
   */
  async acquire(tokens = 1) {
    if (this.tryAcquire(tokens)) {
      return;
    }
    // Fail fast instead of polling forever on an impossible request.
    if (tokens > this.maxTokens) {
      throw new RangeError(`Cannot acquire ${tokens} tokens: bucket capacity is ${this.maxTokens}`);
    }
    if (!(this.refillRate > 0)) {
      throw new RangeError("Cannot wait for tokens: refill rate is not positive");
    }
    while (!this.tryAcquire(tokens)) {
      // tryAcquire just refilled, so this.tokens is current; wait only for the shortfall.
      const shortfall = tokens - this.tokens;
      await sleep(Math.max(1, Math.ceil(shortfall / this.refillRate * 1e3)));
    }
  }

  /**
   * Time until one whole token is available.
   *
   * @returns {number} Milliseconds to wait; 0 when a token is available now.
   *   Accounts for the fraction of a token already accumulated.
   */
  getWaitTime() {
    this.refill();
    if (this.tokens >= 1) {
      return 0;
    }
    return Math.ceil((1 - this.tokens) / this.refillRate * 1e3);
  }

  /**
   * Credit the tokens earned since the last refill, capped at the capacity.
   */
  refill() {
    const now = Date.now();
    // Date.now() is not monotonic; never debit tokens if the clock steps back.
    const elapsedMs = Math.max(0, now - this.lastRefill);
    const earned = elapsedMs / 1e3 * this.refillRate;
    this.tokens = Math.min(this.maxTokens, this.tokens + earned);
    this.lastRefill = now;
  }
};
|
|
374
|
+
/**
 * Counting semaphore that bounds the number of concurrently running tasks.
 * Callers that find no free permit are queued and woken in FIFO order.
 */
var Semaphore = class {
  permits;
  waiting = [];

  /**
   * @param {number} permits - Number of tasks allowed to run at once.
   */
  constructor(permits) {
    this.permits = permits;
  }

  /**
   * Take a permit, queueing until one is released if none is free.
   *
   * @returns {Promise<void>}
   */
  async acquire() {
    if (this.permits > 0) {
      this.permits -= 1;
      return;
    }
    return new Promise((grant) => {
      this.waiting.push(grant);
    });
  }

  /**
   * Give a permit back. If a caller is queued the permit is handed straight
   * to it; otherwise the free permit count grows by one.
   */
  release() {
    const wake = this.waiting.shift();
    if (!wake) {
      this.permits += 1;
      return;
    }
    wake();
  }

  /**
   * Run `fn` while holding a permit, releasing it whatever the outcome.
   *
   * @param {() => Promise<unknown>} fn - Task to run.
   * @returns {Promise<unknown>} The value produced by `fn`.
   */
  async execute(fn) {
    await this.acquire();
    try {
      const outcome = await fn();
      return outcome;
    } finally {
      this.release();
    }
  }
};
|
|
415
|
+
/**
 * Run an async operation with every available resilience feature.
 *
 * Order of operations: reject if the circuit breaker is open, wait for the
 * rate limiter, then (inside the semaphore when one is given) run `fn` with
 * a timeout under the retry policy, and report the final outcome to the
 * circuit breaker.
 *
 * @param fn - The async function to execute; receives the timeout AbortSignal
 * @param config - Retry and timeout settings (`timeoutMs` defaults to 30000)
 * @param state - Pre-instantiated primitives for the stateful features.
 *   The circuit breaker, rate limiter and semaphore are only used when the
 *   caller passes them here; sharing one state object across calls is what
 *   makes failure tracking and rate limiting span those calls.
 * @param callbacks - Optional callbacks; `onRetry` is forwarded to the retry loop
 * @returns The retry result: `{ result, attempts }`
 */
async function withResilience(fn, config, state, callbacks) {
  const timeoutMs = config?.timeoutMs ?? 3e4;

  if (state?.circuitBreaker?.isOpen()) {
    throw new CircuitOpenError("Circuit breaker is open");
  }
  if (state?.rateLimiter) {
    await state.rateLimiter.acquire();
  }

  const runGuarded = async () => {
    const timedAttempt = () => withTimeout(fn, timeoutMs);
    try {
      const outcome = await withRetry(timedAttempt, config?.retry, callbacks?.onRetry);
      state?.circuitBreaker?.recordSuccess();
      return outcome;
    } catch (error) {
      state?.circuitBreaker?.recordFailure();
      throw error;
    }
  };

  return state?.semaphore ? state.semaphore.execute(runGuarded) : runGuarded();
}
|
|
445
|
+
|
|
446
|
+
//#endregion
|
|
447
|
+
//#region src/common/http-base.ts
|
|
448
|
+
/**
|
|
449
|
+
* Shared HTTP provider infrastructure for LLM and Embedding providers.
|
|
450
|
+
* Provides SSRF protection, resilience, and error normalization.
|
|
451
|
+
*/
|
|
452
|
+
/**
 * Host patterns blocked for SSRF protection: private, loopback, link-local,
 * carrier-grade NAT and unspecified addresses, plus `localhost` names.
 * Patterns are applied to the normalized host produced by `isPrivateHost`.
 */
const PRIVATE_IP_PATTERNS = [
  /^10\./,
  /^172\.(1[6-9]|2\d|3[01])\./,
  /^192\.168\./,
  /^127\./,
  /^0\./,
  /^169\.254\./,
  /^100\.(6[4-9]|[7-9]\d|1[01]\d|12[0-7])\./,
  /^::1$/,
  // Unspecified address.
  /^::$/,
  /^(fc|fd)[0-9a-f]{2}:/i,
  // fe80::/10 link-local covers fe80 through febf.
  /^fe[89ab][0-9a-f]:/i,
  // fec0::/10 (deprecated site-local) covers fec0 through feff.
  /^fe[c-f][0-9a-f]:/i,
  /^::ffff:(10\.|172\.(1[6-9]|2\d|3[01])\.|192\.168\.|127\.|0\.)/i,
  /^localhost$/i,
  /\.localhost$/i
];

/**
 * Check whether a hostname or IP literal points at a private/internal target.
 *
 * The input is normalized before matching so that the forms actually
 * produced by `URL#hostname` and by DNS lookups are recognized:
 * - surrounding brackets of IPv6 literals are removed (`[::1]` -> `::1`);
 * - an IPv6 zone id (`%eth0`) and a single trailing dot are dropped;
 * - IPv4-mapped IPv6 addresses (`::ffff:127.0.0.1` or the hex form
 *   `::ffff:7f00:1`) are judged by their embedded IPv4 address.
 *
 * Only the compressed `::ffff:` spelling of mapped addresses is handled.
 *
 * @param {string} hostname - Hostname or IP literal.
 * @returns {boolean} True when the host matches a blocked range or name.
 */
function isPrivateHost(hostname) {
  let host = String(hostname).trim().toLowerCase();
  if (host.startsWith("[") && host.endsWith("]")) {
    host = host.slice(1, -1);
  }
  const zoneAt = host.indexOf("%");
  if (zoneAt !== -1) {
    host = host.slice(0, zoneAt);
  }
  if (host.endsWith(".")) {
    host = host.slice(0, -1);
  }

  const mapped = /^::ffff:(?:(\d{1,3}(?:\.\d{1,3}){3})|([0-9a-f]{1,4}):([0-9a-f]{1,4}))$/.exec(host);
  if (mapped) {
    if (mapped[1] !== undefined) {
      host = mapped[1];
    } else {
      // Hex form: two 16-bit groups carry the four IPv4 octets.
      const high = Number.parseInt(mapped[2], 16);
      const low = Number.parseInt(mapped[3], 16);
      host = [high >> 8, high & 255, low >> 8, low & 255].join(".");
    }
  }

  return PRIVATE_IP_PATTERNS.some((pattern) => pattern.test(host));
}
|
|
476
|
+
/**
 * Validate a URL before it is used for an outbound request.
 *
 * Only `http:` and `https:` URLs are accepted, so disabling `requireHttps`
 * permits plain HTTP but not other schemes such as `file:` or `ftp:`.
 *
 * @param {string} url - URL to validate.
 * @param {{requireHttps?: boolean, allowPrivate?: boolean}} [options]
 *   `requireHttps` defaults to true, `allowPrivate` to false.
 * @returns {URL} The parsed URL.
 * @throws {ScrapeError} "INVALID_URL" when the URL cannot be parsed;
 *   "VALIDATION_ERROR" when the scheme is not allowed or the host is a
 *   private/internal address.
 */
function validateUrl(url, options = {}) {
  const requireHttps = options.requireHttps ?? true;
  const allowPrivate = options.allowPrivate ?? false;

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw new ScrapeError(`Invalid URL: ${url}`, "INVALID_URL");
  }

  if (requireHttps && parsed.protocol !== "https:") {
    throw new ScrapeError(`HTTPS required. Got: ${parsed.protocol}`, "VALIDATION_ERROR");
  }
  if (parsed.protocol !== "https:" && parsed.protocol !== "http:") {
    throw new ScrapeError(`Unsupported protocol: ${parsed.protocol}`, "VALIDATION_ERROR");
  }
  if (!allowPrivate && isPrivateHost(parsed.hostname)) {
    throw new ScrapeError(`Private/internal addresses not allowed: ${parsed.hostname}`, "VALIDATION_ERROR");
  }
  return parsed;
}
|
|
492
|
+
/**
 * Validate a URL and, for hostnames, resolve DNS to make sure none of the
 * resolved addresses is private.
 *
 * IP-literal hosts skip the lookup because `validateUrl` has already checked
 * them. `URL#hostname` wraps IPv6 literals in brackets, so the brackets are
 * removed before asking `net.isIP`; otherwise every IPv6 URL would be sent
 * to the resolver and fail.
 *
 * NOTE(review): the addresses checked here are not pinned for the request
 * that follows, so this does not by itself defend against a host whose DNS
 * answer changes between the check and the connection.
 *
 * @param {string} url - URL to validate.
 * @param {{requireHttps?: boolean, allowPrivate?: boolean, resolveDns?: boolean}} [options]
 *   `resolveDns` defaults to true; the lookup is skipped when it is false or
 *   when `allowPrivate` is true.
 * @returns {Promise<void>} Resolves when the URL is acceptable.
 * @throws {ScrapeError} Whatever `validateUrl` throws; "VALIDATION_ERROR"
 *   when DNS resolves to a private address; "FETCH_FAILED" when the
 *   hostname cannot be resolved.
 */
async function validateUrlWithDns(url, options = {}) {
  const parsed = validateUrl(url, options);
  const resolveDns = options.resolveDns ?? true;
  const allowPrivate = options.allowPrivate ?? false;
  if (!resolveDns || allowPrivate) {
    return;
  }

  const host = parsed.hostname;
  const literal = host.startsWith("[") && host.endsWith("]") ? host.slice(1, -1) : host;
  if (node_net.isIP(literal)) {
    return;
  }

  try {
    const addresses = await node_dns.promises.lookup(host, { all: true });
    for (const addr of addresses) {
      if (isPrivateHost(addr.address)) {
        throw new ScrapeError(`DNS resolved to private address: ${host} -> ${addr.address}`, "VALIDATION_ERROR");
      }
    }
  } catch (error) {
    if (error instanceof ScrapeError) {
      throw error;
    }
    throw new ScrapeError(`Failed to resolve hostname: ${host} (${error instanceof Error ? error.message : String(error)})`, "FETCH_FAILED");
  }
}
|
|
510
|
+
/**
 * Base class for HTTP-backed providers, bundling URL security checks with
 * resilience (circuit breaker, rate limiter, concurrency limit, timeout and
 * optional retries).
 */
var BaseHttpProvider = class {
  baseUrl;
  model;
  headers;
  errorMapper;
  requireHttps;
  allowPrivate;
  resolveDns;
  allowRedirects;
  timeoutMs;
  retryConfig;
  concurrency;
  circuitBreaker;
  rateLimiter;
  semaphore;

  /**
   * @param config - Provider configuration. `baseUrl` has one trailing slash
   *   removed and is validated at the end of construction (HTTPS required
   *   and private hosts rejected unless configured otherwise). Resilience
   *   primitives passed in `resilience.state` take precedence over ones
   *   built from `resilience.circuitBreaker` / `resilience.rateLimit`; a
   *   semaphore sized by `resilience.concurrency` (default 1) is always
   *   present.
   */
  constructor(config) {
    const resilience = config.resilience;
    const shared = resilience?.state;

    this.baseUrl = config.baseUrl.replace(/\/$/, "");
    this.model = config.model;
    this.headers = { "Content-Type": "application/json", ...config.headers };
    this.errorMapper = config.errorMapper;

    this.requireHttps = config.requireHttps ?? true;
    this.allowPrivate = config.allowPrivate ?? false;
    this.resolveDns = config.resolveDns ?? true;
    this.allowRedirects = config.allowRedirects ?? false;

    this.timeoutMs = resilience?.timeoutMs ?? 3e4;
    this.retryConfig = resilience?.retry;
    this.concurrency = resilience?.concurrency ?? 1;

    this.circuitBreaker = shared?.circuitBreaker
      ?? (resilience?.circuitBreaker ? new CircuitBreaker(resilience.circuitBreaker) : undefined);
    this.rateLimiter = shared?.rateLimiter
      ?? (resilience?.rateLimit ? new RateLimiter(resilience.rateLimit) : undefined);
    this.semaphore = shared?.semaphore ?? new Semaphore(this.concurrency);

    validateUrl(this.baseUrl, {
      requireHttps: this.requireHttps,
      allowPrivate: this.allowPrivate
    });
  }

  /**
   * Expose the resilience primitives so they can be shared with other
   * providers or reused across instances.
   */
  getResilienceState() {
    const { circuitBreaker, rateLimiter, semaphore } = this;
    return { circuitBreaker, rateLimiter, semaphore };
  }

  /**
   * Make a JSON HTTP request with security checks and resilience.
   *
   * Sequence: validate the URL (including DNS), reject if the circuit is
   * open, wait for the rate limiter, then inside the semaphore run the
   * request with a timeout (under the retry policy when one is configured)
   * and report the outcome to the circuit breaker.
   *
   * @param url - Absolute URL to request.
   * @param options - `method` (default "POST"), extra `headers`, a `body`
   *   that is JSON-serialized when truthy, and an optional `signal` that is
   *   combined with the timeout signal.
   * @returns `{ data, status, headers }` where `data` is the parsed JSON body.
   * @throws ScrapeError for validation failures and non-OK responses;
   *   CircuitOpenError when the circuit breaker is open.
   */
  async fetch(url, options = {}) {
    const securityOptions = {
      requireHttps: this.requireHttps,
      allowPrivate: this.allowPrivate,
      resolveDns: this.resolveDns,
      allowRedirects: this.allowRedirects
    };
    await validateUrlWithDns(url, securityOptions);

    if (this.circuitBreaker?.isOpen()) {
      throw new CircuitOpenError("Circuit breaker is open. Too many recent failures.");
    }
    if (this.rateLimiter) {
      await this.rateLimiter.acquire();
    }

    // One network round trip, bounded by `signal`.
    const doFetch = async (signal) => {
      const requestSignal = options.signal ? AbortSignal.any([options.signal, signal]) : signal;
      const requestInit = {
        method: options.method ?? "POST",
        headers: { ...this.headers, ...options.headers },
        body: options.body ? JSON.stringify(options.body) : undefined,
        signal: requestSignal,
        redirect: this.allowRedirects ? "follow" : "error"
      };
      const response = await fetch(url, requestInit);

      // With redirects enabled the final URL is only known after the redirect
      // has been followed, so it is validated after the fact.
      if (this.allowRedirects && response.redirected) {
        await validateUrlWithDns(response.url, securityOptions);
      }
      if (!response.ok) {
        throw await createHttpError(response, this.constructor.name, this.errorMapper);
      }
      const data = await response.json();
      return { data, status: response.status, headers: response.headers };
    };

    if (!this.semaphore) {
      throw new ScrapeError("Semaphore not initialized", "VALIDATION_ERROR");
    }
    return this.semaphore.execute(async () => {
      const timedAttempt = () => withTimeout((signal) => doFetch(signal), this.timeoutMs);
      try {
        const result = this.retryConfig
          ? (await withRetry(timedAttempt, this.retryConfig)).result
          : await timedAttempt();
        this.circuitBreaker?.recordSuccess();
        return result;
      } catch (error) {
        this.circuitBreaker?.recordFailure();
        throw error;
      }
    });
  }
};
|
|
616
|
+
|
|
617
|
+
//#endregion
|
|
618
|
+
// Public surface of this chunk, exposed as enumerable live getters.
// NOTE(review): each export deliberately stays a literal
// `Object.defineProperty(exports, '<name>', { enumerable, get })` call.
// Presumably this is the shape tools use to statically detect CommonJS named
// exports; confirm before collapsing these into a loop.
Object.defineProperty(exports, 'BaseHttpProvider', { enumerable: true, get: function () { return BaseHttpProvider; } });
Object.defineProperty(exports, 'CircuitBreaker', { enumerable: true, get: function () { return CircuitBreaker; } });
Object.defineProperty(exports, 'CircuitOpenError', { enumerable: true, get: function () { return CircuitOpenError; } });
Object.defineProperty(exports, 'RateLimiter', { enumerable: true, get: function () { return RateLimiter; } });
Object.defineProperty(exports, 'ScrapeError', { enumerable: true, get: function () { return ScrapeError; } });
Object.defineProperty(exports, 'Semaphore', { enumerable: true, get: function () { return Semaphore; } });
Object.defineProperty(exports, 'createTimeoutSignal', { enumerable: true, get: function () { return createTimeoutSignal; } });
Object.defineProperty(exports, 'isRetryableError', { enumerable: true, get: function () { return isRetryableError; } });
Object.defineProperty(exports, 'withResilience', { enumerable: true, get: function () { return withResilience; } });
Object.defineProperty(exports, 'withRetry', { enumerable: true, get: function () { return withRetry; } });
Object.defineProperty(exports, 'withTimeout', { enumerable: true, get: function () { return withTimeout; } });
|
|
684
|
+
//# sourceMappingURL=http-base-CHLf-Tco.cjs.map
|