crawl4ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +429 -0
- package/dist/errors.d.ts +96 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +483 -0
- package/dist/sdk.d.ts +238 -0
- package/dist/types.d.ts +285 -0
- package/package.json +67 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
// src/errors.ts
/**
 * Base error type for every SDK failure. Optionally carries the HTTP
 * status, status text, parsed response payload, and (set later by the
 * client) the originating request metadata.
 */
class Crawl4AIError extends Error {
  status;
  statusText;
  data;
  request;
  constructor(message, status, statusText, data) {
    super(message);
    this.name = "Crawl4AIError";
    // Copy only the optional fields that were actually supplied.
    if (status !== undefined) this.status = status;
    if (statusText !== undefined) this.statusText = statusText;
    if (data !== undefined) this.data = data;
  }
}
|
|
21
|
+
|
|
22
|
+
/**
 * Raised when the transport layer fails (DNS failure, connection reset,
 * etc.). The low-level error, when provided, is preserved on `cause`.
 */
class NetworkError extends Crawl4AIError {
  constructor(message, cause) {
    super(message);
    this.name = "NetworkError";
    // Keep the original error around for callers that want to inspect it.
    if (cause) this.cause = cause;
  }
}
|
|
31
|
+
|
|
32
|
+
/**
 * Raised when a request exceeds its timeout budget. The timeout (ms) is
 * exposed on the error; the URL, if known, is embedded in the message.
 */
class TimeoutError extends NetworkError {
  timeout;
  constructor(timeout, url) {
    super(
      url
        ? `Request to ${url} timed out after ${timeout}ms`
        : `Request timed out after ${timeout}ms`
    );
    this.name = "TimeoutError";
    this.timeout = timeout;
  }
}
|
|
41
|
+
|
|
42
|
+
/**
 * Raised for client-side input validation failures. Always mapped to
 * HTTP 400 semantics; records the offending field and value when given.
 */
class RequestValidationError extends Crawl4AIError {
  field;
  value;
  constructor(message, field, value) {
    super(message, 400, "Bad Request");
    this.name = "RequestValidationError";
    // Only record the optional diagnostics that were supplied.
    if (field !== undefined) this.field = field;
    if (value !== undefined) this.value = value;
  }
}
|
|
56
|
+
|
|
57
|
+
/**
 * Raised on HTTP 429. Extracts the standard x-ratelimit-* headers
 * (limit, remaining, reset epoch-seconds) when they are present.
 */
class RateLimitError extends Crawl4AIError {
  retryAfter;
  limit;
  remaining;
  reset;
  constructor(message, retryAfter, headers) {
    super(message, 429, "Too Many Requests");
    this.name = "RateLimitError";
    if (retryAfter !== undefined) this.retryAfter = retryAfter;
    // Pull rate-limit metadata out of the response headers when available.
    const rawLimit = headers?.["x-ratelimit-limit"];
    if (rawLimit) this.limit = parseInt(rawLimit, 10);
    const rawRemaining = headers?.["x-ratelimit-remaining"];
    if (rawRemaining) this.remaining = parseInt(rawRemaining, 10);
    const rawReset = headers?.["x-ratelimit-reset"];
    // Header value is epoch seconds; Date wants milliseconds.
    if (rawReset) this.reset = new Date(parseInt(rawReset, 10) * 1000);
  }
}
|
|
81
|
+
|
|
82
|
+
/**
 * Raised for authentication/authorization failures (HTTP 401 or 403).
 */
class AuthError extends Crawl4AIError {
  constructor(message = "Authentication failed", status = 401) {
    // 401 => Unauthorized, anything else (403) => Forbidden.
    const statusText = status === 401 ? "Unauthorized" : "Forbidden";
    super(message, status, statusText);
    this.name = "AuthError";
  }
}
|
|
88
|
+
|
|
89
|
+
/**
 * Raised for 5xx responses. Defaults to a generic 500 when no status or
 * reason phrase is supplied.
 */
class ServerError extends Crawl4AIError {
  constructor(message = "Internal server error", status = 500, statusText) {
    const text = statusText || "Internal Server Error";
    super(message, status, text);
    this.name = "ServerError";
  }
}
|
|
95
|
+
|
|
96
|
+
/**
 * Raised on HTTP 404. Records the missing resource name when known and
 * embeds it in the message.
 */
class NotFoundError extends Crawl4AIError {
  resource;
  constructor(resource) {
    super(
      resource ? `Resource not found: ${resource}` : "Resource not found",
      404,
      "Not Found"
    );
    this.name = "NotFoundError";
    if (resource) this.resource = resource;
  }
}
|
|
107
|
+
|
|
108
|
+
/**
 * Raised when a response body cannot be parsed. Keeps the raw response
 * text (when provided) for debugging.
 */
class ParseError extends Crawl4AIError {
  responseText;
  constructor(message, responseText) {
    super(message);
    this.name = "ParseError";
    if (responseText) this.responseText = responseText;
  }
}
|
|
118
|
+
/** Type guard: true when `error` is any SDK error (base-class check). */
function isCrawl4AIError(error) {
  return error instanceof Crawl4AIError;
}
|
|
121
|
+
/** Type guard: true when `error` is a RateLimitError (HTTP 429). */
function isRateLimitError(error) {
  return error instanceof RateLimitError;
}
|
|
124
|
+
/** Type guard: true when `error` is an AuthError (HTTP 401/403). */
function isAuthError(error) {
  return error instanceof AuthError;
}
|
|
127
|
+
/** Type guard: true when `error` is a NetworkError (includes TimeoutError). */
function isNetworkError(error) {
  return error instanceof NetworkError;
}
|
|
130
|
+
/**
 * Map an HTTP status code to the most specific SDK error class.
 * Falls back to the base Crawl4AIError for unrecognized statuses.
 */
function createHttpError(status, statusText, message, data, headers) {
  const errorMessage = message || `HTTP ${status}: ${statusText}`;
  if (status === 400) {
    return new RequestValidationError(errorMessage);
  }
  if (status === 401 || status === 403) {
    return new AuthError(errorMessage, status);
  }
  if (status === 404) {
    return new NotFoundError();
  }
  if (status === 429) {
    // Honor the server's Retry-After hint (seconds) when present.
    const retryAfter = headers?.["retry-after"]
      ? parseInt(headers["retry-after"], 10)
      : undefined;
    return new RateLimitError(errorMessage, retryAfter, headers);
  }
  if (status === 500 || status === 502 || status === 503 || status === 504) {
    return new ServerError(errorMessage, status, statusText);
  }
  return new Crawl4AIError(errorMessage, status, statusText, data);
}
|
|
154
|
+
// src/sdk.ts
// Default per-request timeout: 5 minutes (crawls can be slow).
var DEFAULT_TIMEOUT = 300000;
// Default number of retry attempts for retryable failures.
var DEFAULT_RETRIES = 3;
// Base delay between retries, in milliseconds.
var DEFAULT_RETRY_DELAY = 1000;
// Exponential-backoff factor applied per retry attempt.
var RETRY_BACKOFF_MULTIPLIER = 2;
// Shorter timeout used by testConnection()'s health probe.
var HEALTH_CHECK_TIMEOUT = 5000;
// 4xx range bounds: client errors are treated as non-retryable
// (except 429, which is retried with backoff / Retry-After).
var CLIENT_ERROR_MIN = 400;
var CLIENT_ERROR_MAX = 500;
var RATE_LIMIT_STATUS = 429;
|
|
163
|
+
|
|
164
|
+
/**
 * Crawl4AI REST API client (compiled output of src/sdk.ts).
 *
 * Wraps fetch with timeout handling, typed error mapping (errors above),
 * and exponential-backoff retries for retryable failures.
 */
class Crawl4AI {
  // Fully resolved configuration: defaults merged with the caller's config.
  config;
  constructor(config) {
    // baseUrl is the only required option, and it must parse as a URL.
    if (!config.baseUrl) {
      throw new RequestValidationError("baseUrl is required in configuration", "baseUrl");
    }
    try {
      new URL(config.baseUrl);
    } catch {
      throw new RequestValidationError(`Invalid baseUrl: ${config.baseUrl}`, "baseUrl", config.baseUrl);
    }
    const defaults = {
      apiToken: "",
      timeout: DEFAULT_TIMEOUT,
      retries: DEFAULT_RETRIES,
      retryDelay: DEFAULT_RETRY_DELAY,
      defaultHeaders: { "Content-Type": "application/json" },
      throwOnError: true,
      // By default any status below 400 counts as success.
      validateStatus: (status) => status < CLIENT_ERROR_MIN,
      debug: false
    };
    // Validate numeric options before merging them over the defaults.
    if (config.timeout !== undefined && (config.timeout <= 0 || !Number.isFinite(config.timeout))) {
      throw new RequestValidationError("timeout must be a positive number", "timeout", config.timeout);
    }
    if (config.retries !== undefined && (config.retries < 0 || !Number.isInteger(config.retries))) {
      throw new RequestValidationError("retries must be a non-negative integer", "retries", config.retries);
    }
    if (config.retryDelay !== undefined && (config.retryDelay < 0 || !Number.isFinite(config.retryDelay))) {
      throw new RequestValidationError("retryDelay must be a non-negative number", "retryDelay", config.retryDelay);
    }
    this.config = {
      ...defaults,
      ...config,
      // Strip a single trailing slash so endpoint paths concatenate cleanly.
      baseUrl: config.baseUrl.replace(/\/$/, ""),
      // Merge (not replace) caller headers over the defaults.
      defaultHeaders: {
        ...defaults.defaultHeaders,
        ...config.defaultHeaders
      },
      // ?? so an explicit `false` from the caller is respected.
      throwOnError: config.throwOnError ?? defaults.throwOnError,
      validateStatus: config.validateStatus || defaults.validateStatus
    };
    if (this.config.apiToken) {
      this.config.defaultHeaders.Authorization = `Bearer ${this.config.apiToken}`;
    }
  }
  // Throws RequestValidationError unless `url` parses as an absolute URL.
  validateUrl(url) {
    try {
      new URL(url);
    } catch {
      throw new RequestValidationError(`Invalid URL: ${url}`, "url", url);
    }
  }
  // Console logging, active only when config.debug is set.
  log(message, data) {
    if (this.config.debug) {
      console.log(`[Crawl4AI] ${message}`, data || "");
    }
  }
  // Coerce the server's varying response shapes (bare array, {results: [...]},
  // {result: [...]}) into a plain array; anything else is wrapped as [response].
  normalizeArrayResponse(response) {
    if (Array.isArray(response)) {
      return response;
    }
    if (typeof response === "object" && response !== null) {
      const apiResponse = response;
      if (apiResponse.results && Array.isArray(apiResponse.results)) {
        return apiResponse.results;
      }
      if (apiResponse.result && Array.isArray(apiResponse.result)) {
        return apiResponse.result;
      }
    }
    return [response];
  }
  // Build a query string from an object, skipping undefined values.
  buildQueryParams(params) {
    const searchParams = new URLSearchParams;
    for (const [key, value] of Object.entries(params)) {
      if (value !== undefined) {
        searchParams.append(key, String(value));
      }
    }
    return searchParams.toString();
  }
  /**
   * Low-level HTTP call: applies default headers, enforces a timeout via
   * AbortController, parses the body by content type, and maps failing
   * statuses to typed errors via createHttpError.
   *
   * NOTE(review): when the caller supplies its own `signal`, it replaces the
   * internal controller's signal entirely, so the timeout abort below is
   * never observed by fetch — confirm whether the timeout should still apply
   * in that case.
   */
  async request(endpoint, options = {}) {
    const url = `${this.config.baseUrl}${endpoint}`;
    const { timeout = this.config.timeout, signal, headers, ...fetchOptions } = options;
    this.log(`Request: ${fetchOptions.method || "GET"} ${url}`, fetchOptions.body);
    const requestHeaders = {
      ...this.config.defaultHeaders,
      ...headers
    };
    const controller = new AbortController;
    const timeoutId = setTimeout(() => controller.abort(), timeout);
    const requestSignal = signal || controller.signal;
    try {
      const response = await fetch(url, {
        ...fetchOptions,
        headers: requestHeaders,
        signal: requestSignal
      });
      clearTimeout(timeoutId);
      const contentType = response.headers.get("content-type") || "";
      let responseData;
      if (contentType.includes("application/json")) {
        responseData = await response.json();
      } else if (contentType.includes("text/html") || contentType.includes("text/plain")) {
        responseData = await response.text();
      } else if (contentType.includes("text/event-stream")) {
        // Streaming responses are handed back raw for the caller to consume.
        return response;
      } else {
        responseData = await response.text();
      }
      this.log(`Response: ${response.status}`, responseData);
      if (!this.config.validateStatus(response.status)) {
        // Snapshot headers into a plain object for the error constructors.
        const headers2 = {};
        response.headers.forEach((value, key) => {
          headers2[key] = value;
        });
        const error = createHttpError(response.status, response.statusText, undefined, responseData, headers2);
        error.request = {
          url,
          method: fetchOptions.method || "GET",
          headers: requestHeaders,
          body: fetchOptions.body
        };
        if (this.config.throwOnError) {
          throw error;
        }
        // With throwOnError disabled we fall through: the parsed error body
        // is returned as-is and the constructed error object is discarded.
      }
      return responseData;
    } catch (error) {
      clearTimeout(timeoutId);
      // An abort here is attributed to the timeout (see NOTE above re: caller signals).
      if (error instanceof Error && error.name === "AbortError") {
        throw new TimeoutError(timeout, url);
      }
      // fetch surfaces transport failures as TypeError; rewrap for callers.
      if (error instanceof TypeError && error.message.includes("fetch")) {
        throw new NetworkError(`Network request failed: ${error.message}`, error);
      }
      throw error;
    }
  }
  /**
   * request() plus retries: up to config.retries extra attempts with
   * exponential backoff. Non-429 4xx errors are thrown immediately; a 429
   * with Retry-After overrides the backoff delay.
   */
  async requestWithRetry(endpoint, options = {}) {
    let lastError = new Error("No attempts made");
    for (let attempt = 0; attempt <= this.config.retries; attempt++) {
      try {
        return await this.request(endpoint, options);
      } catch (error) {
        lastError = error;
        // Client errors (4xx except 429) won't succeed on retry — fail fast.
        if (error instanceof Crawl4AIError && error.status && error.status >= CLIENT_ERROR_MIN && error.status < CLIENT_ERROR_MAX && error.status !== RATE_LIMIT_STATUS) {
          throw error;
        }
        if (attempt < this.config.retries) {
          let delay = this.config.retryDelay * RETRY_BACKOFF_MULTIPLIER ** attempt;
          if (error instanceof RateLimitError && error.retryAfter) {
            // Server's Retry-After (seconds) takes precedence over backoff.
            delay = error.retryAfter * 1000;
            this.log(`Rate limited. Waiting ${error.retryAfter}s before retry (attempt ${attempt + 1}/${this.config.retries})`);
          } else {
            this.log(`Retry attempt ${attempt + 1}/${this.config.retries} after ${delay}ms`);
          }
          await new Promise((resolve) => setTimeout(resolve, delay));
        }
      }
    }
    throw lastError;
  }
  /**
   * POST /crawl — crawl one or more URLs. Accepts a single URL or an array;
   * always sends an array and returns a normalized array of results.
   */
  async crawl(request, config) {
    const urls = Array.isArray(request.urls) ? request.urls : [request.urls];
    for (const url of urls) {
      this.validateUrl(url);
    }
    const normalizedRequest = {
      ...request,
      urls
    };
    const response = await this.requestWithRetry("/crawl", {
      method: "POST",
      body: JSON.stringify(normalizedRequest),
      ...config
    });
    return this.normalizeArrayResponse(response);
  }
  /** POST /md — extract markdown; unwraps {markdown} object responses. */
  async markdown(request, config) {
    this.validateUrl(request.url);
    const response = await this.requestWithRetry("/md", {
      method: "POST",
      body: JSON.stringify(request),
      ...config
    });
    return typeof response === "string" ? response : response.markdown;
  }
  /** POST /html — extract HTML; unwraps {html} object responses. */
  async html(request, config) {
    this.validateUrl(request.url);
    const response = await this.requestWithRetry("/html", {
      method: "POST",
      body: JSON.stringify(request),
      ...config
    });
    return typeof response === "string" ? response : response.html;
  }
  /** POST /execute_js — run scripts on a page; returns the raw result. */
  async executeJs(request, config) {
    this.validateUrl(request.url);
    return this.requestWithRetry("/execute_js", {
      method: "POST",
      body: JSON.stringify(request),
      ...config
    });
  }
  /**
   * GET /ask — fetch library context. Joins the returned result texts into
   * a single `context` string and reports the count and context type.
   */
  async ask(params, config) {
    const queryString = this.buildQueryParams({
      context_type: params?.context_type,
      query: params?.query,
      score_ratio: params?.score_ratio,
      max_results: params?.max_results
    });
    const endpoint = `/ask${queryString ? `?${queryString}` : ""}`;
    const response = await this.requestWithRetry(endpoint, {
      method: "GET",
      ...config
    });
    // The server may populate any one of these result buckets.
    const results = response.doc_results || response.code_results || response.all_results || [];
    const result = {
      context: results.map((r) => r.text).join(`

`),
      type: params?.context_type || "doc",
      results_count: results.length
    };
    if (params?.query !== undefined) {
      result.query = params.query;
    }
    return result;
  }
  /** GET /llm/{url}?q= — ask an LLM about a page; unwraps {answer}. */
  async llm(url, query, config) {
    this.validateUrl(url);
    const encodedUrl = encodeURIComponent(url);
    const queryParams = new URLSearchParams({ q: query });
    const response = await this.requestWithRetry(`/llm/${encodedUrl}?${queryParams.toString()}`, {
      method: "GET",
      ...config
    });
    return typeof response === "string" ? response : response.answer || "";
  }
  /** GET /health — no retries: used as a liveness probe. */
  async health(config) {
    return this.request("/health", {
      method: "GET",
      ...config
    });
  }
  /** GET /metrics — Prometheus metrics text. */
  async metrics(config) {
    return this.request("/metrics", {
      method: "GET",
      ...config
    });
  }
  /** GET /schema — API schema. */
  async schema(config) {
    return this.request("/schema", {
      method: "GET",
      ...config
    });
  }
  /** GET / — root endpoint information. */
  async getRoot(config) {
    return this.request("/", {
      method: "GET",
      ...config
    });
  }
  /**
   * Probe /health with a short timeout; returns true/false, or rethrows
   * when options.throwOnError is set.
   */
  async testConnection(options) {
    try {
      await this.health({ timeout: HEALTH_CHECK_TIMEOUT });
      return true;
    } catch (error) {
      if (options?.throwOnError) {
        throw error;
      }
      return false;
    }
  }
  /**
   * Read the server version from /health; returns "unknown" on failure
   * unless options.throwOnError is set.
   */
  async version(options) {
    try {
      const health = await this.health();
      return health.version || "unknown";
    } catch (error) {
      if (options?.throwOnError) {
        throw error;
      }
      return "unknown";
    }
  }
  // Replace the API token; an empty token removes the Authorization header.
  setApiToken(token) {
    this.config.apiToken = token;
    if (token) {
      this.config.defaultHeaders.Authorization = `Bearer ${token}`;
    } else {
      delete this.config.defaultHeaders.Authorization;
    }
  }
  // Replace the base URL (trailing slash stripped, as in the constructor).
  setBaseUrl(baseUrl) {
    this.config.baseUrl = baseUrl.replace(/\/$/, "");
  }
  // Toggle debug logging.
  setDebug(debug) {
    this.config.debug = debug;
  }
}
|
|
465
|
+
// Public surface: the client class is both the default export and a named
// export, alongside all error classes, type guards, and createHttpError.
var sdk_default = Crawl4AI;
export {
  isRateLimitError,
  isNetworkError,
  isCrawl4AIError,
  isAuthError,
  sdk_default as default,
  createHttpError,
  TimeoutError,
  ServerError,
  RequestValidationError,
  RateLimitError,
  ParseError,
  NotFoundError,
  NetworkError,
  Crawl4AIError,
  Crawl4AI,
  AuthError
};
|
package/dist/sdk.d.ts
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Crawl4AI TypeScript SDK
|
|
3
|
+
* A comprehensive SDK for interacting with Crawl4AI REST API
|
|
4
|
+
*/
|
|
5
|
+
import type { AskRequest, AskResponse, Crawl4AIConfig, CrawlRequest, CrawlResult, ExecuteJsRequest, HealthResponse, HtmlRequest, MarkdownRequest, RequestConfig } from './types';
|
|
6
|
+
/**
|
|
7
|
+
* Crawl4AI SDK Client - Main class for interacting with Crawl4AI REST API
|
|
8
|
+
*
|
|
9
|
+
* Provides methods for web crawling, content extraction, and various
|
|
10
|
+
* web automation tasks through the Crawl4AI service.
|
|
11
|
+
*
|
|
12
|
+
* @example Basic usage
|
|
13
|
+
* ```typescript
|
|
14
|
+
* const client = new Crawl4AI({
|
|
15
|
+
* baseUrl: 'https://example.com',
|
|
16
|
+
* apiToken: 'your_token_here'
|
|
17
|
+
* });
|
|
18
|
+
*
|
|
19
|
+
* const result = await client.crawl({
|
|
20
|
+
* urls: 'https://example.com',
|
|
21
|
+
* browser_config: { headless: true }
|
|
22
|
+
* });
|
|
23
|
+
* ```
|
|
24
|
+
*
|
|
25
|
+
* @example With custom configuration
|
|
26
|
+
* ```typescript
|
|
27
|
+
* const client = new Crawl4AI({
|
|
28
|
+
* baseUrl: 'http://localhost:11235',
|
|
29
|
+
* timeout: 60000,
|
|
30
|
+
* retries: 5,
|
|
31
|
+
* debug: true
|
|
32
|
+
* });
|
|
33
|
+
* ```
|
|
34
|
+
*/
|
|
35
|
+
export declare class Crawl4AI {
    /** Resolved configuration (defaults merged with the constructor's config). */
    private config;
    /**
     * Create a new Crawl4AI client instance
     *
     * @param config - Client configuration options
     * @param config.baseUrl - Base URL of Crawl4AI server
     * @param config.apiToken - Optional API token for authentication
     * @param config.timeout - Request timeout in milliseconds (default: 300000)
     * @param config.retries - Number of retry attempts (default: 3)
     * @param config.retryDelay - Delay between retries in milliseconds (default: 1000)
     * @param config.debug - Enable debug logging (default: false)
     * @param config.throwOnError - Throw on HTTP errors (default: true)
     */
    constructor(config: Crawl4AIConfig);
    /**
     * Validate URL format
     */
    private validateUrl;
    /**
     * Log debug information
     */
    private log;
    /**
     * Normalize different API response formats to a consistent array
     */
    private normalizeArrayResponse;
    /**
     * Build query parameters from an object, filtering out undefined values
     */
    private buildQueryParams;
    /** Low-level HTTP call: headers, timeout, body parsing, error mapping. */
    private request;
    /** request() plus retries with exponential backoff and Retry-After support. */
    private requestWithRetry;
    /**
     * Main crawl endpoint - Extract content from one or more URLs
     *
     * @param request - Crawl configuration including URLs and options
     * @param config - Optional request configuration (timeout, headers, etc.)
     * @returns Promise resolving to array of crawl results
     *
     * @example
     * ```typescript
     * const results = await client.crawl({
     *   urls: ['https://example.com'],
     *   browser_config: { headless: true },
     *   crawler_config: { cache_mode: 'bypass' }
     * });
     * ```
     *
     * @throws {RequestValidationError} If URLs are invalid
     * @throws {NetworkError} If network request fails
     * @throws {TimeoutError} If request times out
     */
    crawl(request: CrawlRequest, config?: RequestConfig): Promise<CrawlResult[]>;
    /**
     * Get markdown content from URL with optional filtering
     *
     * @param request - Markdown extraction configuration
     * @param request.url - URL to extract markdown from
     * @param request.f - Content filter: 'raw' | 'fit' | 'bm25' | 'llm'
     * @param request.q - Query for BM25/LLM filtering
     * @param config - Optional request configuration
     * @returns Promise resolving to markdown string
     *
     * @example
     * ```typescript
     * const markdown = await client.markdown({
     *   url: 'https://example.com',
     *   f: 'fit'
     * });
     * ```
     */
    markdown(request: MarkdownRequest, config?: RequestConfig): Promise<string>;
    /**
     * Get HTML content from URL
     * @param request HTML extraction options
     */
    html(request: HtmlRequest, config?: RequestConfig): Promise<string>;
    /**
     * Execute JavaScript on webpage and return results
     *
     * @param request - JavaScript execution configuration
     * @param request.url - URL to execute scripts on
     * @param request.scripts - Array of JavaScript code to execute
     * @param config - Optional request configuration
     * @returns Promise resolving to CrawlResult with js_execution_result
     *
     * @example
     * ```typescript
     * const result = await client.executeJs({
     *   url: 'https://example.com',
     *   scripts: [
     *     'return document.title;',
     *     'return document.querySelectorAll("a").length;'
     *   ]
     * });
     * console.log(result.js_execution_result);
     * ```
     */
    executeJs(request: ExecuteJsRequest, config?: RequestConfig): Promise<CrawlResult>;
    /**
     * Get Crawl4AI library context for AI assistants
     * @param params Query parameters
     */
    ask(params?: AskRequest, config?: RequestConfig): Promise<AskResponse>;
    /**
     * LLM endpoint - Process a webpage with an LLM query
     *
     * @param url URL to process
     * @param query Query string
     * @returns Promise resolving to the LLM's answer
     *
     * @example
     * ```typescript
     * const answer = await client.llm(
     *   'https://example.com',
     *   'What is the main heading on this page?'
     * );
     * console.log(answer); // "The main heading on this page is..."
     * ```
     */
    llm(url: string, query: string, config?: RequestConfig): Promise<string>;
    /**
     * Get API health status
     */
    health(config?: RequestConfig): Promise<HealthResponse>;
    /**
     * Get Prometheus metrics
     */
    metrics(config?: RequestConfig): Promise<string>;
    /**
     * Get API schema
     */
    schema(config?: RequestConfig): Promise<unknown>;
    /**
     * Get root endpoint information
     */
    getRoot(config?: RequestConfig): Promise<string>;
    /**
     * Test connection to the Crawl4AI API server
     *
     * @param options - Optional configuration
     * @param options.throwOnError - Throw error instead of returning false (default: false)
     * @returns Promise resolving to true if connected, false otherwise
     *
     * @example
     * ```typescript
     * if (await client.testConnection()) {
     *   console.log('Connected to Crawl4AI');
     * }
     * ```
     *
     * @example With error details
     * ```typescript
     * try {
     *   await client.testConnection({ throwOnError: true });
     * } catch (error) {
     *   console.error('Connection failed:', error);
     * }
     * ```
     */
    testConnection(options?: {
        throwOnError?: boolean;
    }): Promise<boolean>;
    /**
     * Get API version
     *
     * @param options - Optional configuration
     * @param options.throwOnError - Throw error instead of returning 'unknown' (default: false)
     * @returns Promise resolving to version string or 'unknown' if unavailable
     *
     * @example
     * ```typescript
     * const version = await client.version();
     * console.log('API version:', version);
     * ```
     */
    version(options?: {
        throwOnError?: boolean;
    }): Promise<string>;
    /**
     * Update API token for authentication
     *
     * @param token - New API token (empty string to remove)
     *
     * @example
     * ```typescript
     * client.setApiToken('new-api-token');
     * ```
     */
    setApiToken(token: string): void;
    /**
     * Update base URL
     */
    setBaseUrl(baseUrl: string): void;
    /**
     * Enable/disable debug mode
     */
    setDebug(debug: boolean): void;
}
/**
 * Default export - Crawl4AI client class (also available as a named export)
 */
export default Crawl4AI;
|