crawl4ai 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +429 -0
- package/dist/errors.d.ts +96 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.js +483 -0
- package/dist/sdk.d.ts +238 -0
- package/dist/types.d.ts +285 -0
- package/package.json +67 -0
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,285 @@
|
|
|
1
|
+
/**
 * Crawl4AI TypeScript SDK - Type Definitions
 * Based on actual API endpoints and Swagger documentation
 */
|
|
5
|
+
/** Upper-case HTTP verb names as a string-literal union. */
export type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'PATCH';

/** Cache mode names; this is the type of `CrawlerRunConfig.cache_mode`. */
export type CacheMode = 'enabled' | 'disabled' | 'bypass' | 'read_only' | 'write_only';

/** Browser engine names; this is the type of `BrowserConfig.browser_type`. */
export type BrowserType = 'chromium' | 'firefox' | 'webkit';

/** Content filter names; this is the type of `MarkdownRequest.f`. */
export type ContentFilter = 'raw' | 'fit' | 'bm25' | 'llm';

/** Context kinds; used by `AskRequest.context_type` and `AskResponse.type`. */
export type ContextType = 'code' | 'doc' | 'all';
|
|
10
|
+
/**
 * Width/height pair; this is the type of `BrowserConfig.viewport`.
 *
 * NOTE(review): units are not stated in this file — presumably CSS pixels; confirm.
 */
export interface Viewport {
  width: number;
  height: number;
}
|
|
14
|
+
/**
 * Browser-level options; this is the type of `CrawlRequest.browser_config`.
 *
 * Every field is optional. Field names are snake_case.
 * NOTE(review): presumably they mirror the server's JSON payload keys — confirm.
 */
export interface BrowserConfig {
  headless?: boolean;
  browser_type?: BrowserType;
  user_agent?: string;
  proxy?: string;
  // NOTE(review): time unit is not stated here — confirm.
  page_timeout?: number;
  verbose?: boolean;
  simulate_user?: boolean;
  magic?: boolean;
  override_navigator?: boolean;
  user_data_dir?: string;
  use_managed_browser?: boolean;
  viewport?: Viewport;
  headers?: Record<string, string>;
  cookies?: Cookie[];
  extra_args?: string[];
  ignore_https_errors?: boolean;
  java_script_enabled?: boolean;
  accept_downloads?: boolean;
  downloads_path?: string;
}
|
|
35
|
+
/**
 * Cookie record; used by `BrowserConfig.cookies` and `CrawlResult.cookies`.
 *
 * Only `name` and `value` are required.
 */
export interface Cookie {
  name: string;
  value: string;
  domain?: string;
  path?: string;
  // NOTE(review): presumably an epoch timestamp; unit (s vs ms) is not stated — confirm.
  expires?: number;
  httpOnly?: boolean;
  secure?: boolean;
  sameSite?: 'Strict' | 'Lax' | 'None';
}
|
|
45
|
+
/** `params` payload of the `'json_css'` variant of `ExtractionStrategy`. */
export interface JsonCssExtractionParams {
  schema: CssExtractionSchema;
}
|
|
48
|
+
/**
 * Schema carried by `JsonCssExtractionParams.schema`: a base selector plus a
 * list of field descriptors.
 */
export interface CssExtractionSchema {
  name?: string;
  // NOTE(review): camelCase here, unlike the snake_case request fields elsewhere
  // in this file — confirm this matches what the server expects.
  baseSelector: string;
  fields: CssExtractionField[];
}
|
|
53
|
+
/** One entry of `CssExtractionSchema.fields`. */
export interface CssExtractionField {
  name: string;
  selector: string;
  type: 'text' | 'html' | 'attribute' | 'href' | 'src';
  // NOTE(review): presumably only meaningful when `type` is 'attribute'; the
  // type system does not enforce that — confirm.
  attribute?: string;
  multiple?: boolean;
  transform?: string;
}
|
|
61
|
+
/**
 * `params` payload of the `'llm'` variant of `ExtractionStrategy`.
 *
 * Only `provider` is required.
 */
export interface LlmExtractionParams {
  provider: string;
  // NOTE(review): both `api_token` and `api_key` are declared; which one the
  // server reads, or whether they are aliases, is not visible here — confirm.
  api_token?: string;
  api_key?: string;
  schema?: Record<string, unknown>;
  extraction_type?: 'schema' | 'block' | 'markdown';
  instruction?: string;
  model?: string;
  base_url?: string;
  extra_headers?: Record<string, string>;
  extra_body?: Record<string, unknown>;
}
|
|
73
|
+
/**
 * `params` payload of the `'cosine'` variant of `ExtractionStrategy`.
 *
 * Every field is optional.
 */
export interface CosineExtractionParams {
  semantic_filter?: string;
  word_count_threshold?: number;
  max_dist?: number;
  top_k?: number;
  model_name?: string;
}
|
|
80
|
+
/**
 * Discriminated union keyed on `type`; each variant pairs a literal tag with
 * its matching `params` shape. This is the type of
 * `CrawlerRunConfig.extraction_strategy`.
 */
export type ExtractionStrategy =
  | {
      type: 'json_css';
      params: JsonCssExtractionParams;
    }
  | {
      type: 'llm';
      params: LlmExtractionParams;
    }
  | {
      type: 'cosine';
      params: CosineExtractionParams;
    };
|
|
90
|
+
/**
 * Per-crawl options; this is the type of `CrawlRequest.crawler_config`.
 *
 * Every field is optional.
 */
export interface CrawlerRunConfig {
  word_count_threshold?: number;
  extraction_strategy?: ExtractionStrategy;
  chunking_strategy?: ChunkingStrategy;
  css_selector?: string;
  screenshot?: boolean;
  pdf?: boolean;
  // NOTE(review): `cache_mode` coexists with the four boolean cache flags
  // below; how they interact or which wins is not visible here — confirm.
  cache_mode?: CacheMode;
  bypass_cache?: boolean;
  disable_cache?: boolean;
  no_cache_read?: boolean;
  no_cache_write?: boolean;
  log_console?: boolean;
  stream?: boolean;
  warmup?: boolean;
  js_code?: string[];
  js_only?: boolean;
  wait_for?: string;
  // NOTE(review): time units for the timeout/delay fields are not stated — confirm.
  page_timeout?: number;
  delay_before_return_html?: number;
  remove_overlay_elements?: boolean;
  mean_delay?: number;
  max_range?: number;
  semaphore_count?: number;
  base_delay?: number;
  random_delay?: number;
  ignore_robots_txt?: boolean;
  anti_bot?: boolean;
  light_mode?: boolean;
  /** Open-ended bag for keys not modelled by the named fields above. */
  extra?: Record<string, unknown>;
}
|
|
121
|
+
/**
 * Chunking strategy selector; this is the type of
 * `CrawlerRunConfig.chunking_strategy`.
 *
 * Unlike `ExtractionStrategy`, `params` is untyped per variant.
 */
export interface ChunkingStrategy {
  type: 'regex' | 'nltk' | 'recursive_url_based_chunking';
  params?: Record<string, unknown>;
}
|
|
125
|
+
/**
 * Crawl request shape.
 *
 * `urls` is the only required field and accepts either one URL or a list.
 */
export interface CrawlRequest {
  urls: string | string[];
  browser_config?: BrowserConfig;
  crawler_config?: CrawlerRunConfig;
  session_id?: string;
  priority?: number;
  // NOTE(review): unit of `ttl` is not stated here — confirm.
  ttl?: number;
  /** Open-ended bag for keys not modelled by the named fields above. */
  extra?: Record<string, unknown>;
}
|
|
134
|
+
/**
 * Markdown request shape; `url` is the only required field.
 *
 * NOTE(review): the single-letter field names are kept exactly as declared.
 * `f` is typed as `ContentFilter`; the meaning of `q` and `c` is not visible
 * in this file — confirm against the API docs.
 */
export interface MarkdownRequest {
  url: string;
  f?: ContentFilter;
  q?: string;
  c?: string;
}
|
|
140
|
+
/** HTML request shape; carries a single required `url`. */
export interface HtmlRequest {
  url: string;
}
|
|
143
|
+
/** Screenshot request shape; `url` is the only required field. */
export interface ScreenshotRequest {
  url: string;
  // NOTE(review): numeric wait; unit is not stated here — confirm.
  screenshot_wait_for?: number;
  // NOTE(review): whether this path is resolved client-side or server-side is
  // not visible here — confirm.
  output_path?: string;
}
|
|
148
|
+
/** PDF request shape; `url` is required, `output_path` is optional. */
export interface PdfRequest {
  url: string;
  output_path?: string;
}
|
|
152
|
+
/** Execute-JS request shape: a target `url` plus a list of script strings. */
export interface ExecuteJsRequest {
  url: string;
  scripts: string[];
}
|
|
156
|
+
/** Token request shape; carries a single required `email`. */
export interface TokenRequest {
  email: string;
}
|
|
159
|
+
/**
 * Config-dump request shape; carries a single required `code` string.
 *
 * NOTE(review): what `code` must contain is not visible in this file — confirm.
 */
export interface ConfigDumpRequest {
  code: string;
}
|
|
162
|
+
/** Ask request shape; every field is optional. */
export interface AskRequest {
  context_type?: ContextType;
  query?: string;
  // NOTE(review): valid range of `score_ratio` is not stated here — confirm.
  score_ratio?: number;
  max_results?: number;
}
|
|
168
|
+
/**
 * Crawl result shape.
 *
 * `url`, `html` and `success` are always present; every other field is
 * optional.
 */
export interface CrawlResult {
  url: string;
  html: string;
  success: boolean;
  cleaned_html?: string;
  media?: MediaInfo;
  links?: LinksInfo;
  downloaded_files?: string[];
  js_execution_result?: Record<string, unknown>;
  // NOTE(review): `screenshot`, `pdf` and `mhtml` are typed as plain strings;
  // their encoding (e.g. base64) is not stated here — confirm.
  screenshot?: string;
  pdf?: string;
  mhtml?: string;
  markdown?: string;
  fit_markdown?: string;
  raw_markdown?: string;
  markdown_with_citations?: string;
  references_markdown?: string;
  fit_html?: string;
  // Typed as a string, not a parsed object; NOTE(review): presumably
  // serialized JSON — confirm before parsing.
  extracted_content?: string;
  metadata?: PageMetadata;
  error_message?: string;
  session_id?: string;
  response_headers?: Record<string, string>;
  status_code?: number;
  ssl_certificate?: SSLCertificate;
  dispatch_result?: DispatchResult;
  redirected_url?: string;
  network_requests?: Array<Record<string, unknown>>;
  console_messages?: Array<Record<string, unknown>>;
  crawl_depth?: number;
  text?: string;
  cookies?: Cookie[];
}
|
|
201
|
+
/**
 * Media grouped into three required lists; this is the type of
 * `CrawlResult.media`.
 */
export interface MediaInfo {
  images: MediaItem[];
  videos: MediaItem[];
  audios: MediaItem[];
}
|
|
206
|
+
/** One entry of the `MediaInfo` lists; only `src` is required. */
export interface MediaItem {
  src: string;
  alt?: string;
  desc?: string;
  // NOTE(review): scale and meaning of `score` are not stated here — confirm.
  score?: number;
  type?: string;
}
|
|
213
|
+
/**
 * Links split into `internal` and `external` lists; this is the type of
 * `CrawlResult.links`.
 */
export interface LinksInfo {
  internal: LinkItem[];
  external: LinkItem[];
}
|
|
217
|
+
/** One entry of the `LinksInfo` lists; only `href` is required. */
export interface LinkItem {
  href: string;
  text?: string;
  title?: string;
}
|
|
222
|
+
/**
 * Page metadata; this is the type of `CrawlResult.metadata`.
 *
 * Every field is optional.
 */
export interface PageMetadata {
  title?: string;
  description?: string;
  // Typed as one string, not a list.
  keywords?: string;
  author?: string;
  language?: string;
  canonical_url?: string;
  open_graph?: Record<string, string>;
  twitter_card?: Record<string, string>;
}
|
|
232
|
+
/**
 * Certificate summary; this is the type of `CrawlResult.ssl_certificate`.
 *
 * Every field is an optional string.
 */
export interface SSLCertificate {
  issuer?: string;
  subject?: string;
  // NOTE(review): the validity bounds are strings; their date format is not
  // stated here — confirm before parsing.
  valid_from?: string;
  valid_to?: string;
  fingerprint?: string;
}
|
|
239
|
+
/**
 * Dispatch outcome; this is the type of `CrawlResult.dispatch_result`.
 *
 * `data` is `unknown`, so callers must narrow it before use.
 */
export interface DispatchResult {
  status?: string;
  message?: string;
  data?: unknown;
}
|
|
244
|
+
/** Health response shape; all three fields are required. */
export interface HealthResponse {
  status: string;
  // NOTE(review): numeric timestamp; unit (s vs ms) is not stated here — confirm.
  timestamp: number;
  version: string;
}
|
|
249
|
+
/** Token response shape; carries a single required `token` string. */
export interface TokenResponse {
  token: string;
}
|
|
252
|
+
/**
 * Ask response shape.
 *
 * `context`, `type` and `results_count` are required; `query` is optional.
 */
export interface AskResponse {
  context: string;
  type: ContextType;
  query?: string;
  results_count: number;
}
|
|
258
|
+
/**
 * Validation error body: a `detail` list whose entries each carry a location
 * path (`loc`), a message (`msg`) and an error `type`.
 *
 * This is one of the two shapes allowed for `ApiError.data`.
 */
export interface ValidationError {
  detail: Array<{
    // Path segments may be strings or numbers.
    loc: Array<string | number>;
    msg: string;
    type: string;
  }>;
}
|
|
265
|
+
/**
 * Extends the built-in `Error` with optional `status`, `statusText` and
 * `data` fields.
 *
 * `data` is either a `ValidationError` or an arbitrary object, so callers
 * must narrow it (e.g. by checking for `detail`) before reading fields.
 */
export interface ApiError extends Error {
  status?: number;
  statusText?: string;
  data?: ValidationError | Record<string, unknown>;
}
|
|
270
|
+
/**
 * Client configuration shape; `baseUrl` is the only required field.
 *
 * Unlike the request/response shapes in this file, these fields are camelCase.
 */
export interface Crawl4AIConfig {
  baseUrl: string;
  apiToken?: string;
  // NOTE(review): units of `timeout` and `retryDelay` are not stated here —
  // presumably milliseconds; confirm against the SDK implementation.
  timeout?: number;
  retries?: number;
  retryDelay?: number;
  defaultHeaders?: Record<string, string>;
  throwOnError?: boolean;
  /** Predicate over an HTTP status code. */
  validateStatus?: (status: number) => boolean;
  debug?: boolean;
}
|
|
281
|
+
/**
 * Request options: `timeout`, an `AbortSignal`, and `headers`. Every field is
 * optional.
 *
 * NOTE(review): how these combine with `Crawl4AIConfig.timeout` and
 * `Crawl4AIConfig.defaultHeaders` is not visible in this file — confirm.
 */
export type RequestConfig = {
  timeout?: number;
  signal?: AbortSignal;
  headers?: Record<string, string>;
};
|
package/package.json
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
  "name": "crawl4ai",
  "version": "1.0.0",
  "description": "TypeScript SDK for Crawl4AI REST API - Bun & Node.js compatible",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
  "type": "module",
  "scripts": {
    "build": "bun build ./src/index.ts --outdir ./dist --target node",
    "build:types": "tsc --declaration --emitDeclarationOnly --outDir dist",
    "test": "bun test",
    "lint": "biome check src --assist-enabled=true",
    "fix": "biome check --write src --assist-enabled=true",
    "format": "biome format --write src",
    "prepare": "bun run build && bun run build:types",
    "prepublishOnly": "bun run lint && bun run test && bun run build"
  },
  "keywords": [
    "crawl4ai",
    "web-scraping",
    "web-crawler",
    "typescript",
    "bun",
    "nodejs",
    "api-client",
    "llm",
    "ai",
    "data-extraction",
    "web-automation"
  ],
  "author": "pyronaur",
  "license": "MIT",
  "files": [
    "dist/**/*",
    "README.md",
    "LICENSE"
  ],
  "engines": {
    "node": ">=18.0.0"
  },
  "devDependencies": {
    "@types/node": "^20.0.0",
    "typescript": "^5.0.0",
    "@types/bun": "latest",
    "@biomejs/biome": "latest"
  },
  "peerDependencies": {
    "typescript": ">=4.5.0"
  },
  "peerDependenciesMeta": {
    "typescript": {
      "optional": true
    }
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/pyronaur/crawl4ai.git"
  },
  "bugs": {
    "url": "https://github.com/pyronaur/crawl4ai/issues"
  },
  "homepage": "https://github.com/pyronaur/crawl4ai#readme",
  "publishConfig": {
    "access": "public",
    "registry": "https://registry.npmjs.org/"
  }
}
|