xcrawl-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.editorconfig +12 -0
- package/.env.example +3 -0
- package/.prettierrc +6 -0
- package/README.md +244 -0
- package/claude.md +295 -0
- package/dist/core/crawl.d.ts +246 -0
- package/dist/core/crawl.d.ts.map +1 -0
- package/dist/core/crawl.js +141 -0
- package/dist/core/crawl.js.map +1 -0
- package/dist/core/map.d.ts +34 -0
- package/dist/core/map.d.ts.map +1 -0
- package/dist/core/map.js +50 -0
- package/dist/core/map.js.map +1 -0
- package/dist/core/scrape.d.ts +201 -0
- package/dist/core/scrape.d.ts.map +1 -0
- package/dist/core/scrape.js +148 -0
- package/dist/core/scrape.js.map +1 -0
- package/dist/core/search.d.ts +144 -0
- package/dist/core/search.d.ts.map +1 -0
- package/dist/core/search.js +75 -0
- package/dist/core/search.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +516 -0
- package/dist/index.js.map +1 -0
- package/dist/stdio.d.ts +3 -0
- package/dist/stdio.d.ts.map +1 -0
- package/dist/stdio.js +551 -0
- package/dist/stdio.js.map +1 -0
- package/dist/tools.d.ts +540 -0
- package/dist/tools.d.ts.map +1 -0
- package/dist/tools.js +528 -0
- package/dist/tools.js.map +1 -0
- package/dist/types.d.ts +214 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/package.json +33 -0
- package/src/core/crawl.ts +149 -0
- package/src/core/map.ts +56 -0
- package/src/core/scrape.ts +156 -0
- package/src/core/search.ts +81 -0
- package/src/index.ts +565 -0
- package/src/stdio.ts +584 -0
- package/src/tools.ts +539 -0
- package/src/types.ts +221 -0
- package/tsconfig.build.json +14 -0
- package/tsconfig.json +45 -0
- package/vitest.config.mts +11 -0
- package/worker-configuration.d.ts +10848 -0
- package/wrangler.jsonc +26 -0
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* xCrawl API Request and Response Types
|
|
3
|
+
*/
|
|
4
|
+
/**
 * Request body accepted by the xCrawl scrape endpoint.
 *
 * Only `url` is required; every option group below may be omitted entirely.
 */
export interface XCrawlScrapeRequest {
    /** URL of the page to scrape. */
    url: string;
    /** Execution mode of the scrape task. */
    mode?: "sync" | "async";
    /** Proxy options. */
    proxy?: { location?: string; sticky_session?: string };
    /** Options applied to the page request itself. */
    request?: {
        locale?: string;
        device?: "desktop" | "mobile";
        cookies?: Record<string, unknown>;
        headers?: Record<string, unknown>;
        only_main_content?: boolean;
        block_ads?: boolean;
        skip_tls_verification?: boolean;
    };
    /** JavaScript rendering options. */
    js_render?: {
        enabled?: boolean;
        wait_until?: "load" | "domcontentloaded" | "networkidle";
        viewport?: { width?: number; height?: number };
    };
    /** Selects which representations of the page are produced. */
    output?: {
        formats?: ("html" | "raw_html" | "markdown" | "links" | "summary" | "screenshot" | "json")[];
        screenshot?: "full_page" | "viewport";
        json?: { prompt?: string; json_schema?: Record<string, any> };
    };
    /** Webhook callback settings. */
    webhook?: {
        url?: string;
        headers?: Record<string, string>;
        events?: ("started" | "completed" | "failed")[];
    };
}
|
|
42
|
+
/**
 * Response body returned by the xCrawl scrape endpoints.
 *
 * The identification fields are always present; the payload (`data`), the
 * timestamps and `message` are optional.
 */
export interface XCrawlScrapeResponse {
    /** Identifier of the scrape task. */
    scrape_id: string;
    endpoint: string;
    version: string;
    status: string;
    url?: string;
    /** Scraped content; each entry corresponds to a requestable output format or to usage accounting. */
    data?: {
        html?: string;
        raw_html?: string;
        markdown?: string;
        links?: Array<string>;
        metadata?: Record<string, any>;
        screenshot?: string;
        summary?: string;
        json?: Record<string, any>;
        traffic_bytes?: number;
        credits_used?: number;
        credits_detail?: Record<string, any>;
    };
    started_at?: string;
    ended_at?: string;
    total_credits_used?: number;
    message?: string;
}
|
|
66
|
+
/**
|
|
67
|
+
* Search API Request and Response Types
|
|
68
|
+
*/
|
|
69
|
+
/**
 * Request body for the xCrawl search endpoint.
 *
 * `query` is the only required field.
 */
export interface XCrawlSearchRequest {
    /** Search query text. */
    query: string;
    location?: string;
    language?: string;
    limit?: number;
    /**
     * Low-level search engine options.
     * NOTE(review): the field names look like Google SERP query parameters
     * (see `google_domain`) — confirm against the xCrawl API docs.
     */
    serp_options?: {
        q?: string;
        location?: string;
        uule?: string;
        google_domain?: string;
        gl?: string;
        hl?: string;
        cr?: string;
        lr?: string;
        safe?: number;
        nfpr?: boolean;
        filter?: boolean;
        tbs?: string;
        start?: number;
        num?: number;
        ludocid?: string;
        lsig?: string;
        kgmid?: string;
        si?: string;
        ibp?: string;
        uds?: string;
        no_cache?: boolean;
    };
}
|
|
98
|
+
/**
 * Response body returned by the xCrawl search endpoint.
 */
export interface XCrawlSearchResponse {
    /** Identifier of the search task. */
    search_id: string;
    endpoint: string;
    version: string;
    status: string;
    /** The query this response belongs to. */
    query: string;
    /** Search payload plus usage accounting. */
    data?: {
        results?: Record<string, any>;
        credits_used?: number;
        credits_detail?: Record<string, any>;
    };
    started_at?: string;
    ended_at?: string;
    total_credits_used?: number;
}
|
|
113
|
+
/**
|
|
114
|
+
* Map API Request and Response Types
|
|
115
|
+
*/
|
|
116
|
+
/**
 * Request body for the xCrawl map endpoint.
 *
 * `url` is the only required field.
 */
export interface XCrawlMapRequest {
    /** Website URL to map. */
    url: string;
    /** Pattern used to filter the returned URLs. */
    filter?: string;
    /** Maximum number of URLs to return. */
    limit?: number;
    include_subdomains?: boolean;
    ignore_query_parameters?: boolean;
}
|
|
123
|
+
/**
 * Response body returned by the xCrawl map endpoint.
 */
export interface XCrawlMapResponse {
    /** Identifier of the map task. */
    map_id: string;
    endpoint: string;
    version: string;
    status: string;
    url: string;
    /** Discovered links plus usage accounting. */
    data?: {
        links?: Array<string>;
        total_links?: number;
        credits_used?: number;
        credits_detail?: Record<string, any>;
    };
    started_at?: string;
    ended_at?: string;
    total_credits_used?: number;
}
|
|
139
|
+
/**
|
|
140
|
+
* Crawl API Request and Response Types
|
|
141
|
+
*/
|
|
142
|
+
/**
 * Request body accepted by the xCrawl crawl endpoint.
 *
 * Only `url` is required. Apart from `crawler`, the option groups have the
 * same shape as in the scrape request.
 */
export interface XCrawlCrawlRequest {
    /** Entry URL the crawl starts from. */
    url: string;
    /** Crawl scope and limits. */
    crawler?: {
        limit?: number;
        include?: Array<string>;
        exclude?: Array<string>;
        max_depth?: number;
        include_entire_domain?: boolean;
        include_subdomains?: boolean;
        include_external_links?: boolean;
        sitemaps?: boolean;
    };
    /** Proxy options. */
    proxy?: { location?: string; sticky_session?: string };
    /** Options applied to each page request. */
    request?: {
        locale?: string;
        device?: "desktop" | "mobile";
        cookies?: Record<string, unknown>;
        headers?: Record<string, unknown>;
        only_main_content?: boolean;
        block_ads?: boolean;
        skip_tls_verification?: boolean;
    };
    /** JavaScript rendering options. */
    js_render?: {
        enabled?: boolean;
        wait_until?: "load" | "domcontentloaded" | "networkidle";
        viewport?: { width?: number; height?: number };
    };
    /** Selects which representations of each page are produced. */
    output?: {
        formats?: ("html" | "raw_html" | "markdown" | "links" | "summary" | "screenshot" | "json")[];
        screenshot?: "full_page" | "viewport";
        json?: { prompt?: string; json_schema?: Record<string, any> };
    };
    /** Webhook callback settings. */
    webhook?: {
        url?: string;
        headers?: Record<string, string>;
        events?: ("started" | "completed" | "failed")[];
    };
}
|
|
189
|
+
/**
 * Response body returned by the xCrawl crawl endpoints.
 *
 * Unlike the scrape response, `data` is a list with one entry per page.
 */
export interface XCrawlCrawlResponse {
    /** Identifier of the crawl task. */
    crawl_id: string;
    endpoint: string;
    version: string;
    status: string;
    url?: string;
    /** Per-page results; every entry carries its own `url`. */
    data?: {
        url: string;
        html?: string;
        raw_html?: string;
        markdown?: string;
        links?: Array<string>;
        metadata?: Record<string, any>;
        screenshot?: string;
        summary?: string;
        json?: Record<string, any>;
        traffic_bytes?: number;
        credits_used?: number;
        credits_detail?: Record<string, any>;
    }[];
    started_at?: string;
    ended_at?: string;
    total_credits_used?: number;
    message?: string;
}
|
|
214
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,MAAM,WAAW,mBAAmB;IACnC,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE,MAAM,GAAG,OAAO,CAAC;IACxB,KAAK,CAAC,EAAE;QACP,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,cAAc,CAAC,EAAE,MAAM,CAAC;KACxB,CAAC;IACF,OAAO,CAAC,EAAE;QACT,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAC;QAC9B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAClC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAClC,iBAAiB,CAAC,EAAE,OAAO,CAAC;QAC5B,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,qBAAqB,CAAC,EAAE,OAAO,CAAC;KAChC,CAAC;IACF,SAAS,CAAC,EAAE;QACX,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,UAAU,CAAC,EAAE,MAAM,GAAG,kBAAkB,GAAG,aAAa,CAAC;QACzD,QAAQ,CAAC,EAAE;YACV,KAAK,CAAC,EAAE,MAAM,CAAC;YACf,MAAM,CAAC,EAAE,MAAM,CAAC;SAChB,CAAC;KACF,CAAC;IACF,MAAM,CAAC,EAAE;QACR,OAAO,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,UAAU,GAAG,UAAU,GAAG,OAAO,GAAG,SAAS,GAAG,YAAY,GAAG,MAAM,CAAC,CAAC;QAChG,UAAU,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC;QACtC,IAAI,CAAC,EAAE;YACN,MAAM,CAAC,EAAE,MAAM,CAAC;YAChB,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;SAClC,CAAC;KACF,CAAC;IACF,OAAO,CAAC,EAAE;QACT,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,MAAM,CAAC,EAAE,KAAK,CAAC,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC,CAAC;KACnD,CAAC;CACF;AAED,MAAM,WAAW,oBAAoB;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,IAAI,CAAC,EAAE;QACN,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC3B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;KACrC,CAAC;IACF,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,OAA
O,CAAC,EAAE,MAAM,CAAC;CACjB;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IACnC,KAAK,EAAE,MAAM,CAAC;IACd,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,YAAY,CAAC,EAAE;QACd,CAAC,CAAC,EAAE,MAAM,CAAC;QACX,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,IAAI,CAAC,EAAE,OAAO,CAAC;QACf,MAAM,CAAC,EAAE,OAAO,CAAC;QACjB,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,EAAE,CAAC,EAAE,MAAM,CAAC;QACZ,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,QAAQ,CAAC,EAAE,OAAO,CAAC;KACnB,CAAC;CACF;AAED,MAAM,WAAW,oBAAoB;IACpC,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,CAAC,EAAE;QACN,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9B,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;KACrC,CAAC;IACF,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,gBAAgB;IAChC,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,OAAO,CAAC;IAC7B,uBAAuB,CAAC,EAAE,OAAO,CAAC;CAClC;AAED,MAAM,WAAW,iBAAiB;IACjC,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,CAAC,EAAE;QACN,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;QACjB,WAAW,CAAC,EAAE,MAAM,CAAC;QACrB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;KACrC,CAAC;IACF,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,MAAM,WAAW,kBAAkB;IAClC,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,CAAC,EAAE;QACT,KAAK,CAAC,EAAE,MAA
M,CAAC;QACf,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;QACnB,OAAO,CAAC,EAAE,MAAM,EAAE,CAAC;QACnB,SAAS,CAAC,EAAE,MAAM,CAAC;QACnB,qBAAqB,CAAC,EAAE,OAAO,CAAC;QAChC,kBAAkB,CAAC,EAAE,OAAO,CAAC;QAC7B,sBAAsB,CAAC,EAAE,OAAO,CAAC;QACjC,QAAQ,CAAC,EAAE,OAAO,CAAC;KACnB,CAAC;IACF,KAAK,CAAC,EAAE;QACP,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,cAAc,CAAC,EAAE,MAAM,CAAC;KACxB,CAAC;IACF,OAAO,CAAC,EAAE;QACT,MAAM,CAAC,EAAE,MAAM,CAAC;QAChB,MAAM,CAAC,EAAE,SAAS,GAAG,QAAQ,CAAC;QAC9B,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAClC,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAClC,iBAAiB,CAAC,EAAE,OAAO,CAAC;QAC5B,SAAS,CAAC,EAAE,OAAO,CAAC;QACpB,qBAAqB,CAAC,EAAE,OAAO,CAAC;KAChC,CAAC;IACF,SAAS,CAAC,EAAE;QACX,OAAO,CAAC,EAAE,OAAO,CAAC;QAClB,UAAU,CAAC,EAAE,MAAM,GAAG,kBAAkB,GAAG,aAAa,CAAC;QACzD,QAAQ,CAAC,EAAE;YACV,KAAK,CAAC,EAAE,MAAM,CAAC;YACf,MAAM,CAAC,EAAE,MAAM,CAAC;SAChB,CAAC;KACF,CAAC;IACF,MAAM,CAAC,EAAE;QACR,OAAO,CAAC,EAAE,KAAK,CAAC,MAAM,GAAG,UAAU,GAAG,UAAU,GAAG,OAAO,GAAG,SAAS,GAAG,YAAY,GAAG,MAAM,CAAC,CAAC;QAChG,UAAU,CAAC,EAAE,WAAW,GAAG,UAAU,CAAC;QACtC,IAAI,CAAC,EAAE;YACN,MAAM,CAAC,EAAE,MAAM,CAAC;YAChB,WAAW,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;SAClC,CAAC;KACF,CAAC;IACF,OAAO,CAAC,EAAE;QACT,GAAG,CAAC,EAAE,MAAM,CAAC;QACb,OAAO,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACjC,MAAM,CAAC,EAAE,KAAK,CAAC,SAAS,GAAG,WAAW,GAAG,QAAQ,CAAC,CAAC;KACnD,CAAC;CACF;AAED,MAAM,WAAW,mBAAmB;IACnC,QAAQ,EAAE,MAAM,CAAC;IACjB,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,IAAI,CAAC,EAAE,KAAK,CAAC;QACZ,GAAG,EAAE,MAAM,CAAC;QACZ,IAAI,CAAC,EAAE,MAAM,CAAC;QACd,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;QAClB,KAAK,CAAC,EAAE,MAAM,EAAE,CAAC;QACjB,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC/B,UAAU,CAAC,EAAE,MAAM,CAAC;QACpB,OAAO,CAAC,EAAE,MAAM,CAAC;QACjB,IAAI,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC3B,aAAa,CAAC,EAAE,MAAM,CAAC;QACvB,YAAY,CAAC,EAAE,MAAM,CAAC;QACtB,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;KACrC,CAAC,CAAC;IACH,UAAU,CAAC,
EAAE,MAAM,CAAC;IACpB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;CACjB"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA;;GAEG"}
|
package/package.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "xcrawl-mcp",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"license": "UNLICENSED",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "dist/stdio.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"xcrawl-mcp": "./dist/stdio.js"
|
|
9
|
+
},
|
|
10
|
+
"scripts": {
|
|
11
|
+
"deploy": "wrangler deploy",
|
|
12
|
+
"dev": "wrangler dev",
|
|
13
|
+
"start": "wrangler dev",
|
|
14
|
+
"build": "tsc -p tsconfig.build.json",
|
|
15
|
+
"start:stdio": "node dist/stdio.js",
|
|
16
|
+
"start:stdio:dev": "tsx src/stdio.ts",
|
|
17
|
+
"test": "vitest",
|
|
18
|
+
"cf-typegen": "wrangler types"
|
|
19
|
+
},
|
|
20
|
+
"dependencies": {
|
|
21
|
+
"@modelcontextprotocol/sdk": "^1.0.4",
|
|
22
|
+
"zod": "^3.23.8"
|
|
23
|
+
},
|
|
24
|
+
"devDependencies": {
|
|
25
|
+
"@cloudflare/vitest-pool-workers": "^0.8.19",
|
|
26
|
+
"@cloudflare/workers-types": "^4.20241218.0",
|
|
27
|
+
"@types/node": "^25.0.3",
|
|
28
|
+
"tsx": "^4.19.2",
|
|
29
|
+
"typescript": "^5.5.2",
|
|
30
|
+
"vitest": "~3.2.0",
|
|
31
|
+
"wrangler": "^4.56.0"
|
|
32
|
+
}
|
|
33
|
+
}
|
|
@@ -0,0 +1,149 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { XCrawlCrawlRequest, XCrawlCrawlResponse } from "../types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Zod schema for xcrawl_crawl tool parameters.
 *
 * `url` is required; every option group is optional. When `js_render` is
 * supplied, `wait_until` defaults to "load"; when `output` is supplied,
 * `formats` defaults to ["markdown"].
 *
 * Every field carries a `.describe()` text, worded like the sibling scrape
 * schema, so tool clients get the same guidance for the option groups both
 * tools share. Validation rules are unchanged.
 */
export const crawlToolSchema = z.object({
	url: z.string().url().describe("The entry URL to start crawling from"),
	crawler: z
		.object({
			limit: z.number().int().optional().describe("Maximum number of pages to crawl"),
			include: z.array(z.string()).optional().describe("Regex patterns for URLs to include"),
			exclude: z.array(z.string()).optional().describe("Regex patterns for URLs to exclude"),
			max_depth: z.number().int().optional().describe("Maximum crawl depth"),
			include_entire_domain: z.boolean().optional().describe("Whether to crawl the entire domain"),
			include_subdomains: z.boolean().optional().describe("Whether to include subdomains"),
			include_external_links: z.boolean().optional().describe("Whether to follow external links"),
			sitemaps: z.boolean().optional().describe("Whether to use sitemap"),
		})
		.optional()
		.describe("Crawler configuration"),
	proxy: z
		.object({
			location: z.string().optional().describe("ISO-3166-1 alpha-2 country code (e.g., US, JP, SG)"),
			sticky_session: z.string().optional().describe("Sticky session ID to reuse the same proxy exit"),
		})
		.optional()
		.describe("Proxy configuration"),
	request: z
		.object({
			locale: z.string().optional().describe("Accept-Language header value"),
			device: z.enum(["desktop", "mobile"]).optional().describe("Device type for user agent and viewport"),
			cookies: z.record(z.unknown()).optional().describe("Cookies to send with the request"),
			headers: z.record(z.unknown()).optional().describe("Custom HTTP headers"),
			only_main_content: z.boolean().optional().describe("Only return main content"),
			block_ads: z.boolean().optional().describe("Attempt to block ads"),
			skip_tls_verification: z.boolean().optional().describe("Skip TLS certificate verification"),
		})
		.optional()
		.describe("Request configuration"),
	js_render: z
		.object({
			enabled: z.boolean().optional().describe("Enable JavaScript rendering"),
			wait_until: z
				.enum(["load", "domcontentloaded", "networkidle"])
				.default("load")
				.describe("Wait condition for page load"),
			viewport: z
				.object({
					width: z.number().int().optional().describe("Viewport width"),
					height: z.number().int().optional().describe("Viewport height"),
				})
				.optional()
				.describe("Viewport dimensions"),
		})
		.optional()
		.describe("JavaScript rendering configuration"),
	output: z
		.object({
			formats: z
				.array(z.enum(["html", "raw_html", "markdown", "links", "summary", "screenshot", "json"]))
				.optional()
				.default(["markdown"])
				.describe("Output formats. Default: ['markdown']. Set to [] to get only metadata."),
			screenshot: z
				.enum(["full_page", "viewport"])
				.optional()
				.describe("Screenshot type: 'viewport' or 'full_page'. Only effective when 'screenshot' is in formats array."),
			json: z
				.object({
					prompt: z.string().optional().describe("Natural language description of what data to extract"),
					json_schema: z.record(z.any()).optional().describe("Optional JSON Schema for strict output validation"),
				})
				.optional()
				.describe("JSON extraction configuration"),
		})
		.optional()
		.describe("Output configuration"),
	webhook: z
		.object({
			url: z.string().optional().describe("Webhook callback URL"),
			headers: z.record(z.string()).optional().describe("Custom headers for webhook callback request"),
			events: z
				.array(z.enum(["started", "completed", "failed"]))
				.optional()
				.describe("Events to receive webhook callbacks for"),
		})
		.optional()
		.describe("Webhook configuration"),
});

/** Parsed `xcrawl_crawl` arguments, derived from `crawlToolSchema`. */
export type CrawlToolParams = z.infer<typeof crawlToolSchema>;
|
|
76
|
+
|
|
77
|
+
/**
 * Start a crawl task through the xCrawl Crawl API.
 *
 * POSTs `params` as JSON to `https://run.xcrawl.com/v1/crawl` with a bearer
 * token and aborts the request if it has not settled after 300 seconds.
 *
 * @param apiKey - API key sent as `Authorization: Bearer <apiKey>`.
 * @param params - Request body, serialized with `JSON.stringify`.
 * @returns The parsed JSON response body (cast, not validated at runtime).
 * @throws Error carrying status, status text and body text when the response
 *   is not OK; Error "Request timeout after 300 seconds" when the request is
 *   aborted; any other fetch/parse error is rethrown unchanged.
 */
export async function callXCrawlCrawlAPI(apiKey: string, params: XCrawlCrawlRequest): Promise<XCrawlCrawlResponse> {
	const abortController = new AbortController();
	// Abort timer: 300000 ms = 300 seconds.
	const timer = setTimeout(() => {
		abortController.abort();
	}, 300000);

	try {
		const res = await fetch("https://run.xcrawl.com/v1/crawl", {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
			},
			body: JSON.stringify(params),
			signal: abortController.signal,
		});

		if (res.ok) {
			return (await res.json()) as XCrawlCrawlResponse;
		}

		const details = await res.text();
		throw new Error(`xCrawl Crawl API error: ${res.status} ${res.statusText} - ${details}`);
	} catch (err) {
		// An abort surfaces as an error named "AbortError"; report it as a timeout.
		const timedOut = err instanceof Error && err.name === "AbortError";
		throw timedOut ? new Error("Request timeout after 300 seconds") : err;
	} finally {
		// Always release the timer so it cannot keep the process alive.
		clearTimeout(timer);
	}
}
|
|
110
|
+
|
|
111
|
+
/**
 * Check crawl task status and retrieve results.
 *
 * Issues `GET https://run.xcrawl.com/v1/crawl/<crawlId>` with a bearer token
 * and aborts the request if it has not settled after 300 seconds.
 *
 * `crawlId` is percent-encoded before it is placed in the URL, so an id
 * containing `/`, `?`, `#` or `..` stays a single path segment and cannot
 * redirect the request to another endpoint or add query parameters. Ids made
 * of unreserved characters are sent unchanged.
 *
 * @param apiKey - API key sent as `Authorization: Bearer <apiKey>`.
 * @param crawlId - Identifier of the crawl task to look up.
 * @returns The parsed JSON response body (cast, not validated at runtime).
 * @throws Error carrying status, status text and body text when the response
 *   is not OK; Error "Request timeout after 300 seconds" when the request is
 *   aborted; any other fetch/parse error is rethrown unchanged.
 */
export async function checkCrawlStatus(apiKey: string, crawlId: string): Promise<XCrawlCrawlResponse> {
	const controller = new AbortController();
	const timeoutId = setTimeout(() => controller.abort(), 300000); // 300 seconds timeout

	try {
		const response = await fetch(`https://run.xcrawl.com/v1/crawl/${encodeURIComponent(crawlId)}`, {
			method: "GET",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
			},
			signal: controller.signal,
		});

		if (!response.ok) {
			const errorText = await response.text();
			throw new Error(`xCrawl check crawl status error: ${response.status} ${response.statusText} - ${errorText}`);
		}

		return (await response.json()) as XCrawlCrawlResponse;
	} catch (error) {
		// An abort surfaces as an error named "AbortError"; report it as a timeout.
		if (error instanceof Error && error.name === "AbortError") {
			throw new Error("Request timeout after 300 seconds");
		}
		throw error;
	} finally {
		// Always release the timer so it cannot keep the process alive.
		clearTimeout(timeoutId);
	}
}
|
|
143
|
+
|
|
144
|
+
/**
 * Render a crawl response as text for MCP tool output.
 *
 * @param response - Crawl response to serialize.
 * @returns The response as JSON, pretty-printed with a two-space indent.
 */
export function formatCrawlResponse(response: XCrawlCrawlResponse): string {
	const indentWidth = 2;
	const pretty = JSON.stringify(response, undefined, indentWidth);
	return pretty;
}
|
package/src/core/map.ts
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { XCrawlMapRequest, XCrawlMapResponse } from "../types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Input schema for the `xcrawl_map` tool.
 *
 * `url` is the only required field; `limit`, when given, must be an integer
 * between 1 and 100000 inclusive.
 */
export const mapToolSchema = z.object({
	url: z
		.string()
		.url()
		.describe("The website URL to map"),
	filter: z
		.string()
		.optional()
		.describe("Regex pattern to filter URLs"),
	limit: z
		.number()
		.int()
		.min(1)
		.max(100000)
		.optional()
		.describe("Maximum number of URLs to return (1-100000)"),
	include_subdomains: z
		.boolean()
		.optional()
		.describe("Whether to include URLs from subdomains"),
	ignore_query_parameters: z
		.boolean()
		.optional()
		.describe("Whether to ignore URLs with query parameters"),
});

/** Parsed `xcrawl_map` arguments, derived from `mapToolSchema`. */
export type MapToolParams = z.infer<typeof mapToolSchema>;
|
|
16
|
+
|
|
17
|
+
/**
 * Fetch the URL list of a website through the xCrawl Map API.
 *
 * POSTs `params` as JSON to `https://run.xcrawl.com/v1/map` with a bearer
 * token and aborts the request if it has not settled after 300 seconds.
 *
 * @param apiKey - API key sent as `Authorization: Bearer <apiKey>`.
 * @param params - Request body, serialized with `JSON.stringify`.
 * @returns The parsed JSON response body (cast, not validated at runtime).
 * @throws Error carrying status, status text and body text when the response
 *   is not OK; Error "Request timeout after 300 seconds" when the request is
 *   aborted; any other fetch/parse error is rethrown unchanged.
 */
export async function callXCrawlMapAPI(apiKey: string, params: XCrawlMapRequest): Promise<XCrawlMapResponse> {
	const abortController = new AbortController();
	// Abort timer: 300000 ms = 300 seconds.
	const timer = setTimeout(() => {
		abortController.abort();
	}, 300000);

	try {
		const res = await fetch("https://run.xcrawl.com/v1/map", {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
			},
			body: JSON.stringify(params),
			signal: abortController.signal,
		});

		if (res.ok) {
			return (await res.json()) as XCrawlMapResponse;
		}

		const details = await res.text();
		throw new Error(`xCrawl Map API error: ${res.status} ${res.statusText} - ${details}`);
	} catch (err) {
		// An abort surfaces as an error named "AbortError"; report it as a timeout.
		const timedOut = err instanceof Error && err.name === "AbortError";
		throw timedOut ? new Error("Request timeout after 300 seconds") : err;
	} finally {
		// Always release the timer so it cannot keep the process alive.
		clearTimeout(timer);
	}
}
|
|
50
|
+
|
|
51
|
+
/**
 * Render a map response as text for MCP tool output.
 *
 * @param response - Map response to serialize.
 * @returns The response as JSON, pretty-printed with a two-space indent.
 */
export function formatMapResponse(response: XCrawlMapResponse): string {
	const indentWidth = 2;
	const pretty = JSON.stringify(response, undefined, indentWidth);
	return pretty;
}
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { XCrawlScrapeRequest, XCrawlScrapeResponse } from "../types.js";
|
|
3
|
+
|
|
4
|
+
/** Proxy option group of the scrape tool. */
const scrapeProxyOptions = z.object({
	location: z.string().optional().describe("ISO-3166-1 alpha-2 country code (e.g., US, JP, SG)"),
	sticky_session: z.string().optional().describe("Sticky session ID to reuse the same proxy exit"),
});

/** Page-request option group of the scrape tool. */
const scrapeRequestOptions = z.object({
	locale: z.string().optional().describe("Accept-Language header value"),
	device: z.enum(["desktop", "mobile"]).optional().describe("Device type for user agent and viewport"),
	cookies: z.record(z.string()).optional().describe("Cookies to send with the request"),
	headers: z.record(z.string()).optional().describe("Custom HTTP headers"),
	only_main_content: z.boolean().optional().describe("Only return main content (default: true)"),
	block_ads: z.boolean().optional().describe("Attempt to block ads (default: true)"),
	skip_tls_verification: z.boolean().optional().describe("Skip TLS certificate verification (default: true)"),
});

/** JavaScript-rendering option group of the scrape tool. */
const scrapeJsRenderOptions = z.object({
	enabled: z.boolean().default(true).describe("Enable JavaScript rendering"),
	wait_until: z
		.enum(["load", "domcontentloaded", "networkidle"])
		.default("load")
		.describe("Wait condition for page load"),
	viewport: z
		.object({
			width: z.number().optional().describe("Viewport width"),
			height: z.number().optional().describe("Viewport height"),
		})
		.optional()
		.describe("Viewport dimensions"),
});

/** Output option group of the scrape tool; `formats` falls back to ["markdown"]. */
const scrapeOutputOptions = z.object({
	formats: z
		.array(z.enum(["html", "raw_html", "markdown", "links", "summary", "screenshot", "json"]))
		.optional()
		.default(["markdown"])
		.describe("Output formats: 'html' (cleaned HTML without scripts), 'raw_html' (original HTML with all scripts and styles), 'markdown' (Markdown format), 'links' (all page links), 'summary' (AI-generated summary), 'screenshot' (page screenshot), 'json' (structured data extraction). Set to [] to get only metadata."),
	screenshot: z
		.enum(["full_page", "viewport"])
		.optional()
		.describe("Screenshot type: 'viewport' (default, captures visible area) or 'full_page' (captures entire page). Only effective when 'screenshot' is in formats array."),
	json: z
		.object({
			prompt: z.string().optional().describe("Natural language description of what data to extract. The AI engine automatically structures the output based on this prompt."),
			json_schema: z.record(z.any()).optional().describe("Optional JSON Schema for strict output validation. Rarely needed - the prompt field is usually sufficient."),
		})
		.optional()
		.describe("JSON extraction configuration. Use 'prompt' to describe what to extract - json_schema is optional."),
});

/** Webhook option group of the scrape tool. */
const scrapeWebhookOptions = z.object({
	url: z.string().optional().describe("Webhook callback URL for async task completion"),
	headers: z.record(z.string()).optional().describe("Custom headers for webhook callback request"),
	events: z
		.array(z.enum(["started", "completed", "failed"]))
		.optional()
		.describe("Events to receive webhook callbacks for (default: ['started', 'completed', 'failed'])"),
});

/**
 * Input schema for the `xcrawl_scrape` tool.
 *
 * `url` is required and `mode` defaults to "sync"; every option group is
 * optional and is assembled from the sub-schemas declared above.
 */
export const scrapeToolSchema = z.object({
	url: z.string().url().describe("The URL to scrape"),
	mode: z
		.enum(["sync", "async"])
		.default("sync")
		.describe("Sync mode returns results immediately, async mode returns a scrape_id for later retrieval"),
	proxy: scrapeProxyOptions.optional().describe("Proxy configuration"),
	request: scrapeRequestOptions.optional().describe("Request configuration"),
	js_render: scrapeJsRenderOptions.optional().describe("JavaScript rendering configuration"),
	output: scrapeOutputOptions.optional().describe("Output configuration"),
	webhook: scrapeWebhookOptions.optional().describe("Webhook configuration for async mode callbacks"),
});

/** Parsed `xcrawl_scrape` arguments, derived from `scrapeToolSchema`. */
export type ScrapeToolParams = z.infer<typeof scrapeToolSchema>;
|
|
84
|
+
|
|
85
|
+
/**
 * Scrape a URL through the xCrawl scrape API.
 *
 * POSTs `params` as JSON to `https://run.xcrawl.com/v1/scrape` with a bearer
 * token and aborts the request if it has not settled after 300 seconds.
 *
 * @param apiKey - API key sent as `Authorization: Bearer <apiKey>`.
 * @param params - Request body, serialized with `JSON.stringify`.
 * @returns The parsed JSON response body (cast, not validated at runtime).
 * @throws Error carrying status, status text and body text when the response
 *   is not OK; Error "Request timeout after 300 seconds" when the request is
 *   aborted; any other fetch/parse error is rethrown unchanged.
 */
export async function callXCrawlAPI(apiKey: string, params: XCrawlScrapeRequest): Promise<XCrawlScrapeResponse> {
	const abortController = new AbortController();
	// Abort timer: 300000 ms = 300 seconds.
	const timer = setTimeout(() => {
		abortController.abort();
	}, 300000);

	try {
		const res = await fetch("https://run.xcrawl.com/v1/scrape", {
			method: "POST",
			headers: {
				"Content-Type": "application/json",
				Authorization: `Bearer ${apiKey}`,
			},
			body: JSON.stringify(params),
			signal: abortController.signal,
		});

		if (res.ok) {
			return (await res.json()) as XCrawlScrapeResponse;
		}

		const details = await res.text();
		throw new Error(`xCrawl API error: ${res.status} ${res.statusText} - ${details}`);
	} catch (err) {
		// An abort surfaces as an error named "AbortError"; report it as a timeout.
		const timedOut = err instanceof Error && err.name === "AbortError";
		throw timedOut ? new Error("Request timeout after 300 seconds") : err;
	} finally {
		// Always release the timer so it cannot keep the process alive.
		clearTimeout(timer);
	}
}
|
|
118
|
+
|
|
119
|
+
/**
 * Render a scrape response as text for MCP tool output.
 *
 * @param response - Scrape response to serialize.
 * @returns The response as JSON, pretty-printed with a two-space indent.
 */
export function formatScrapeResponse(response: XCrawlScrapeResponse): string {
	const indentWidth = 2;
	const pretty = JSON.stringify(response, undefined, indentWidth);
	return pretty;
}
|
|
125
|
+
|
|
126
|
+
/**
 * Check status and get result of an async scrape task.
 *
 * Issues `GET https://run.xcrawl.com/v1/scrape/<scrapeId>` with a bearer
 * token and aborts the request if it has not settled after 300 seconds.
 *
 * `scrapeId` is percent-encoded before it is placed in the URL, so an id
 * containing `/`, `?`, `#` or `..` stays a single path segment and cannot
 * redirect the request to another endpoint or add query parameters. Ids made
 * of unreserved characters are sent unchanged.
 *
 * @param apiKey - API key sent as `Authorization: Bearer <apiKey>`.
 * @param scrapeId - Identifier of the scrape task to look up.
 * @returns The parsed JSON response body (cast, not validated at runtime).
 * @throws Error carrying status, status text and body text when the response
 *   is not OK; Error "Request timeout after 300 seconds" when the request is
 *   aborted; any other fetch/parse error is rethrown unchanged.
 */
export async function checkScrapeStatus(apiKey: string, scrapeId: string): Promise<XCrawlScrapeResponse> {
	const controller = new AbortController();
	const timeoutId = setTimeout(() => controller.abort(), 300000); // 300 seconds timeout

	try {
		const response = await fetch(`https://run.xcrawl.com/v1/scrape/${encodeURIComponent(scrapeId)}`, {
			method: "GET",
			headers: {
				Authorization: `Bearer ${apiKey}`,
			},
			signal: controller.signal,
		});

		if (!response.ok) {
			const errorText = await response.text();
			throw new Error(`xCrawl API error: ${response.status} ${response.statusText} - ${errorText}`);
		}

		return (await response.json()) as XCrawlScrapeResponse;
	} catch (error) {
		// An abort surfaces as an error named "AbortError"; report it as a timeout.
		if (error instanceof Error && error.name === "AbortError") {
			throw new Error("Request timeout after 300 seconds");
		}
		throw error;
	} finally {
		// Always release the timer so it cannot keep the process alive.
		clearTimeout(timeoutId);
	}
}
|