@selextract/mcp-selextract 0.5.0 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -0
- package/dist/http.js +10 -7
- package/dist/tools.js +196 -19
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -64,6 +64,27 @@ Useful extras:
|
|
|
64
64
|
- Authenticated scraping: access profile tools (create/list/update/delete/build-session)
|
|
65
65
|
- Run lifecycle: `run_get`, `run_list`, `run_stop`, `run_delete`
|
|
66
66
|
|
|
67
|
+
## Locale / region controls
|
|
68
|
+
|
|
69
|
+
You can control language/region in a general way (works across many sites):
|
|
70
|
+
|
|
71
|
+
- `task_create`
|
|
72
|
+
- `url_params`: adds or overrides query params on the URL (example: `{"hl":"en","gl":"US"}`)
|
|
73
|
+
- `options.acceptLanguage`: sets the `Accept-Language` request header
|
|
74
|
+
- `options.locale`: sets Playwright locale (example: `en-US`)
|
|
75
|
+
- `options.timezoneId`: sets Playwright timezoneId (example: `America/Los_Angeles`)
|
|
76
|
+
- `run_create`
|
|
77
|
+
- `options`: same as above (applies to the run)
|
|
78
|
+
- `budgets.maxRows`: stop after N rows (example: `100`)
|
|
79
|
+
|
|
80
|
+
## Field types
|
|
81
|
+
|
|
82
|
+
For `task_create.fields`, `type` can be one of:
|
|
83
|
+
|
|
84
|
+
- `text`, `number`, `money`, `url`, `image_url`, `html`, `unknown`
|
|
85
|
+
|
|
86
|
+
Common aliases are accepted and mapped to the canonical types above, for example `string` → `text`, `int` → `number`, `price` → `money`, and `link` → `url`.
|
|
87
|
+
|
|
67
88
|
## Resources (read-only)
|
|
68
89
|
|
|
69
90
|
- `selextract://help` (usage guide)
|
package/dist/http.js
CHANGED
|
@@ -17,15 +17,18 @@ export class SelextractApiClient {
|
|
|
17
17
|
const controller = new AbortController();
|
|
18
18
|
const timeout = setTimeout(() => controller.abort(), this.timeoutMs);
|
|
19
19
|
try {
|
|
20
|
+
const headers = {
|
|
21
|
+
accept: 'application/json',
|
|
22
|
+
// Worker accepts Bearer JWT or `sk_...` api keys; it treats `sk_...` as an API key.
|
|
23
|
+
authorization: `Bearer ${this.apiKey}`,
|
|
24
|
+
};
|
|
25
|
+
const hasBody = opts.body !== undefined;
|
|
26
|
+
if (hasBody)
|
|
27
|
+
headers['content-type'] = 'application/json';
|
|
20
28
|
const resp = await this.fetchImpl(url.toString(), {
|
|
21
29
|
method: opts.method,
|
|
22
|
-
headers
|
|
23
|
-
|
|
24
|
-
'content-type': 'application/json',
|
|
25
|
-
// Worker accepts Bearer JWT or `sk_...` api keys; it treats `sk_...` as an API key.
|
|
26
|
-
authorization: `Bearer ${this.apiKey}`,
|
|
27
|
-
},
|
|
28
|
-
body: opts.body === undefined ? undefined : JSON.stringify(opts.body),
|
|
30
|
+
headers,
|
|
31
|
+
body: hasBody ? JSON.stringify(opts.body) : undefined,
|
|
29
32
|
signal: controller.signal,
|
|
30
33
|
});
|
|
31
34
|
const contentType = resp.headers.get('content-type') ?? '';
|
package/dist/tools.js
CHANGED
|
@@ -11,37 +11,157 @@ function asTextResult(value, maxChars) {
|
|
|
11
11
|
],
|
|
12
12
|
};
|
|
13
13
|
}
|
|
14
|
+
// Canonical field types for extraction fields. coerceAgentFieldType() resolves
// caller-supplied aliases to one of these names.
const AgentFieldTypeSchema = z.enum(['text', 'number', 'money', 'url', 'image_url', 'html', 'unknown']);
|
|
15
|
+
// Lower-cased alias -> canonical field type. A Map is used instead of an
// object literal so that inputs such as "constructor" or "__proto__" cannot
// resolve to members inherited from Object.prototype. Canonical names are not
// listed here because the schema check in coerceAgentFieldType() handles them.
const FIELD_TYPE_ALIASES = new Map([
    ['string', 'text'],
    ['str', 'text'],
    ['int', 'number'],
    ['integer', 'number'],
    ['float', 'number'],
    ['double', 'number'],
    ['decimal', 'number'],
    ['currency', 'money'],
    ['price', 'money'],
    ['link', 'url'],
    ['href', 'url'],
    ['uri', 'url'],
    ['image', 'image_url'],
    ['img', 'image_url'],
    ['image-url', 'image_url'],
    ['imageurl', 'image_url'],
    ['markup', 'html'],
    ['any', 'unknown'],
    ['json', 'unknown'],
]);

/**
 * Normalizes a caller-supplied field type to a canonical type name.
 *
 * The input is trimmed and lower-cased, then matched first against the
 * canonical names in AgentFieldTypeSchema and then against the alias table.
 *
 * @param {unknown} raw - The `type` value as supplied by the caller.
 * @returns {string | undefined} The canonical type, or `undefined` when `raw`
 *   is not a string, is blank, or is not a recognized type or alias.
 */
function coerceAgentFieldType(raw) {
    // Covers null/undefined as well as every other non-string value.
    if (typeof raw !== 'string') {
        return undefined;
    }
    const cleaned = raw.trim().toLowerCase();
    if (!cleaned) {
        return undefined;
    }
    const direct = AgentFieldTypeSchema.safeParse(cleaned);
    if (direct.success) {
        return direct.data;
    }
    return FIELD_TYPE_ALIASES.get(cleaned);
}
|
|
14
55
|
/**
 * Schema for one entry of `task_create.fields`.
 *
 * Before validation, `description` is accepted as a fallback for `value`, and
 * `type` is normalized through coerceAgentFieldType() (the raw type is kept
 * when it cannot be normalized, so validation reports it). After validation,
 * at least one of `name` / `value` must contain non-whitespace text.
 */
const FieldSchema = z
    .preprocess(
        (candidate) => {
            if (candidate === null || typeof candidate !== 'object') {
                return candidate;
            }
            const fallbackValue = candidate.value ?? candidate.description;
            const normalizedType = coerceAgentFieldType(candidate.type) ?? candidate.type;
            return { ...candidate, value: fallbackValue, type: normalizedType };
        },
        z.object({
            name: z.string().min(1).optional(),
            value: z.string().min(1).optional(),
            type: AgentFieldTypeSchema.optional(),
            required: z.boolean().optional(),
            examples: z.array(z.string()).optional(),
        }),
    )
    .refine(
        (field) => {
            const trimmedName = field.name?.trim();
            const trimmedValue = field.value?.trim();
            return Boolean(trimmedName || trimmedValue);
        },
        {
            message: 'Provide a field name (name) or a short description (value).',
            path: ['value'],
        },
    );
|
|
76
|
+
// Query-parameter map for `url_params`: each value may be a string, number,
// boolean, or null.
const UrlParamsSchema = z.record(z.union([z.string(), z.number(), z.boolean(), z.null()]));
|
|
77
|
+
/**
 * Copies snake_case spellings of the task-build options onto their camelCase
 * keys (the camelCase value wins when both are present). Non-object input is
 * returned unchanged and left for the schema to validate.
 */
function normalizeTaskBuildOptionKeys(candidate) {
    if (candidate === null || typeof candidate !== 'object') {
        return candidate;
    }
    return {
        ...candidate,
        userAgent: candidate.userAgent ?? candidate.user_agent,
        acceptLanguage: candidate.acceptLanguage ?? candidate.accept_language,
        timezoneId: candidate.timezoneId ?? candidate.timezone_id,
        pageTimeoutMs: candidate.pageTimeoutMs ?? candidate.page_timeout_ms,
    };
}

// Known browsing controls for `task_create.options`; all are optional.
const taskBuildOptionShape = {
    userAgent: z.string().min(1).max(500).optional(),
    acceptLanguage: z.string().min(1).max(500).optional(),
    locale: z.string().min(1).max(64).optional(),
    timezoneId: z.string().min(1).max(64).optional(),
    // Milliseconds, limited to 1 second .. 2 minutes.
    pageTimeoutMs: z.number().int().min(1000).max(120000).optional(),
};

// `.passthrough()` keeps unrecognized keys in the parsed output.
const TaskBuildOptionsSchema = z.preprocess(
    normalizeTaskBuildOptionKeys,
    z.object(taskBuildOptionShape).passthrough(),
);
|
|
97
|
+
/**
 * Copies alternate spellings of the run options onto their camelCase keys
 * (the camelCase value wins when both are present). Non-object input is
 * returned unchanged and left for the schema to validate.
 */
function normalizeRunOptionKeys(candidate) {
    if (candidate === null || typeof candidate !== 'object') {
        return candidate;
    }
    return {
        ...candidate,
        userAgent: candidate.userAgent ?? candidate.user_agent,
        acceptLanguage: candidate.acceptLanguage ?? candidate.accept_language,
        timezoneId: candidate.timezoneId ?? candidate.timezone_id,
        waitForSelector: candidate.waitForSelector ?? candidate.wait_for_selector ?? candidate.wait_for,
        delay: candidate.delay ?? candidate.delay_ms,
    };
}

// Known browsing controls for `run_create.options`; all are optional.
const runOptionShape = {
    // Milliseconds, limited to 1 second .. 2 minutes.
    timeout: z.number().int().min(1000).max(120000).optional(),
    userAgent: z.string().min(1).max(500).optional(),
    acceptLanguage: z.string().min(1).max(500).optional(),
    locale: z.string().min(1).max(64).optional(),
    timezoneId: z.string().min(1).max(64).optional(),
    waitForSelector: z.string().min(1).max(500).optional(),
    // Milliseconds; zero is allowed.
    delay: z.number().int().min(0).max(60000).optional(),
    // Zero is allowed.
    retries: z.number().int().min(0).max(10).optional(),
};

// `.passthrough()` keeps unrecognized keys in the parsed output.
const RunOptionsSchema = z.preprocess(
    normalizeRunOptionKeys,
    z.object(runOptionShape).passthrough(),
);
|
|
121
|
+
// Per-run limits accepted in `run_create.budgets`. Every limit is an optional
// integer within the bounds below.
const runBudgetShape = {
    maxPages: z.number().int().min(1).max(1000).optional(),
    maxScrolls: z.number().int().min(1).max(500).optional(),
    // Milliseconds, limited to 1 second .. 1 hour.
    maxTimeMs: z.number().int().min(1000).max(3600000).optional(),
    maxRows: z.number().int().min(1).max(200000).optional(),
    maxSteps: z.number().int().min(1).max(500).optional(),
    maxRowBytes: z.number().int().min(100).max(1000000).optional(),
};

// `.passthrough()` keeps unrecognized keys in the parsed output.
const RunBudgetsSchema = z.object(runBudgetShape).passthrough();
|
|
26
131
|
const TaskCreateInputSchema = z.preprocess((raw) => {
|
|
27
132
|
if (!raw || typeof raw !== 'object')
|
|
28
133
|
return raw;
|
|
29
134
|
const value = raw;
|
|
135
|
+
const options = value.options && typeof value.options === 'object' ? { ...value.options } : {};
|
|
136
|
+
if (value.userAgent ?? value.user_agent)
|
|
137
|
+
options.userAgent = value.userAgent ?? value.user_agent;
|
|
138
|
+
if (value.acceptLanguage ?? value.accept_language)
|
|
139
|
+
options.acceptLanguage = value.acceptLanguage ?? value.accept_language;
|
|
140
|
+
if (value.locale)
|
|
141
|
+
options.locale = value.locale;
|
|
142
|
+
if (value.timezoneId ?? value.timezone_id)
|
|
143
|
+
options.timezoneId = value.timezoneId ?? value.timezone_id;
|
|
144
|
+
if (value.pageTimeoutMs ?? value.page_timeout_ms)
|
|
145
|
+
options.pageTimeoutMs = value.pageTimeoutMs ?? value.page_timeout_ms;
|
|
30
146
|
return {
|
|
31
147
|
...value,
|
|
32
148
|
access_profile_id: value.access_profile_id ?? value.accessProfileId,
|
|
33
149
|
field_mode: value.field_mode ?? value.fieldMode ?? value.mode,
|
|
34
150
|
max_preview_rows: value.max_preview_rows ?? value.maxPreviewRows,
|
|
35
151
|
goal: value.goal ?? value.description,
|
|
152
|
+
url_params: value.url_params ?? value.urlParams,
|
|
153
|
+
options: Object.keys(options).length ? options : undefined,
|
|
36
154
|
};
|
|
37
155
|
}, z
|
|
38
156
|
.object({
|
|
39
157
|
url: z.string().url(),
|
|
158
|
+
url_params: UrlParamsSchema.optional(),
|
|
40
159
|
access_profile_id: z.string().uuid().optional(),
|
|
41
160
|
field_mode: z.enum(['auto', 'manual']).optional(),
|
|
42
161
|
goal: z.string().min(1).optional(),
|
|
43
162
|
fields: z.array(FieldSchema).optional(),
|
|
44
163
|
max_preview_rows: z.number().int().positive().optional().default(10),
|
|
164
|
+
options: TaskBuildOptionsSchema.optional(),
|
|
45
165
|
})
|
|
46
166
|
.superRefine((data, ctx) => {
|
|
47
167
|
const effectiveFieldMode = data.field_mode ?? (data.fields?.length ? 'manual' : 'auto');
|
|
@@ -139,17 +259,48 @@ const AccessProfileBuildSessionInputSchema = z.preprocess((raw) => {
|
|
|
139
259
|
hint: z.string().min(1).optional(),
|
|
140
260
|
}));
|
|
141
261
|
export const ToolInputs = {
|
|
142
|
-
health: z.object({}),
|
|
143
262
|
// Tasks (AI-built)
|
|
144
263
|
task_create: TaskCreateInputSchema,
|
|
145
264
|
task_build_status: TaskBuildStatusInputSchema,
|
|
146
265
|
task_publish: TaskPublishInputSchema,
|
|
147
266
|
task_draft_delete: TaskDraftDeleteInputSchema,
|
|
148
267
|
// Runs
|
|
149
|
-
run_create: z.
|
|
268
|
+
run_create: z.preprocess((raw) => {
|
|
269
|
+
if (!raw || typeof raw !== 'object')
|
|
270
|
+
return raw;
|
|
271
|
+
const value = raw;
|
|
272
|
+
const options = value.options && typeof value.options === 'object' ? { ...value.options } : {};
|
|
273
|
+
if (value.userAgent ?? value.user_agent)
|
|
274
|
+
options.userAgent = value.userAgent ?? value.user_agent;
|
|
275
|
+
if (value.acceptLanguage ?? value.accept_language)
|
|
276
|
+
options.acceptLanguage = value.acceptLanguage ?? value.accept_language;
|
|
277
|
+
if (value.locale)
|
|
278
|
+
options.locale = value.locale;
|
|
279
|
+
if (value.timezoneId ?? value.timezone_id)
|
|
280
|
+
options.timezoneId = value.timezoneId ?? value.timezone_id;
|
|
281
|
+
if (value.timeout)
|
|
282
|
+
options.timeout = value.timeout;
|
|
283
|
+
if (value.waitForSelector ?? value.wait_for_selector ?? value.wait_for)
|
|
284
|
+
options.waitForSelector = value.waitForSelector ?? value.wait_for_selector ?? value.wait_for;
|
|
285
|
+
if (value.delay ?? value.delay_ms)
|
|
286
|
+
options.delay = value.delay ?? value.delay_ms;
|
|
287
|
+
if (value.retries)
|
|
288
|
+
options.retries = value.retries;
|
|
289
|
+
return {
|
|
290
|
+
...value,
|
|
291
|
+
pagination: value.pagination ?? value.page,
|
|
292
|
+
options: Object.keys(options).length ? options : undefined,
|
|
293
|
+
};
|
|
294
|
+
}, z
|
|
295
|
+
.object({
|
|
150
296
|
task_id: z.string().uuid(),
|
|
151
297
|
max_runtime_seconds: z.number().int().min(30).max(3600).optional(),
|
|
152
|
-
|
|
298
|
+
pagination: z.record(z.any()).optional(),
|
|
299
|
+
budgets: RunBudgetsSchema.optional(),
|
|
300
|
+
options: RunOptionsSchema.optional(),
|
|
301
|
+
test: z.boolean().optional(),
|
|
302
|
+
})
|
|
303
|
+
.passthrough()),
|
|
153
304
|
run_get: z.object({
|
|
154
305
|
run_id: z.string().uuid(),
|
|
155
306
|
}),
|
|
@@ -183,11 +334,6 @@ export const ToolInputs = {
|
|
|
183
334
|
};
|
|
184
335
|
export function toolDefinitions() {
|
|
185
336
|
return [
|
|
186
|
-
{
|
|
187
|
-
name: 'health',
|
|
188
|
-
description: 'Health check for the Selextract Worker API.',
|
|
189
|
-
inputSchema: { type: 'object', properties: {}, required: [] },
|
|
190
|
-
},
|
|
191
337
|
// Tasks (AI-built)
|
|
192
338
|
{
|
|
193
339
|
name: 'task_create',
|
|
@@ -196,11 +342,23 @@ export function toolDefinitions() {
|
|
|
196
342
|
type: 'object',
|
|
197
343
|
properties: {
|
|
198
344
|
url: { type: 'string', description: 'The page to analyze.' },
|
|
345
|
+
url_params: { type: 'object', description: 'Optional query params to add/override on the URL (ex: {"hl":"en","gl":"US"}).' },
|
|
199
346
|
access_profile_id: { type: 'string', description: 'Optional access profile ID for logged-in/session scraping.' },
|
|
200
347
|
field_mode: { type: 'string', enum: ['auto', 'manual'], description: 'auto = infer fields; manual = use provided fields.' },
|
|
201
348
|
goal: { type: 'string', description: 'Optional short description of what to extract (helps in auto mode).' },
|
|
202
349
|
fields: { type: 'array', items: { type: 'object' }, description: 'Fields to extract (required in manual mode).' },
|
|
203
350
|
max_preview_rows: { type: 'number', description: 'How many sample rows to generate in the preview (default: 10).' },
|
|
351
|
+
options: {
|
|
352
|
+
type: 'object',
|
|
353
|
+
description: 'Optional browsing controls (language/region, time zone, user agent, timeouts).',
|
|
354
|
+
properties: {
|
|
355
|
+
userAgent: { type: 'string' },
|
|
356
|
+
acceptLanguage: { type: 'string', description: 'Sets the Accept-Language request header.' },
|
|
357
|
+
locale: { type: 'string', description: 'Sets Playwright locale (ex: en-US).' },
|
|
358
|
+
timezoneId: { type: 'string', description: 'Sets Playwright timezoneId (IANA, ex: America/Los_Angeles).' },
|
|
359
|
+
pageTimeoutMs: { type: 'number', description: 'Page timeout (ms) used during the build.' },
|
|
360
|
+
},
|
|
361
|
+
},
|
|
204
362
|
},
|
|
205
363
|
required: ['url'],
|
|
206
364
|
},
|
|
@@ -245,6 +403,23 @@ export function toolDefinitions() {
|
|
|
245
403
|
properties: {
|
|
246
404
|
task_id: { type: 'string', description: 'Task ID.' },
|
|
247
405
|
max_runtime_seconds: { type: 'number', description: 'Optional hard limit for run time (seconds).' },
|
|
406
|
+
pagination: { type: 'object', description: 'Optional pagination override for this run only.' },
|
|
407
|
+
budgets: { type: 'object', description: 'Optional safety limits for this run only (ex: {"maxRows":100}).' },
|
|
408
|
+
options: {
|
|
409
|
+
type: 'object',
|
|
410
|
+
description: 'Optional browsing controls (language/region, time zone, user agent, timeouts).',
|
|
411
|
+
properties: {
|
|
412
|
+
timeout: { type: 'number', description: 'Navigation/step timeout (ms).' },
|
|
413
|
+
userAgent: { type: 'string' },
|
|
414
|
+
acceptLanguage: { type: 'string', description: 'Sets the Accept-Language request header.' },
|
|
415
|
+
locale: { type: 'string', description: 'Sets Playwright locale (ex: en-US).' },
|
|
416
|
+
timezoneId: { type: 'string', description: 'Sets Playwright timezoneId (IANA, ex: America/Los_Angeles).' },
|
|
417
|
+
waitForSelector: { type: 'string', description: 'Wait for a selector after navigation.' },
|
|
418
|
+
delay: { type: 'number', description: 'Extra delay (ms) after navigation.' },
|
|
419
|
+
retries: { type: 'number', description: 'Retries for certain flow steps (0-10).' },
|
|
420
|
+
},
|
|
421
|
+
},
|
|
422
|
+
test: { type: 'boolean', description: 'If true, reduces pagination (best-effort) for a quick smoke run.' },
|
|
248
423
|
},
|
|
249
424
|
required: ['task_id'],
|
|
250
425
|
},
|
|
@@ -373,13 +548,6 @@ export function toolDefinitions() {
|
|
|
373
548
|
}
|
|
374
549
|
export function toolHandlers(api, maxChars) {
|
|
375
550
|
const handlers = {
|
|
376
|
-
health: async () => {
|
|
377
|
-
const result = await api.request({
|
|
378
|
-
method: 'GET',
|
|
379
|
-
path: '/health',
|
|
380
|
-
});
|
|
381
|
-
return asTextResult(result, maxChars);
|
|
382
|
-
},
|
|
383
551
|
task_create: async (raw) => {
|
|
384
552
|
const input = ToolInputs.task_create.parse(raw);
|
|
385
553
|
const effectiveFieldMode = input.field_mode ?? (input.fields?.length ? 'manual' : 'auto');
|
|
@@ -388,6 +556,8 @@ export function toolHandlers(api, maxChars) {
|
|
|
388
556
|
path: '/v1/agent/extractions/build',
|
|
389
557
|
body: {
|
|
390
558
|
url: input.url,
|
|
559
|
+
...(input.url_params ? { urlParams: input.url_params } : {}),
|
|
560
|
+
...(input.options ? { options: input.options } : {}),
|
|
391
561
|
access_profile_id: input.access_profile_id,
|
|
392
562
|
field_mode: effectiveFieldMode,
|
|
393
563
|
goal: input.goal,
|
|
@@ -439,10 +609,17 @@ export function toolHandlers(api, maxChars) {
|
|
|
439
609
|
},
|
|
440
610
|
run_create: async (raw) => {
|
|
441
611
|
const input = ToolInputs.run_create.parse(raw);
|
|
612
|
+
const body = {
|
|
613
|
+
...(input.max_runtime_seconds ? { max_runtime_seconds: input.max_runtime_seconds } : {}),
|
|
614
|
+
...(input.pagination ? { pagination: input.pagination } : {}),
|
|
615
|
+
...(input.budgets ? { budgets: input.budgets } : {}),
|
|
616
|
+
...(input.options ? { options: input.options } : {}),
|
|
617
|
+
...(input.test === true ? { test: true } : {}),
|
|
618
|
+
};
|
|
442
619
|
const result = await api.request({
|
|
443
620
|
method: 'POST',
|
|
444
621
|
path: `/v1/tasks/${input.task_id}/runs`,
|
|
445
|
-
body:
|
|
622
|
+
body: Object.keys(body).length ? body : {},
|
|
446
623
|
});
|
|
447
624
|
return asTextResult(result, maxChars);
|
|
448
625
|
},
|