@selextract/mcp-selextract 0.5.0 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -64,6 +64,27 @@ Useful extras:
64
64
  - Authenticated scraping: access profile tools (create/list/update/delete/build-session)
65
65
  - Run lifecycle: `run_get`, `run_list`, `run_stop`, `run_delete`
66
66
 
67
+ ## Locale / region controls
68
+
69
+ You can control language and region with general-purpose settings that work across many sites:
70
+
71
+ - `task_create`
72
+ - `url_params`: adds or overrides query params on the URL (example: `{"hl":"en","gl":"US"}`)
73
+ - `options.acceptLanguage`: sets the `Accept-Language` request header
74
+ - `options.locale`: sets Playwright locale (example: `en-US`)
75
+ - `options.timezoneId`: sets Playwright timezoneId (example: `America/Los_Angeles`)
76
+ - `run_create`
77
+ - `options`: the same `acceptLanguage`, `locale`, and `timezoneId` settings as above, applied to this run only
78
+ - `budgets.maxRows`: stop after N rows (example: `100`)
79
+
80
+ ## Field types
81
+
82
+ For `task_create.fields`, `type` can be one of:
83
+
84
+ - `text`, `number`, `money`, `url`, `image_url`, `html`, `unknown`
85
+
86
+ Common aliases like `string`, `int`, `price`, and `link` are accepted and mapped to the canonical types (`text`, `number`, `money`, and `url`, respectively).
87
+
67
88
  ## Resources (read-only)
68
89
 
69
90
  - `selextract://help` (usage guide)
package/dist/http.js CHANGED
@@ -17,15 +17,18 @@ export class SelextractApiClient {
17
17
  const controller = new AbortController();
18
18
  const timeout = setTimeout(() => controller.abort(), this.timeoutMs);
19
19
  try {
20
+ const headers = {
21
+ accept: 'application/json',
22
+ // Worker accepts Bearer JWT or `sk_...` api keys; it treats `sk_...` as an API key.
23
+ authorization: `Bearer ${this.apiKey}`,
24
+ };
25
+ const hasBody = opts.body !== undefined;
26
+ if (hasBody)
27
+ headers['content-type'] = 'application/json';
20
28
  const resp = await this.fetchImpl(url.toString(), {
21
29
  method: opts.method,
22
- headers: {
23
- accept: 'application/json',
24
- 'content-type': 'application/json',
25
- // Worker accepts Bearer JWT or `sk_...` api keys; it treats `sk_...` as an API key.
26
- authorization: `Bearer ${this.apiKey}`,
27
- },
28
- body: opts.body === undefined ? undefined : JSON.stringify(opts.body),
30
+ headers,
31
+ body: hasBody ? JSON.stringify(opts.body) : undefined,
29
32
  signal: controller.signal,
30
33
  });
31
34
  const contentType = resp.headers.get('content-type') ?? '';
package/dist/tools.js CHANGED
@@ -11,37 +11,157 @@ function asTextResult(value, maxChars) {
11
11
  ],
12
12
  };
13
13
  }
14
+ const AgentFieldTypeSchema = z.enum(['text', 'number', 'money', 'url', 'image_url', 'html', 'unknown']);
15
+ function coerceAgentFieldType(raw) {
16
+ if (raw == null)
17
+ return undefined;
18
+ if (typeof raw !== 'string')
19
+ return undefined;
20
+ const cleaned = raw.trim().toLowerCase();
21
+ if (!cleaned)
22
+ return undefined;
23
+ const direct = AgentFieldTypeSchema.safeParse(cleaned);
24
+ if (direct.success)
25
+ return direct.data;
26
+ const map = {
27
+ string: 'text',
28
+ str: 'text',
29
+ text: 'text',
30
+ number: 'number',
31
+ int: 'number',
32
+ integer: 'number',
33
+ float: 'number',
34
+ double: 'number',
35
+ decimal: 'number',
36
+ currency: 'money',
37
+ price: 'money',
38
+ money: 'money',
39
+ link: 'url',
40
+ href: 'url',
41
+ uri: 'url',
42
+ url: 'url',
43
+ image: 'image_url',
44
+ img: 'image_url',
45
+ 'image-url': 'image_url',
46
+ imageurl: 'image_url',
47
+ html: 'html',
48
+ markup: 'html',
49
+ any: 'unknown',
50
+ json: 'unknown',
51
+ unknown: 'unknown',
52
+ };
53
+ return map[cleaned];
54
+ }
14
55
  const FieldSchema = z
15
- .object({
56
+ .preprocess((raw) => {
57
+ if (!raw || typeof raw !== 'object')
58
+ return raw;
59
+ const value = raw;
60
+ return {
61
+ ...value,
62
+ value: value.value ?? value.description,
63
+ type: coerceAgentFieldType(value.type) ?? value.type,
64
+ };
65
+ }, z.object({
16
66
  name: z.string().min(1).optional(),
17
67
  value: z.string().min(1).optional(),
18
- type: z.string().optional(),
68
+ type: AgentFieldTypeSchema.optional(),
19
69
  required: z.boolean().optional(),
20
70
  examples: z.array(z.string()).optional(),
21
- })
71
+ }))
22
72
  .refine((data) => Boolean(data.name?.trim() || data.value?.trim()), {
23
73
  message: 'Provide a field name (name) or a short description (value).',
24
74
  path: ['value'],
25
75
  });
76
+ const UrlParamsSchema = z.record(z.union([z.string(), z.number(), z.boolean(), z.null()]));
77
+ const TaskBuildOptionsSchema = z.preprocess((raw) => {
78
+ if (!raw || typeof raw !== 'object')
79
+ return raw;
80
+ const value = raw;
81
+ return {
82
+ ...value,
83
+ userAgent: value.userAgent ?? value.user_agent,
84
+ acceptLanguage: value.acceptLanguage ?? value.accept_language,
85
+ timezoneId: value.timezoneId ?? value.timezone_id,
86
+ pageTimeoutMs: value.pageTimeoutMs ?? value.page_timeout_ms,
87
+ };
88
+ }, z
89
+ .object({
90
+ userAgent: z.string().min(1).max(500).optional(),
91
+ acceptLanguage: z.string().min(1).max(500).optional(),
92
+ locale: z.string().min(1).max(64).optional(),
93
+ timezoneId: z.string().min(1).max(64).optional(),
94
+ pageTimeoutMs: z.number().int().min(1000).max(120000).optional(),
95
+ })
96
+ .passthrough());
97
+ const RunOptionsSchema = z.preprocess((raw) => {
98
+ if (!raw || typeof raw !== 'object')
99
+ return raw;
100
+ const value = raw;
101
+ return {
102
+ ...value,
103
+ userAgent: value.userAgent ?? value.user_agent,
104
+ acceptLanguage: value.acceptLanguage ?? value.accept_language,
105
+ timezoneId: value.timezoneId ?? value.timezone_id,
106
+ waitForSelector: value.waitForSelector ?? value.wait_for_selector ?? value.wait_for,
107
+ delay: value.delay ?? value.delay_ms,
108
+ };
109
+ }, z
110
+ .object({
111
+ timeout: z.number().int().min(1000).max(120000).optional(),
112
+ userAgent: z.string().min(1).max(500).optional(),
113
+ acceptLanguage: z.string().min(1).max(500).optional(),
114
+ locale: z.string().min(1).max(64).optional(),
115
+ timezoneId: z.string().min(1).max(64).optional(),
116
+ waitForSelector: z.string().min(1).max(500).optional(),
117
+ delay: z.number().int().min(0).max(60000).optional(),
118
+ retries: z.number().int().min(0).max(10).optional(),
119
+ })
120
+ .passthrough());
121
+ const RunBudgetsSchema = z
122
+ .object({
123
+ maxPages: z.number().int().min(1).max(1000).optional(),
124
+ maxScrolls: z.number().int().min(1).max(500).optional(),
125
+ maxTimeMs: z.number().int().min(1000).max(3600000).optional(),
126
+ maxRows: z.number().int().min(1).max(200000).optional(),
127
+ maxSteps: z.number().int().min(1).max(500).optional(),
128
+ maxRowBytes: z.number().int().min(100).max(1000000).optional(),
129
+ })
130
+ .passthrough();
26
131
  const TaskCreateInputSchema = z.preprocess((raw) => {
27
132
  if (!raw || typeof raw !== 'object')
28
133
  return raw;
29
134
  const value = raw;
135
+ const options = value.options && typeof value.options === 'object' ? { ...value.options } : {};
136
+ if (value.userAgent ?? value.user_agent)
137
+ options.userAgent = value.userAgent ?? value.user_agent;
138
+ if (value.acceptLanguage ?? value.accept_language)
139
+ options.acceptLanguage = value.acceptLanguage ?? value.accept_language;
140
+ if (value.locale)
141
+ options.locale = value.locale;
142
+ if (value.timezoneId ?? value.timezone_id)
143
+ options.timezoneId = value.timezoneId ?? value.timezone_id;
144
+ if (value.pageTimeoutMs ?? value.page_timeout_ms)
145
+ options.pageTimeoutMs = value.pageTimeoutMs ?? value.page_timeout_ms;
30
146
  return {
31
147
  ...value,
32
148
  access_profile_id: value.access_profile_id ?? value.accessProfileId,
33
149
  field_mode: value.field_mode ?? value.fieldMode ?? value.mode,
34
150
  max_preview_rows: value.max_preview_rows ?? value.maxPreviewRows,
35
151
  goal: value.goal ?? value.description,
152
+ url_params: value.url_params ?? value.urlParams,
153
+ options: Object.keys(options).length ? options : undefined,
36
154
  };
37
155
  }, z
38
156
  .object({
39
157
  url: z.string().url(),
158
+ url_params: UrlParamsSchema.optional(),
40
159
  access_profile_id: z.string().uuid().optional(),
41
160
  field_mode: z.enum(['auto', 'manual']).optional(),
42
161
  goal: z.string().min(1).optional(),
43
162
  fields: z.array(FieldSchema).optional(),
44
163
  max_preview_rows: z.number().int().positive().optional().default(10),
164
+ options: TaskBuildOptionsSchema.optional(),
45
165
  })
46
166
  .superRefine((data, ctx) => {
47
167
  const effectiveFieldMode = data.field_mode ?? (data.fields?.length ? 'manual' : 'auto');
@@ -139,17 +259,48 @@ const AccessProfileBuildSessionInputSchema = z.preprocess((raw) => {
139
259
  hint: z.string().min(1).optional(),
140
260
  }));
141
261
  export const ToolInputs = {
142
- health: z.object({}),
143
262
  // Tasks (AI-built)
144
263
  task_create: TaskCreateInputSchema,
145
264
  task_build_status: TaskBuildStatusInputSchema,
146
265
  task_publish: TaskPublishInputSchema,
147
266
  task_draft_delete: TaskDraftDeleteInputSchema,
148
267
  // Runs
149
- run_create: z.object({
268
+ run_create: z.preprocess((raw) => {
269
+ if (!raw || typeof raw !== 'object')
270
+ return raw;
271
+ const value = raw;
272
+ const options = value.options && typeof value.options === 'object' ? { ...value.options } : {};
273
+ if (value.userAgent ?? value.user_agent)
274
+ options.userAgent = value.userAgent ?? value.user_agent;
275
+ if (value.acceptLanguage ?? value.accept_language)
276
+ options.acceptLanguage = value.acceptLanguage ?? value.accept_language;
277
+ if (value.locale)
278
+ options.locale = value.locale;
279
+ if (value.timezoneId ?? value.timezone_id)
280
+ options.timezoneId = value.timezoneId ?? value.timezone_id;
281
+ if (value.timeout)
282
+ options.timeout = value.timeout;
283
+ if (value.waitForSelector ?? value.wait_for_selector ?? value.wait_for)
284
+ options.waitForSelector = value.waitForSelector ?? value.wait_for_selector ?? value.wait_for;
285
+ if (value.delay ?? value.delay_ms)
286
+ options.delay = value.delay ?? value.delay_ms;
287
+ if (value.retries)
288
+ options.retries = value.retries;
289
+ return {
290
+ ...value,
291
+ pagination: value.pagination ?? value.page,
292
+ options: Object.keys(options).length ? options : undefined,
293
+ };
294
+ }, z
295
+ .object({
150
296
  task_id: z.string().uuid(),
151
297
  max_runtime_seconds: z.number().int().min(30).max(3600).optional(),
152
- }),
298
+ pagination: z.record(z.any()).optional(),
299
+ budgets: RunBudgetsSchema.optional(),
300
+ options: RunOptionsSchema.optional(),
301
+ test: z.boolean().optional(),
302
+ })
303
+ .passthrough()),
153
304
  run_get: z.object({
154
305
  run_id: z.string().uuid(),
155
306
  }),
@@ -183,11 +334,6 @@ export const ToolInputs = {
183
334
  };
184
335
  export function toolDefinitions() {
185
336
  return [
186
- {
187
- name: 'health',
188
- description: 'Health check for the Selextract Worker API.',
189
- inputSchema: { type: 'object', properties: {}, required: [] },
190
- },
191
337
  // Tasks (AI-built)
192
338
  {
193
339
  name: 'task_create',
@@ -196,11 +342,23 @@ export function toolDefinitions() {
196
342
  type: 'object',
197
343
  properties: {
198
344
  url: { type: 'string', description: 'The page to analyze.' },
345
+ url_params: { type: 'object', description: 'Optional query params to add/override on the URL (ex: {"hl":"en","gl":"US"}).' },
199
346
  access_profile_id: { type: 'string', description: 'Optional access profile ID for logged-in/session scraping.' },
200
347
  field_mode: { type: 'string', enum: ['auto', 'manual'], description: 'auto = infer fields; manual = use provided fields.' },
201
348
  goal: { type: 'string', description: 'Optional short description of what to extract (helps in auto mode).' },
202
349
  fields: { type: 'array', items: { type: 'object' }, description: 'Fields to extract (required in manual mode).' },
203
350
  max_preview_rows: { type: 'number', description: 'How many sample rows to generate in the preview (default: 10).' },
351
+ options: {
352
+ type: 'object',
353
+ description: 'Optional browsing controls (language/region, time zone, user agent, timeouts).',
354
+ properties: {
355
+ userAgent: { type: 'string' },
356
+ acceptLanguage: { type: 'string', description: 'Sets the Accept-Language request header.' },
357
+ locale: { type: 'string', description: 'Sets Playwright locale (ex: en-US).' },
358
+ timezoneId: { type: 'string', description: 'Sets Playwright timezoneId (IANA, ex: America/Los_Angeles).' },
359
+ pageTimeoutMs: { type: 'number', description: 'Page timeout (ms) used during the build.' },
360
+ },
361
+ },
204
362
  },
205
363
  required: ['url'],
206
364
  },
@@ -245,6 +403,23 @@ export function toolDefinitions() {
245
403
  properties: {
246
404
  task_id: { type: 'string', description: 'Task ID.' },
247
405
  max_runtime_seconds: { type: 'number', description: 'Optional hard limit for run time (seconds).' },
406
+ pagination: { type: 'object', description: 'Optional pagination override for this run only.' },
407
+ budgets: { type: 'object', description: 'Optional safety limits for this run only (ex: {"maxRows":100}).' },
408
+ options: {
409
+ type: 'object',
410
+ description: 'Optional browsing controls (language/region, time zone, user agent, timeouts).',
411
+ properties: {
412
+ timeout: { type: 'number', description: 'Navigation/step timeout (ms).' },
413
+ userAgent: { type: 'string' },
414
+ acceptLanguage: { type: 'string', description: 'Sets the Accept-Language request header.' },
415
+ locale: { type: 'string', description: 'Sets Playwright locale (ex: en-US).' },
416
+ timezoneId: { type: 'string', description: 'Sets Playwright timezoneId (IANA, ex: America/Los_Angeles).' },
417
+ waitForSelector: { type: 'string', description: 'Wait for a selector after navigation.' },
418
+ delay: { type: 'number', description: 'Extra delay (ms) after navigation.' },
419
+ retries: { type: 'number', description: 'Retries for certain flow steps (0-10).' },
420
+ },
421
+ },
422
+ test: { type: 'boolean', description: 'If true, reduces pagination (best-effort) for a quick smoke run.' },
248
423
  },
249
424
  required: ['task_id'],
250
425
  },
@@ -373,13 +548,6 @@ export function toolDefinitions() {
373
548
  }
374
549
  export function toolHandlers(api, maxChars) {
375
550
  const handlers = {
376
- health: async () => {
377
- const result = await api.request({
378
- method: 'GET',
379
- path: '/health',
380
- });
381
- return asTextResult(result, maxChars);
382
- },
383
551
  task_create: async (raw) => {
384
552
  const input = ToolInputs.task_create.parse(raw);
385
553
  const effectiveFieldMode = input.field_mode ?? (input.fields?.length ? 'manual' : 'auto');
@@ -388,6 +556,8 @@ export function toolHandlers(api, maxChars) {
388
556
  path: '/v1/agent/extractions/build',
389
557
  body: {
390
558
  url: input.url,
559
+ ...(input.url_params ? { urlParams: input.url_params } : {}),
560
+ ...(input.options ? { options: input.options } : {}),
391
561
  access_profile_id: input.access_profile_id,
392
562
  field_mode: effectiveFieldMode,
393
563
  goal: input.goal,
@@ -439,10 +609,17 @@ export function toolHandlers(api, maxChars) {
439
609
  },
440
610
  run_create: async (raw) => {
441
611
  const input = ToolInputs.run_create.parse(raw);
612
+ const body = {
613
+ ...(input.max_runtime_seconds ? { max_runtime_seconds: input.max_runtime_seconds } : {}),
614
+ ...(input.pagination ? { pagination: input.pagination } : {}),
615
+ ...(input.budgets ? { budgets: input.budgets } : {}),
616
+ ...(input.options ? { options: input.options } : {}),
617
+ ...(input.test === true ? { test: true } : {}),
618
+ };
442
619
  const result = await api.request({
443
620
  method: 'POST',
444
621
  path: `/v1/tasks/${input.task_id}/runs`,
445
- body: input.max_runtime_seconds ? { max_runtime_seconds: input.max_runtime_seconds } : {},
622
+ body: Object.keys(body).length ? body : {},
446
623
  });
447
624
  return asTextResult(result, maxChars);
448
625
  },
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@selextract/mcp-selextract",
3
- "version": "0.5.0",
3
+ "version": "0.5.3",
4
4
  "description": "Selextract Cloud MCP server (local stdio) for calling the Selextract Worker API",
5
5
  "license": "UNLICENSED",
6
6
  "type": "module",