npm - @selextract/mcp-selextract - Version diff - 0.4.0 → 0.5.1 - Mend

@selextract/mcp-selextract 0.4.0 → 0.5.1

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (5) hide show

package/README.md CHANGED Viewed

@@ -6,11 +6,12 @@ If you can run `npx`, you can add it to your MCP client with one config entry.
 ## Env
-- `SELEXTRACT_API_URL`
-  - Examples: `http://localhost:8246`, `https://api.selextract.com`, `https://api.selextract.com/api`
-  - This server normalizes it to end with `/api` (if you pass `/api/v1`, it will trim back to `/api`).
 - `SELEXTRACT_API_KEY` (your `sk_...` user API key)
   - Legacy alias: `SELEXTRACT_API_TOKEN`
+- Optional: `SELEXTRACT_API_URL` (defaults to `https://app.selextract.com/api`)
+  - Use this only if you are pointing at a local dev server or a self-hosted endpoint.
+  - Examples: `http://localhost:8246`, `https://app.selextract.com/api`
+  - This server normalizes it to end with `/api` (if you pass `/api/v1`, it will trim back to `/api`).
 - Optional: `SELEXTRACT_TIMEOUT_MS` (default: `30000`)
 - Optional: `SELEXTRACT_MAX_RESPONSE_CHARS` (default: `30000`)
 - Optional: `SELEXTRACT_ENV_FILE` (default: `.env`)
@@ -32,7 +33,6 @@ Add this to your MCP config (keep keys in env vars if possible):
       "command": "npx",
       "args": ["-y", "--package", "@selextract/mcp-selextract", "mcp-selextract"],
       "env": {
-        "SELEXTRACT_API_URL": "http://localhost:8246",
         "SELEXTRACT_API_KEY": "sk_REPLACE_ME"
       }
     }
@@ -40,6 +40,14 @@ Add this to your MCP config (keep keys in env vars if possible):
 }
 ```
+If you want to point at a local dev server, add:
+```json
+{
+  "SELEXTRACT_API_URL": "http://localhost:8246"
+}
+```
 ## What you can do
 Typical flow:
@@ -53,11 +61,30 @@ Typical flow:
 Useful extras:
 - Draft cleanup: `task_draft_delete`
-- Task repair (self-healing rebuild): `task_repair`
-- Recipe versioning (rollback/switch): `task_recipe_versions`, `task_set_recipe_version`
 - Authenticated scraping: access profile tools (create/list/update/delete/build-session)
 - Run lifecycle: `run_get`, `run_list`, `run_stop`, `run_delete`
+## Locale / region controls
+You can control language/region in a general way (works across many sites):
+- `task_create`
+  - `url_params`: adds or overrides query params on the URL (example: `{"hl":"en","gl":"US"}`)
+  - `options.acceptLanguage`: sets the `Accept-Language` request header
+  - `options.locale`: sets Playwright locale (example: `en-US`)
+  - `options.timezoneId`: sets Playwright timezoneId (example: `America/Los_Angeles`)
+- `run_create`
+  - `options`: same as above (applies to the run)
+  - `budgets.maxRows`: stop after N rows (example: `100`)
+## Field types
+For `task_create.fields`, `type` can be one of:
+- `text`, `number`, `money`, `url`, `image_url`, `html`, `unknown`
+Common aliases like `string`, `int`, `price`, and `link` are accepted and mapped.
 ## Resources (read-only)
 - `selextract://help` (usage guide)

package/dist/config.js CHANGED Viewed

@@ -1,6 +1,8 @@
 import { z } from 'zod';
+const DEFAULT_SELEXTRACT_API_URL = 'https://app.selextract.com/api';
 const EnvSchema = z.object({
-    SELEXTRACT_API_URL: z.string().min(1),
+    // Optional because Selextract Cloud is the default.
+    SELEXTRACT_API_URL: z.string().min(1).optional().default(DEFAULT_SELEXTRACT_API_URL),
     SELEXTRACT_API_KEY: z.string().min(1).optional(),
     // Legacy alias (kept for older configs).
     SELEXTRACT_API_TOKEN: z.string().min(1).optional(),

package/dist/resources.js CHANGED Viewed

@@ -31,16 +31,16 @@ export async function readResource(opts) {
             '',
             'This MCP server exposes only **basic** operations for:',
             '- Creating AI-built tasks (draft → publish)',
-            '- Repairing tasks (rebuild recipe)',
-            '- Switching task recipe versions (rollback)',
             '- Running tasks (create/get/list/stop/delete runs)',
             '- Managing access profiles (for logged-in/session scraping)',
             '',
             'It intentionally does **not** expose trace/scratchpad or other deep debugging data.',
             '',
             'Required env vars:',
-            '- SELEXTRACT_API_URL',
             '- SELEXTRACT_API_KEY (or legacy SELEXTRACT_API_TOKEN)',
+            '',
+            'Optional env vars:',
+            '- SELEXTRACT_API_URL (defaults to https://app.selextract.com/api)',
         ].join('\n');
         return {
             contents: [{ uri: opts.uri, mimeType: 'text/markdown', text: value }],

package/dist/tools.js CHANGED Viewed

@@ -11,44 +11,157 @@ function asTextResult(value, maxChars) {
         ],
     };
 }
+const AgentFieldTypeSchema = z.enum(['text', 'number', 'money', 'url', 'image_url', 'html', 'unknown']);
+function coerceAgentFieldType(raw) {
+    if (raw == null)
+        return undefined;
+    if (typeof raw !== 'string')
+        return undefined;
+    const cleaned = raw.trim().toLowerCase();
+    if (!cleaned)
+        return undefined;
+    const direct = AgentFieldTypeSchema.safeParse(cleaned);
+    if (direct.success)
+        return direct.data;
+    const map = {
+        string: 'text',
+        str: 'text',
+        text: 'text',
+        number: 'number',
+        int: 'number',
+        integer: 'number',
+        float: 'number',
+        double: 'number',
+        decimal: 'number',
+        currency: 'money',
+        price: 'money',
+        money: 'money',
+        link: 'url',
+        href: 'url',
+        uri: 'url',
+        url: 'url',
+        image: 'image_url',
+        img: 'image_url',
+        'image-url': 'image_url',
+        imageurl: 'image_url',
+        html: 'html',
+        markup: 'html',
+        any: 'unknown',
+        json: 'unknown',
+        unknown: 'unknown',
+    };
+    return map[cleaned];
+}
 const FieldSchema = z
-    .object({
+    .preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        value: value.value ?? value.description,
+        type: coerceAgentFieldType(value.type) ?? value.type,
+    };
+}, z.object({
     name: z.string().min(1).optional(),
     value: z.string().min(1).optional(),
-    type: z.string().optional(),
+    type: AgentFieldTypeSchema.optional(),
     required: z.boolean().optional(),
     examples: z.array(z.string()).optional(),
-})
+}))
     .refine((data) => Boolean(data.name?.trim() || data.value?.trim()), {
     message: 'Provide a field name (name) or a short description (value).',
     path: ['value'],
 });
+const UrlParamsSchema = z.record(z.union([z.string(), z.number(), z.boolean(), z.null()]));
+const TaskBuildOptionsSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        userAgent: value.userAgent ?? value.user_agent,
+        acceptLanguage: value.acceptLanguage ?? value.accept_language,
+        timezoneId: value.timezoneId ?? value.timezone_id,
+        pageTimeoutMs: value.pageTimeoutMs ?? value.page_timeout_ms,
+    };
+}, z
+    .object({
+    userAgent: z.string().min(1).max(500).optional(),
+    acceptLanguage: z.string().min(1).max(500).optional(),
+    locale: z.string().min(1).max(64).optional(),
+    timezoneId: z.string().min(1).max(64).optional(),
+    pageTimeoutMs: z.number().int().min(1000).max(120000).optional(),
+})
+    .passthrough());
+const RunOptionsSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        userAgent: value.userAgent ?? value.user_agent,
+        acceptLanguage: value.acceptLanguage ?? value.accept_language,
+        timezoneId: value.timezoneId ?? value.timezone_id,
+        waitForSelector: value.waitForSelector ?? value.wait_for_selector ?? value.wait_for,
+        delay: value.delay ?? value.delay_ms,
+    };
+}, z
+    .object({
+    timeout: z.number().int().min(1000).max(120000).optional(),
+    userAgent: z.string().min(1).max(500).optional(),
+    acceptLanguage: z.string().min(1).max(500).optional(),
+    locale: z.string().min(1).max(64).optional(),
+    timezoneId: z.string().min(1).max(64).optional(),
+    waitForSelector: z.string().min(1).max(500).optional(),
+    delay: z.number().int().min(0).max(60000).optional(),
+    retries: z.number().int().min(0).max(10).optional(),
+})
+    .passthrough());
+const RunBudgetsSchema = z
+    .object({
+    maxPages: z.number().int().min(1).max(1000).optional(),
+    maxScrolls: z.number().int().min(1).max(500).optional(),
+    maxTimeMs: z.number().int().min(1000).max(3600000).optional(),
+    maxRows: z.number().int().min(1).max(200000).optional(),
+    maxSteps: z.number().int().min(1).max(500).optional(),
+    maxRowBytes: z.number().int().min(100).max(1000000).optional(),
+})
+    .passthrough();
 const TaskCreateInputSchema = z.preprocess((raw) => {
     if (!raw || typeof raw !== 'object')
         return raw;
     const value = raw;
+    const options = value.options && typeof value.options === 'object' ? { ...value.options } : {};
+    if (value.userAgent ?? value.user_agent)
+        options.userAgent = value.userAgent ?? value.user_agent;
+    if (value.acceptLanguage ?? value.accept_language)
+        options.acceptLanguage = value.acceptLanguage ?? value.accept_language;
+    if (value.locale)
+        options.locale = value.locale;
+    if (value.timezoneId ?? value.timezone_id)
+        options.timezoneId = value.timezoneId ?? value.timezone_id;
+    if (value.pageTimeoutMs ?? value.page_timeout_ms)
+        options.pageTimeoutMs = value.pageTimeoutMs ?? value.page_timeout_ms;
     return {
         ...value,
         access_profile_id: value.access_profile_id ?? value.accessProfileId,
         field_mode: value.field_mode ?? value.fieldMode ?? value.mode,
-        build_mode: value.build_mode ?? value.buildMode,
-        script_source: value.script_source ?? value.scriptSource,
-        recipe_override: value.recipe_override ?? value.recipeOverride,
         max_preview_rows: value.max_preview_rows ?? value.maxPreviewRows,
         goal: value.goal ?? value.description,
+        url_params: value.url_params ?? value.urlParams,
+        options: Object.keys(options).length ? options : undefined,
     };
 }, z
     .object({
     url: z.string().url(),
+    url_params: UrlParamsSchema.optional(),
     access_profile_id: z.string().uuid().optional(),
     field_mode: z.enum(['auto', 'manual']).optional(),
-    build_mode: z.enum(['selectors', 'flow', 'code', 'auto']).optional().default('auto'),
-    script_source: z.string().min(1).optional(),
-    recipe_override: z.any().optional(),
     goal: z.string().min(1).optional(),
     fields: z.array(FieldSchema).optional(),
     max_preview_rows: z.number().int().positive().optional().default(10),
-    advanced: z.boolean().optional().default(false),
+    options: TaskBuildOptionsSchema.optional(),
 })
     .superRefine((data, ctx) => {
     const effectiveFieldMode = data.field_mode ?? (data.fields?.length ? 'manual' : 'auto');
@@ -79,39 +192,6 @@ const TaskPublishInputSchema = z.preprocess((raw) => {
 const TaskDraftDeleteInputSchema = z.object({
     preview_id: z.string().uuid(),
 });
-const TaskRepairInputSchema = z.preprocess((raw) => {
-    if (!raw || typeof raw !== 'object')
-        return raw;
-    const value = raw;
-    return {
-        ...value,
-        hint: value.hint ?? value.what_is_wrong ?? value.whatIsWrong ?? value.issue,
-    };
-}, z
-    .object({
-    task_id: z.string().uuid(),
-    hint: z.string().min(1).max(2000).optional(),
-    force: z.boolean().optional().default(false),
-})
-    .passthrough());
-const TaskRecipeVersionsInputSchema = z.object({
-    task_id: z.string().uuid(),
-});
-const TaskSetRecipeVersionInputSchema = z
-    .object({
-    task_id: z.string().uuid(),
-    recipe_version_id: z.string().uuid().optional(),
-    version: z.coerce.number().int().positive().optional(),
-})
-    .superRefine((data, ctx) => {
-    if (!data.recipe_version_id && !data.version) {
-        ctx.addIssue({
-            code: z.ZodIssueCode.custom,
-            message: 'Provide recipe_version_id or version.',
-            path: ['recipe_version_id'],
-        });
-    }
-});
 const AccessProfileHeaderSchema = z.object({
     name: z.string().min(1),
     value: z.string().min(1),
@@ -185,14 +265,43 @@ export const ToolInputs = {
     task_build_status: TaskBuildStatusInputSchema,
     task_publish: TaskPublishInputSchema,
     task_draft_delete: TaskDraftDeleteInputSchema,
-    task_repair: TaskRepairInputSchema,
-    task_recipe_versions: TaskRecipeVersionsInputSchema,
-    task_set_recipe_version: TaskSetRecipeVersionInputSchema,
     // Runs
-    run_create: z.object({
+    run_create: z.preprocess((raw) => {
+        if (!raw || typeof raw !== 'object')
+            return raw;
+        const value = raw;
+        const options = value.options && typeof value.options === 'object' ? { ...value.options } : {};
+        if (value.userAgent ?? value.user_agent)
+            options.userAgent = value.userAgent ?? value.user_agent;
+        if (value.acceptLanguage ?? value.accept_language)
+            options.acceptLanguage = value.acceptLanguage ?? value.accept_language;
+        if (value.locale)
+            options.locale = value.locale;
+        if (value.timezoneId ?? value.timezone_id)
+            options.timezoneId = value.timezoneId ?? value.timezone_id;
+        if (value.timeout)
+            options.timeout = value.timeout;
+        if (value.waitForSelector ?? value.wait_for_selector ?? value.wait_for)
+            options.waitForSelector = value.waitForSelector ?? value.wait_for_selector ?? value.wait_for;
+        if (value.delay ?? value.delay_ms)
+            options.delay = value.delay ?? value.delay_ms;
+        if (value.retries)
+            options.retries = value.retries;
+        return {
+            ...value,
+            pagination: value.pagination ?? value.page,
+            options: Object.keys(options).length ? options : undefined,
+        };
+    }, z
+        .object({
         task_id: z.string().uuid(),
         max_runtime_seconds: z.number().int().min(30).max(3600).optional(),
-    }),
+        pagination: z.record(z.any()).optional(),
+        budgets: RunBudgetsSchema.optional(),
+        options: RunOptionsSchema.optional(),
+        test: z.boolean().optional(),
+    })
+        .passthrough()),
     run_get: z.object({
         run_id: z.string().uuid(),
     }),
@@ -239,15 +348,23 @@ export function toolDefinitions() {
                 type: 'object',
                 properties: {
                     url: { type: 'string', description: 'The page to analyze.' },
+                    url_params: { type: 'object', description: 'Optional query params to add/override on the URL (ex: {"hl":"en","gl":"US"}).' },
                     access_profile_id: { type: 'string', description: 'Optional access profile ID for logged-in/session scraping.' },
                     field_mode: { type: 'string', enum: ['auto', 'manual'], description: 'auto = infer fields; manual = use provided fields.' },
-                    build_mode: { type: 'string', enum: ['selectors', 'flow', 'code', 'auto'], description: 'How to build the draft (default: auto).' },
-                    script_source: { type: 'string', description: 'Optional Playwright script source (code mode). If provided, the server previews this exact script.' },
-                    recipe_override: { type: 'object', description: 'Optional base recipe to reuse when previewing custom scripts (keeps flow/dom settings).' },
                     goal: { type: 'string', description: 'Optional short description of what to extract (helps in auto mode).' },
                     fields: { type: 'array', items: { type: 'object' }, description: 'Fields to extract (required in manual mode).' },
                     max_preview_rows: { type: 'number', description: 'How many sample rows to generate in the preview (default: 10).' },
-                    advanced: { type: 'boolean', description: 'Allow more complex extraction strategies (may take longer).' },
+                    options: {
+                        type: 'object',
+                        description: 'Optional browsing controls (language/region, time zone, user agent, timeouts).',
+                        properties: {
+                            userAgent: { type: 'string' },
+                            acceptLanguage: { type: 'string', description: 'Sets the Accept-Language request header.' },
+                            locale: { type: 'string', description: 'Sets Playwright locale (ex: en-US).' },
+                            timezoneId: { type: 'string', description: 'Sets Playwright timezoneId (IANA, ex: America/Los_Angeles).' },
+                            pageTimeoutMs: { type: 'number', description: 'Page timeout (ms) used during the build.' },
+                        },
+                    },
                 },
                 required: ['url'],
             },
@@ -283,41 +400,6 @@ export function toolDefinitions() {
                 required: ['preview_id'],
             },
         },
-        {
-            name: 'task_repair',
-            description: 'Repair a saved task by re-running the agent builder (creates a new recipe version and updates the task in place). Returns build_job_id.',
-            inputSchema: {
-                type: 'object',
-                properties: {
-                    task_id: { type: 'string', description: 'Task ID.' },
-                    hint: { type: 'string', description: 'Optional note about what is wrong (helps steer the repair).' },
-                    force: { type: 'boolean', description: 'If true, queue even if a build is already in progress.' },
-                },
-                required: ['task_id'],
-            },
-        },
-        {
-            name: 'task_recipe_versions',
-            description: 'List recipe versions for a task (for rollback/version switching).',
-            inputSchema: {
-                type: 'object',
-                properties: { task_id: { type: 'string', description: 'Task ID.' } },
-                required: ['task_id'],
-            },
-        },
-        {
-            name: 'task_set_recipe_version',
-            description: 'Switch a task to use a specific recipe version (by recipe_version_id or by version number).',
-            inputSchema: {
-                type: 'object',
-                properties: {
-                    task_id: { type: 'string', description: 'Task ID.' },
-                    recipe_version_id: { type: 'string', description: 'Recipe version ID.' },
-                    version: { type: 'number', description: 'Recipe version number (1, 2, 3, ...).' },
-                },
-                required: ['task_id'],
-            },
-        },
         // Runs
         {
             name: 'run_create',
@@ -327,6 +409,23 @@ export function toolDefinitions() {
                 properties: {
                     task_id: { type: 'string', description: 'Task ID.' },
                     max_runtime_seconds: { type: 'number', description: 'Optional hard limit for run time (seconds).' },
+                    pagination: { type: 'object', description: 'Optional pagination override for this run only.' },
+                    budgets: { type: 'object', description: 'Optional safety limits for this run only (ex: {"maxRows":100}).' },
+                    options: {
+                        type: 'object',
+                        description: 'Optional browsing controls (language/region, time zone, user agent, timeouts).',
+                        properties: {
+                            timeout: { type: 'number', description: 'Navigation/step timeout (ms).' },
+                            userAgent: { type: 'string' },
+                            acceptLanguage: { type: 'string', description: 'Sets the Accept-Language request header.' },
+                            locale: { type: 'string', description: 'Sets Playwright locale (ex: en-US).' },
+                            timezoneId: { type: 'string', description: 'Sets Playwright timezoneId (IANA, ex: America/Los_Angeles).' },
+                            waitForSelector: { type: 'string', description: 'Wait for a selector after navigation.' },
+                            delay: { type: 'number', description: 'Extra delay (ms) after navigation.' },
+                            retries: { type: 'number', description: 'Retries for certain flow steps (0-10).' },
+                        },
+                    },
+                    test: { type: 'boolean', description: 'If true, reduces pagination (best-effort) for a quick smoke run.' },
                 },
                 required: ['task_id'],
             },
@@ -470,15 +569,13 @@ export function toolHandlers(api, maxChars) {
                 path: '/v1/agent/extractions/build',
                 body: {
                     url: input.url,
+                    ...(input.url_params ? { urlParams: input.url_params } : {}),
+                    ...(input.options ? { options: input.options } : {}),
                     access_profile_id: input.access_profile_id,
                     field_mode: effectiveFieldMode,
-                    build_mode: input.build_mode,
-                    ...(input.script_source ? { script_source: input.script_source } : {}),
-                    ...(input.recipe_override ? { recipe_override: input.recipe_override } : {}),
                     goal: input.goal,
                     fields: input.fields,
                     maxPreviewRows: input.max_preview_rows,
-                    advanced: input.advanced,
                     debug: false,
                 },
             });
@@ -523,41 +620,19 @@ export function toolHandlers(api, maxChars) {
             });
             return asTextResult(result, maxChars);
         },
-        task_repair: async (raw) => {
-            const input = ToolInputs.task_repair.parse(raw);
-            const result = await api.request({
-                method: 'POST',
-                path: `/v1/agent/tasks/${input.task_id}/repair`,
-                body: input.hint || input.force ? { hint: input.hint, force: input.force } : {},
-            });
-            return asTextResult(result, maxChars);
-        },
-        task_recipe_versions: async (raw) => {
-            const input = ToolInputs.task_recipe_versions.parse(raw);
-            const result = await api.request({
-                method: 'GET',
-                path: `/v1/tasks/${input.task_id}/recipe-versions`,
-            });
-            return asTextResult(result, maxChars);
-        },
-        task_set_recipe_version: async (raw) => {
-            const input = ToolInputs.task_set_recipe_version.parse(raw);
-            const result = await api.request({
-                method: 'POST',
-                path: `/v1/tasks/${input.task_id}/recipe-version`,
-                body: {
-                    recipe_version_id: input.recipe_version_id,
-                    version: input.version,
-                },
-            });
-            return asTextResult(result, maxChars);
-        },
         run_create: async (raw) => {
             const input = ToolInputs.run_create.parse(raw);
+            const body = {
+                ...(input.max_runtime_seconds ? { max_runtime_seconds: input.max_runtime_seconds } : {}),
+                ...(input.pagination ? { pagination: input.pagination } : {}),
+                ...(input.budgets ? { budgets: input.budgets } : {}),
+                ...(input.options ? { options: input.options } : {}),
+                ...(input.test === true ? { test: true } : {}),
+            };
             const result = await api.request({
                 method: 'POST',
                 path: `/v1/tasks/${input.task_id}/runs`,
-                body: input.max_runtime_seconds ? { max_runtime_seconds: input.max_runtime_seconds } : {},
+                body: Object.keys(body).length ? body : {},
             });
             return asTextResult(result, maxChars);
         },

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@selextract/mcp-selextract",
-  "version": "0.4.0",
+  "version": "0.5.1",
   "description": "Selextract Cloud MCP server (local stdio) for calling the Selextract Worker API",
   "license": "UNLICENSED",
   "type": "module",