npm - @selextract/mcp-selextract - Versions diffs - 0.4.0 - Mend

@selextract/mcp-selextract 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md ADDED Viewed

@@ -0,0 +1,63 @@
+# Selextract MCP Server (local stdio)
+This is a local `stdio` MCP server that lets MCP-capable clients call Selextract Cloud (the Worker API) via HTTP.
+If you can run `npx`, you can add it to your MCP client with one config entry.
+## Env
+- `SELEXTRACT_API_URL`
+  - Examples: `http://localhost:8246`, `https://api.selextract.com`, `https://api.selextract.com/api`
+  - This server normalizes it to end with `/api` (if you pass `/api/v1`, it will trim back to `/api`).
+- `SELEXTRACT_API_KEY` (your `sk_...` user API key)
+  - Legacy alias: `SELEXTRACT_API_TOKEN`
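+- Optional: `SELEXTRACT_TIMEOUT_MS` (default: `30000`) — per-request timeout for calls to the API, in milliseconds
+- Optional: `SELEXTRACT_MAX_RESPONSE_CHARS` (default: `30000`) — size limit for a tool response; longer JSON output is truncated
+- Optional: `SELEXTRACT_ENV_FILE` (default: `.env`) — path of the dotenv file loaded at startup (best-effort; a missing file is ignored)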
+## Run (dev)
+From `selextract-cloud/`:
+`pnpm --filter @selextract/mcp-selextract dev`
+## MCP config example (`mcp.json`)
+Add this to your MCP config (keep keys in env vars if possible):
+```json
+{
+  "mcpServers": {
+    "selextract-cloud": {
+      "command": "npx",
+      "args": ["-y", "--package", "@selextract/mcp-selextract", "mcp-selextract"],
+      "env": {
+        "SELEXTRACT_API_URL": "http://localhost:8246",
+        "SELEXTRACT_API_KEY": "sk_REPLACE_ME"
+      }
+    }
+  }
+}
+```
+## What you can do
+Typical flow:
+1. `task_create` → returns `preview_id`
+2. `task_build_status` → poll until status is `complete` (or `failed`)
+3. `task_publish` → turns the draft into a saved task (`task_id`)
+4. `run_create` → runs the task (`run_id`)
+5. `run_results` → reads results (paginated)
+Useful extras:
+- Draft cleanup: `task_draft_delete`
+- Task repair (self-healing rebuild): `task_repair`
+- Recipe versioning (rollback/switch): `task_recipe_versions`, `task_set_recipe_version`
+- Authenticated scraping: access profile tools (create/list/update/delete/build-session)
+- Run lifecycle: `run_get`, `run_list`, `run_stop`, `run_delete`
+## Resources (read-only)
+- `selextract://help` (usage guide)

package/dist/cli.js ADDED Viewed

@@ -0,0 +1,28 @@
+#!/usr/bin/env node
+import dotenv from 'dotenv';
+import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
+import { readEnv } from './config.js';
+import { SelextractApiClient } from './http.js';
+import { createSelextractMcpServer } from './server.js';
+async function main() {
+    // Best-effort .env loading for local dev; users can omit or override with SELEXTRACT_ENV_FILE.
+    const envFile = process.env.SELEXTRACT_ENV_FILE ?? '.env';
+    dotenv.config({ path: envFile });
+    const env = readEnv(process.env);
+    const api = new SelextractApiClient({
+        baseUrl: env.SELEXTRACT_API_URL,
+        apiKey: env.SELEXTRACT_API_KEY,
+        timeoutMs: env.SELEXTRACT_TIMEOUT_MS,
+    });
+    const server = createSelextractMcpServer({
+        api,
+        maxResponseChars: env.SELEXTRACT_MAX_RESPONSE_CHARS,
+    });
+    const transport = new StdioServerTransport();
+    await server.connect(transport);
+}
+main().catch((err) => {
+    // Stdio MCP servers should only write to stderr.
+    process.stderr.write(`${err instanceof Error ? err.stack ?? err.message : String(err)}\n`);
+    process.exit(1);
+});

package/dist/config.js ADDED Viewed

@@ -0,0 +1,40 @@
+import { z } from 'zod';
+const EnvSchema = z.object({
+    SELEXTRACT_API_URL: z.string().min(1),
+    SELEXTRACT_API_KEY: z.string().min(1).optional(),
+    // Legacy alias (kept for older configs).
+    SELEXTRACT_API_TOKEN: z.string().min(1).optional(),
+    SELEXTRACT_TIMEOUT_MS: z.coerce.number().int().positive().optional().default(30000),
+    SELEXTRACT_MAX_RESPONSE_CHARS: z.coerce.number().int().positive().optional().default(30000),
+    SELEXTRACT_ENV_FILE: z.string().optional().default('.env'),
+});
+function normalizeApiUrl(raw) {
+    const url = new URL(raw);
+    const pathSegments = url.pathname.split('/').filter(Boolean);
+    const apiIndex = pathSegments.indexOf('api');
+    if (apiIndex === -1) {
+        url.pathname = pathSegments.length === 0 ? '/api' : `/${pathSegments.join('/')}/api`;
+    }
+    else {
+        // If the user passes .../api/v1 (or anything after /api), normalize to .../api.
+        url.pathname = `/${pathSegments.slice(0, apiIndex + 1).join('/')}`;
+    }
+    return url.toString().replace(/\/+$/, '');
+}
+export function readEnv(processEnv) {
+    const parsed = EnvSchema.safeParse(processEnv);
+    if (!parsed.success) {
+        const issues = parsed.error.issues.map((i) => `${i.path.join('.')}: ${i.message}`).join('; ');
+        throw new Error(`Invalid environment: ${issues}`);
+    }
+    const { SELEXTRACT_API_KEY, SELEXTRACT_API_TOKEN, ...rest } = parsed.data;
+    const apiKey = SELEXTRACT_API_KEY ?? SELEXTRACT_API_TOKEN;
+    if (!apiKey) {
+        throw new Error('Invalid environment: SELEXTRACT_API_KEY (or legacy SELEXTRACT_API_TOKEN) is required');
+    }
+    return {
+        ...rest,
+        SELEXTRACT_API_KEY: apiKey,
+        SELEXTRACT_API_URL: normalizeApiUrl(rest.SELEXTRACT_API_URL),
+    };
+}

package/dist/http.js ADDED Viewed

@@ -0,0 +1,47 @@
+export class SelextractApiClient {
+    constructor(opts) {
+        this.baseUrl = new URL(opts.baseUrl.endsWith('/') ? opts.baseUrl : `${opts.baseUrl}/`);
+        this.apiKey = opts.apiKey.trim();
+        this.timeoutMs = opts.timeoutMs;
+        this.fetchImpl = opts.fetchImpl ?? fetch;
+    }
+    async request(opts) {
+        const url = new URL(opts.path.replace(/^\//, ''), this.baseUrl);
+        if (opts.query) {
+            for (const [key, value] of Object.entries(opts.query)) {
+                if (value === undefined)
+                    continue;
+                url.searchParams.set(key, String(value));
+            }
+        }
+        const controller = new AbortController();
+        const timeout = setTimeout(() => controller.abort(), this.timeoutMs);
+        try {
+            const resp = await this.fetchImpl(url.toString(), {
+                method: opts.method,
+                headers: {
+                    accept: 'application/json',
+                    'content-type': 'application/json',
+                    // Worker accepts Bearer JWT or `sk_...` api keys; it treats `sk_...` as an API key.
+                    authorization: `Bearer ${this.apiKey}`,
+                },
+                body: opts.body === undefined ? undefined : JSON.stringify(opts.body),
+                signal: controller.signal,
+            });
+            const contentType = resp.headers.get('content-type') ?? '';
+            const isJson = contentType.includes('application/json');
+            const payload = isJson ? await resp.json().catch(() => null) : await resp.text().catch(() => '');
+            if (!resp.ok) {
+                const message = typeof payload?.message === 'string'
+                    ? String(payload.message)
+                    : `Request failed (${resp.status})`;
+                const err = { status: resp.status, message, payload };
+                throw err;
+            }
+            return payload;
+        }
+        finally {
+            clearTimeout(timeout);
+        }
+    }
+}

package/dist/resources.js ADDED Viewed

@@ -0,0 +1,50 @@
+import { ErrorCode, McpError } from '@modelcontextprotocol/sdk/types.js';
+export function resourceDefinitions() {
+    return {
+        resources: [
+            {
+                uri: 'selextract://help',
+                name: 'Help',
+                description: 'Quick usage guide for this MCP server.',
+                mimeType: 'text/markdown',
+            },
+        ],
+        resourceTemplates: [],
+    };
+}
+export async function readResource(opts) {
+    let url;
+    try {
+        url = new URL(opts.uri);
+    }
+    catch {
+        throw new McpError(ErrorCode.InvalidParams, `Invalid resource URI: ${opts.uri}`);
+    }
+    if (url.protocol !== 'selextract:') {
+        throw new McpError(ErrorCode.InvalidParams, `Unsupported resource scheme: ${url.protocol}`);
+    }
+    const pathParts = url.pathname.replace(/\/+$/, '').split('/').filter(Boolean);
+    const parts = (url.hostname ? [url.hostname, ...pathParts] : pathParts).filter(Boolean);
+    if (parts.length === 1 && parts[0] === 'help') {
+        const value = [
+            '# Selextract MCP Server',
+            '',
+            'This MCP server exposes only **basic** operations for:',
+            '- Creating AI-built tasks (draft → publish)',
+            '- Repairing tasks (rebuild recipe)',
+            '- Switching task recipe versions (rollback)',
+            '- Running tasks (create/get/list/stop/delete runs)',
+            '- Managing access profiles (for logged-in/session scraping)',
+            '',
+            'It intentionally does **not** expose trace/scratchpad or other deep debugging data.',
+            '',
+            'Required env vars:',
+            '- SELEXTRACT_API_URL',
+            '- SELEXTRACT_API_KEY (or legacy SELEXTRACT_API_TOKEN)',
+        ].join('\n');
+        return {
+            contents: [{ uri: opts.uri, mimeType: 'text/markdown', text: value }],
+        };
+    }
+    throw new McpError(ErrorCode.MethodNotFound, `Unknown resource: ${opts.uri}`);
+}

package/dist/serialize.js ADDED Viewed

@@ -0,0 +1,7 @@
+export function jsonStringifyLimited(value, maxChars) {
+    const raw = JSON.stringify(value, null, 2);
+    if (raw.length <= maxChars)
+        return { text: raw, truncated: false };
+    const head = raw.slice(0, Math.max(0, maxChars - 40));
+    return { text: `${head}\n...TRUNCATED (${raw.length} chars total)`, truncated: true };
+}

package/dist/server.js ADDED Viewed

@@ -0,0 +1,62 @@
+import { Server } from '@modelcontextprotocol/sdk/server/index.js';
+import { CallToolRequestSchema, ErrorCode, ListResourceTemplatesRequestSchema, ListResourcesRequestSchema, ListToolsRequestSchema, McpError, ReadResourceRequestSchema, } from '@modelcontextprotocol/sdk/types.js';
+import { readFileSync } from 'node:fs';
+import { toolDefinitions, toolHandlers } from './tools.js';
+import { readResource, resourceDefinitions } from './resources.js';
+function readPackageVersion() {
+    try {
+        const pkgUrl = new URL('../package.json', import.meta.url);
+        const raw = readFileSync(pkgUrl, 'utf8');
+        const parsed = JSON.parse(raw);
+        return typeof parsed?.version === 'string' ? parsed.version : '0.0.0';
+    }
+    catch {
+        return '0.0.0';
+    }
+}
+export function createSelextractMcpServer(opts) {
+    const server = new Server({ name: 'selextract-cloud', version: readPackageVersion() }, {
+        capabilities: {
+            tools: {},
+            resources: {},
+        },
+    });
+    const defs = toolDefinitions();
+    const handlers = toolHandlers(opts.api, opts.maxResponseChars);
+    const resources = resourceDefinitions();
+    server.setRequestHandler(ListToolsRequestSchema, async () => {
+        return {
+            tools: defs.map((tool) => ({
+                name: tool.name,
+                description: tool.description,
+                inputSchema: tool.inputSchema,
+            })),
+        };
+    });
+    server.setRequestHandler(ListResourcesRequestSchema, async () => {
+        return { resources: resources.resources };
+    });
+    server.setRequestHandler(ListResourceTemplatesRequestSchema, async () => {
+        return { resourceTemplates: resources.resourceTemplates };
+    });
+    server.setRequestHandler(ReadResourceRequestSchema, async (request) => {
+        const uri = String(request.params?.uri ?? '');
+        if (!uri) {
+            throw new McpError(ErrorCode.InvalidParams, 'Missing uri');
+        }
+        return await readResource({
+            api: opts.api,
+            maxResponseChars: opts.maxResponseChars,
+            uri,
+        });
+    });
+    server.setRequestHandler(CallToolRequestSchema, async (request) => {
+        const name = String(request.params?.name ?? '');
+        const args = request.params?.arguments;
+        if (!name || !(name in handlers)) {
+            throw new McpError(ErrorCode.MethodNotFound, `Unknown tool: ${name || '(empty)'}`);
+        }
+        return await handlers[name](args);
+    });
+    return server;
+}

package/dist/tools.js ADDED Viewed

@@ -0,0 +1,664 @@
+import { z } from 'zod';
+import { jsonStringifyLimited } from './serialize.js';
+function asTextResult(value, maxChars) {
+    const { text, truncated } = jsonStringifyLimited(value, maxChars);
+    return {
+        content: [
+            {
+                type: 'text',
+                text: truncated ? `${text}\n\nNote: Increase SELEXTRACT_MAX_RESPONSE_CHARS to see more.` : text,
+            },
+        ],
+    };
+}
+const FieldSchema = z
+    .object({
+    name: z.string().min(1).optional(),
+    value: z.string().min(1).optional(),
+    type: z.string().optional(),
+    required: z.boolean().optional(),
+    examples: z.array(z.string()).optional(),
+})
+    .refine((data) => Boolean(data.name?.trim() || data.value?.trim()), {
+    message: 'Provide a field name (name) or a short description (value).',
+    path: ['value'],
+});
+const TaskCreateInputSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        access_profile_id: value.access_profile_id ?? value.accessProfileId,
+        field_mode: value.field_mode ?? value.fieldMode ?? value.mode,
+        build_mode: value.build_mode ?? value.buildMode,
+        script_source: value.script_source ?? value.scriptSource,
+        recipe_override: value.recipe_override ?? value.recipeOverride,
+        max_preview_rows: value.max_preview_rows ?? value.maxPreviewRows,
+        goal: value.goal ?? value.description,
+    };
+}, z
+    .object({
+    url: z.string().url(),
+    access_profile_id: z.string().uuid().optional(),
+    field_mode: z.enum(['auto', 'manual']).optional(),
+    build_mode: z.enum(['selectors', 'flow', 'code', 'auto']).optional().default('auto'),
+    script_source: z.string().min(1).optional(),
+    recipe_override: z.any().optional(),
+    goal: z.string().min(1).optional(),
+    fields: z.array(FieldSchema).optional(),
+    max_preview_rows: z.number().int().positive().optional().default(10),
+    advanced: z.boolean().optional().default(false),
+})
+    .superRefine((data, ctx) => {
+    const effectiveFieldMode = data.field_mode ?? (data.fields?.length ? 'manual' : 'auto');
+    if (effectiveFieldMode === 'manual' && (!data.fields || data.fields.length === 0)) {
+        ctx.addIssue({
+            code: z.ZodIssueCode.custom,
+            message: 'In manual mode, fields must be provided.',
+            path: ['fields'],
+        });
+    }
+}));
+const TaskBuildStatusInputSchema = z.object({
+    preview_id: z.string().uuid(),
+});
+const TaskPublishInputSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        access_profile_id: value.access_profile_id ?? value.accessProfileId,
+    };
+}, z.object({
+    preview_id: z.string().uuid(),
+    name: z.string().min(1).max(255),
+    access_profile_id: z.string().uuid().optional(),
+}));
+const TaskDraftDeleteInputSchema = z.object({
+    preview_id: z.string().uuid(),
+});
+const TaskRepairInputSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        hint: value.hint ?? value.what_is_wrong ?? value.whatIsWrong ?? value.issue,
+    };
+}, z
+    .object({
+    task_id: z.string().uuid(),
+    hint: z.string().min(1).max(2000).optional(),
+    force: z.boolean().optional().default(false),
+})
+    .passthrough());
+const TaskRecipeVersionsInputSchema = z.object({
+    task_id: z.string().uuid(),
+});
+const TaskSetRecipeVersionInputSchema = z
+    .object({
+    task_id: z.string().uuid(),
+    recipe_version_id: z.string().uuid().optional(),
+    version: z.coerce.number().int().positive().optional(),
+})
+    .superRefine((data, ctx) => {
+    if (!data.recipe_version_id && !data.version) {
+        ctx.addIssue({
+            code: z.ZodIssueCode.custom,
+            message: 'Provide recipe_version_id or version.',
+            path: ['recipe_version_id'],
+        });
+    }
+});
+const AccessProfileHeaderSchema = z.object({
+    name: z.string().min(1),
+    value: z.string().min(1),
+});
+const AccessProfileInputSchema = z.object({
+    key: z.string().min(1),
+    label: z.string().min(1).optional(),
+    type: z.enum(['text', 'password']).optional(),
+    value: z.string().min(1),
+});
+const AccessProfileCreateInputSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    const login = value.login && typeof value.login === 'object' ? value.login : undefined;
+    return {
+        ...value,
+        canonical_domain: value.canonical_domain ?? value.canonicalDomain,
+        storage_state: value.storage_state ?? value.storageState,
+        login: login
+            ? {
+                ...login,
+                start_url: login.start_url ?? login.startUrl,
+            }
+            : value.login,
+    };
+}, z.object({
+    name: z.string().min(1).max(255),
+    url: z.string().url().optional(),
+    canonical_domain: z.string().min(1).optional(),
+    kind: z.enum(['custom', 'credentials', 'session']).optional(),
+    headers: z.array(AccessProfileHeaderSchema).optional(),
+    inputs: z.array(AccessProfileInputSchema).optional(),
+    storage_state: z.unknown().optional(),
+    login: z
+        .object({
+        start_url: z.string().url().optional(),
+        hint: z.string().min(1).optional(),
+    })
+        .optional(),
+}));
+const AccessProfileUpdateInputSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    if (value.patch && typeof value.patch === 'object')
+        return value;
+    const { access_profile_id, ...patch } = value;
+    return { access_profile_id, patch };
+}, z.object({
+    access_profile_id: z.string().uuid(),
+    patch: z.record(z.unknown()).optional().default({}),
+}));
+const AccessProfileBuildSessionInputSchema = z.preprocess((raw) => {
+    if (!raw || typeof raw !== 'object')
+        return raw;
+    const value = raw;
+    return {
+        ...value,
+        start_url: value.start_url ?? value.startUrl,
+    };
+}, z.object({
+    access_profile_id: z.string().uuid(),
+    start_url: z.string().url().optional(),
+    hint: z.string().min(1).optional(),
+}));
+export const ToolInputs = {
+    health: z.object({}),
+    // Tasks (AI-built)
+    task_create: TaskCreateInputSchema,
+    task_build_status: TaskBuildStatusInputSchema,
+    task_publish: TaskPublishInputSchema,
+    task_draft_delete: TaskDraftDeleteInputSchema,
+    task_repair: TaskRepairInputSchema,
+    task_recipe_versions: TaskRecipeVersionsInputSchema,
+    task_set_recipe_version: TaskSetRecipeVersionInputSchema,
+    // Runs
+    run_create: z.object({
+        task_id: z.string().uuid(),
+        max_runtime_seconds: z.number().int().min(30).max(3600).optional(),
+    }),
+    run_get: z.object({
+        run_id: z.string().uuid(),
+    }),
+    run_list: z.object({
+        task_id: z.string().uuid(),
+        limit: z.number().int().min(1).max(100).optional().default(20),
+    }),
+    run_stop: z.object({
+        run_id: z.string().uuid(),
+    }),
+    run_delete: z.object({
+        run_id: z.string().uuid(),
+    }),
+    run_results: z.object({
+        run_id: z.string().uuid(),
+        limit: z.number().int().min(1).max(1000).optional().default(100),
+        cursor: z.string().optional(),
+        offset: z.number().int().min(0).optional(),
+    }),
+    // Access profiles (authenticated scraping)
+    access_profile_list: z.object({}),
+    access_profile_get: z.object({
+        access_profile_id: z.string().uuid(),
+    }),
+    access_profile_create: AccessProfileCreateInputSchema,
+    access_profile_update: AccessProfileUpdateInputSchema,
+    access_profile_delete: z.object({
+        access_profile_id: z.string().uuid(),
+    }),
+    access_profile_build_session: AccessProfileBuildSessionInputSchema,
+};
+export function toolDefinitions() {
+    return [
+        {
+            name: 'health',
+            description: 'Health check for the Selextract Worker API.',
+            inputSchema: { type: 'object', properties: {}, required: [] },
+        },
+        // Tasks (AI-built)
+        {
+            name: 'task_create',
+            description: 'Create an AI-built task draft for a URL. Returns preview_id. This MCP server does not expose trace/scratchpad.',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    url: { type: 'string', description: 'The page to analyze.' },
+                    access_profile_id: { type: 'string', description: 'Optional access profile ID for logged-in/session scraping.' },
+                    field_mode: { type: 'string', enum: ['auto', 'manual'], description: 'auto = infer fields; manual = use provided fields.' },
+                    build_mode: { type: 'string', enum: ['selectors', 'flow', 'code', 'auto'], description: 'How to build the draft (default: auto).' },
+                    script_source: { type: 'string', description: 'Optional Playwright script source (code mode). If provided, the server previews this exact script.' },
+                    recipe_override: { type: 'object', description: 'Optional base recipe to reuse when previewing custom scripts (keeps flow/dom settings).' },
+                    goal: { type: 'string', description: 'Optional short description of what to extract (helps in auto mode).' },
+                    fields: { type: 'array', items: { type: 'object' }, description: 'Fields to extract (required in manual mode).' },
+                    max_preview_rows: { type: 'number', description: 'How many sample rows to generate in the preview (default: 10).' },
+                    advanced: { type: 'boolean', description: 'Allow more complex extraction strategies (may take longer).' },
+                },
+                required: ['url'],
+            },
+        },
+        {
+            name: 'task_build_status',
+            description: 'Get high-level build status for a task draft (queued/building/complete/failed). Returns only status + timing + error (no recipe, no notes).',
+            inputSchema: {
+                type: 'object',
+                properties: { preview_id: { type: 'string', description: 'The preview_id returned by task_create.' } },
+                required: ['preview_id'],
+            },
+        },
+        {
+            name: 'task_publish',
+            description: 'Publish a completed task draft as a saved task. Returns task_id. If you pass access_profile_id here, that access profile is attached to the saved task and used for runs.',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    preview_id: { type: 'string', description: 'The preview_id to publish.' },
+                    name: { type: 'string', description: 'Name for the saved task.' },
+                    access_profile_id: { type: 'string', description: 'Optional access profile to attach to the saved task.' },
+                },
+                required: ['preview_id', 'name'],
+            },
+        },
+        {
+            name: 'task_draft_delete',
+            description: 'Delete a task draft by preview_id (frees a draft quota slot).',
+            inputSchema: {
+                type: 'object',
+                properties: { preview_id: { type: 'string', description: 'The preview_id to delete.' } },
+                required: ['preview_id'],
+            },
+        },
+        {
+            name: 'task_repair',
+            description: 'Repair a saved task by re-running the agent builder (creates a new recipe version and updates the task in place). Returns build_job_id.',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    task_id: { type: 'string', description: 'Task ID.' },
+                    hint: { type: 'string', description: 'Optional note about what is wrong (helps steer the repair).' },
+                    force: { type: 'boolean', description: 'If true, queue even if a build is already in progress.' },
+                },
+                required: ['task_id'],
+            },
+        },
+        {
+            name: 'task_recipe_versions',
+            description: 'List recipe versions for a task (for rollback/version switching).',
+            inputSchema: {
+                type: 'object',
+                properties: { task_id: { type: 'string', description: 'Task ID.' } },
+                required: ['task_id'],
+            },
+        },
+        {
+            name: 'task_set_recipe_version',
+            description: 'Switch a task to use a specific recipe version (by recipe_version_id or by version number).',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    task_id: { type: 'string', description: 'Task ID.' },
+                    recipe_version_id: { type: 'string', description: 'Recipe version ID.' },
+                    version: { type: 'number', description: 'Recipe version number (1, 2, 3, ...).' },
+                },
+                required: ['task_id'],
+            },
+        },
+        // Runs
+        {
+            name: 'run_create',
+            description: 'Create and enqueue a run for a saved task. Returns run_id. Runs use the access_profile_id attached to the task (if any).',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    task_id: { type: 'string', description: 'Task ID.' },
+                    max_runtime_seconds: { type: 'number', description: 'Optional hard limit for run time (seconds).' },
+                },
+                required: ['task_id'],
+            },
+        },
+        {
+            name: 'run_get',
+            description: 'Get run status/metadata by run_id.',
+            inputSchema: {
+                type: 'object',
+                properties: { run_id: { type: 'string', description: 'Run ID.' } },
+                required: ['run_id'],
+            },
+        },
+        {
+            name: 'run_list',
+            description: 'List recent runs for a task.',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    task_id: { type: 'string', description: 'Task ID.' },
+                    limit: { type: 'number', description: 'Max runs to return (default: 20).' },
+                },
+                required: ['task_id'],
+            },
+        },
+        {
+            name: 'run_stop',
+            description: 'Request a run stop (best-effort).',
+            inputSchema: {
+                type: 'object',
+                properties: { run_id: { type: 'string', description: 'Run ID.' } },
+                required: ['run_id'],
+            },
+        },
+        {
+            name: 'run_delete',
+            description: 'Soft-delete (hide) a finalized run.',
+            inputSchema: {
+                type: 'object',
+                properties: { run_id: { type: 'string', description: 'Run ID.' } },
+                required: ['run_id'],
+            },
+        },
+        {
+            name: 'run_results',
+            description: 'Fetch results for a completed run (paginated). Use cursor for the next page; use limit to control page size.',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    run_id: { type: 'string', description: 'Run ID (must be completed).' },
+                    limit: { type: 'number', description: 'Rows per page (1-1000, default: 100).' },
+                    cursor: { type: 'string', description: 'Cursor from the previous page (recommended).' },
+                    offset: { type: 'number', description: 'Optional offset-based paging (simple, but less stable than cursor).' },
+                },
+                required: ['run_id'],
+            },
+        },
+        // Access profiles
+        {
+            name: 'access_profile_list',
+            description: 'List access profiles (used for logged-in/session scraping).',
+            inputSchema: { type: 'object', properties: {}, required: [] },
+        },
+        {
+            name: 'access_profile_get',
+            description: 'Fetch one access profile by access_profile_id.',
+            inputSchema: {
+                type: 'object',
+                properties: { access_profile_id: { type: 'string', description: 'Access profile ID.' } },
+                required: ['access_profile_id'],
+            },
+        },
+        {
+            name: 'access_profile_create',
+            description: 'Create an access profile. Secrets (headers/credentials/storage_state) are stored server-side; responses do not include secrets.',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    name: { type: 'string', description: 'Access profile name.' },
+                    url: { type: 'string', description: 'Optional URL this profile is for.' },
+                    canonical_domain: { type: 'string', description: 'Optional canonical domain (advanced).' },
+                    kind: { type: 'string', enum: ['custom', 'credentials', 'session'] },
+                    headers: { type: 'array', items: { type: 'object' }, description: 'Optional extra headers (values stored server-side).' },
+                    inputs: { type: 'array', items: { type: 'object' }, description: 'Optional credential inputs (values stored server-side).' },
+                    storage_state: { type: 'object', description: 'Optional Playwright storage_state JSON (stored server-side).' },
+                    login: { type: 'object', description: 'Optional login helper config.' },
+                },
+                required: ['name'],
+            },
+        },
+        {
+            name: 'access_profile_update',
+            description: 'Update an access profile (partial update).',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    access_profile_id: { type: 'string', description: 'Access profile ID.' },
+                    patch: { type: 'object', description: 'Partial update object.' },
+                },
+                required: ['access_profile_id'],
+            },
+        },
+        {
+            name: 'access_profile_delete',
+            description: 'Delete an access profile.',
+            inputSchema: {
+                type: 'object',
+                properties: { access_profile_id: { type: 'string', description: 'Access profile ID.' } },
+                required: ['access_profile_id'],
+            },
+        },
+        {
+            name: 'access_profile_build_session',
+            description: 'Enqueue an AI-assisted session build for an access profile (if enabled on the server).',
+            inputSchema: {
+                type: 'object',
+                properties: {
+                    access_profile_id: { type: 'string', description: 'Access profile ID.' },
+                    start_url: { type: 'string', description: 'Optional URL to start the login flow from.' },
+                    hint: { type: 'string', description: 'Optional note to help the agent log in.' },
+                },
+                required: ['access_profile_id'],
+            },
+        },
+    ];
+}
+export function toolHandlers(api, maxChars) {
+    const handlers = {
+        health: async () => {
+            const result = await api.request({
+                method: 'GET',
+                path: '/health',
+            });
+            return asTextResult(result, maxChars);
+        },
+        task_create: async (raw) => {
+            const input = ToolInputs.task_create.parse(raw);
+            const effectiveFieldMode = input.field_mode ?? (input.fields?.length ? 'manual' : 'auto');
+            const result = await api.request({
+                method: 'POST',
+                path: '/v1/agent/extractions/build',
+                body: {
+                    url: input.url,
+                    access_profile_id: input.access_profile_id,
+                    field_mode: effectiveFieldMode,
+                    build_mode: input.build_mode,
+                    ...(input.script_source ? { script_source: input.script_source } : {}),
+                    ...(input.recipe_override ? { recipe_override: input.recipe_override } : {}),
+                    goal: input.goal,
+                    fields: input.fields,
+                    maxPreviewRows: input.max_preview_rows,
+                    advanced: input.advanced,
+                    debug: false,
+                },
+            });
+            const previewId = typeof result?.preview_id === 'string' ? String(result.preview_id) : null;
+            return asTextResult(previewId ? { preview_id: previewId } : result, maxChars);
+        },
+        task_build_status: async (raw) => {
+            const input = ToolInputs.task_build_status.parse(raw);
+            const result = await api.request({
+                method: 'GET',
+                path: `/v1/agent/extractions/preview/${input.preview_id}`,
+            });
+            const safe = {
+                preview_id: result?.preview_id ?? input.preview_id,
+                status: result?.status ?? null,
+                error: result?.error ?? null,
+                queued_at: result?.queued_at ?? null,
+                started_at: result?.started_at ?? null,
+                finished_at: result?.finished_at ?? null,
+                duration_ms: result?.duration_ms ?? null,
+            };
+            return asTextResult(safe, maxChars);
+        },
+        task_publish: async (raw) => {
+            const input = ToolInputs.task_publish.parse(raw);
+            const result = await api.request({
+                method: 'POST',
+                path: `/v1/agent/drafts/${input.preview_id}/promote`,
+                body: {
+                    name: input.name,
+                    access_profile_id: input.access_profile_id,
+                },
+            });
+            const taskId = typeof result?.task_id === 'string' ? String(result.task_id) : null;
+            return asTextResult(taskId ? { task_id: taskId } : result, maxChars);
+        },
+        task_draft_delete: async (raw) => {
+            const input = ToolInputs.task_draft_delete.parse(raw);
+            const result = await api.request({
+                method: 'DELETE',
+                path: `/v1/agent/drafts/${input.preview_id}`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        task_repair: async (raw) => {
+            const input = ToolInputs.task_repair.parse(raw);
+            const result = await api.request({
+                method: 'POST',
+                path: `/v1/agent/tasks/${input.task_id}/repair`,
+                body: input.hint || input.force ? { hint: input.hint, force: input.force } : {},
+            });
+            return asTextResult(result, maxChars);
+        },
+        task_recipe_versions: async (raw) => {
+            const input = ToolInputs.task_recipe_versions.parse(raw);
+            const result = await api.request({
+                method: 'GET',
+                path: `/v1/tasks/${input.task_id}/recipe-versions`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        task_set_recipe_version: async (raw) => {
+            const input = ToolInputs.task_set_recipe_version.parse(raw);
+            const result = await api.request({
+                method: 'POST',
+                path: `/v1/tasks/${input.task_id}/recipe-version`,
+                body: {
+                    recipe_version_id: input.recipe_version_id,
+                    version: input.version,
+                },
+            });
+            return asTextResult(result, maxChars);
+        },
+        run_create: async (raw) => {
+            const input = ToolInputs.run_create.parse(raw);
+            const result = await api.request({
+                method: 'POST',
+                path: `/v1/tasks/${input.task_id}/runs`,
+                body: input.max_runtime_seconds ? { max_runtime_seconds: input.max_runtime_seconds } : {},
+            });
+            return asTextResult(result, maxChars);
+        },
+        run_get: async (raw) => {
+            const input = ToolInputs.run_get.parse(raw);
+            const result = await api.request({
+                method: 'GET',
+                path: `/v1/task-runs/${input.run_id}`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        run_list: async (raw) => {
+            const input = ToolInputs.run_list.parse(raw);
+            const result = await api.request({
+                method: 'GET',
+                path: `/v1/tasks/${input.task_id}/runs`,
+                query: { limit: input.limit },
+            });
+            return asTextResult(result, maxChars);
+        },
+        run_stop: async (raw) => {
+            const input = ToolInputs.run_stop.parse(raw);
+            const result = await api.request({
+                method: 'POST',
+                path: `/v1/task-runs/${input.run_id}/stop`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        run_delete: async (raw) => {
+            const input = ToolInputs.run_delete.parse(raw);
+            const result = await api.request({
+                method: 'DELETE',
+                path: `/v1/task-runs/${input.run_id}`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        run_results: async (raw) => {
+            const input = ToolInputs.run_results.parse(raw);
+            const result = await api.request({
+                method: 'GET',
+                path: '/results/data',
+                query: {
+                    run_id: input.run_id,
+                    limit: input.limit,
+                    cursor: input.cursor,
+                    offset: input.offset,
+                },
+            });
+            return asTextResult(result, maxChars);
+        },
+        access_profile_list: async () => {
+            const result = await api.request({
+                method: 'GET',
+                path: '/v1/access-profiles',
+            });
+            return asTextResult(result, maxChars);
+        },
+        access_profile_get: async (raw) => {
+            const input = ToolInputs.access_profile_get.parse(raw);
+            const result = await api.request({
+                method: 'GET',
+                path: `/v1/access-profiles/${input.access_profile_id}`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        access_profile_create: async (raw) => {
+            const input = ToolInputs.access_profile_create.parse(raw);
+            const result = await api.request({
+                method: 'POST',
+                path: '/v1/access-profiles',
+                body: input,
+            });
+            return asTextResult(result, maxChars);
+        },
+        access_profile_update: async (raw) => {
+            const input = ToolInputs.access_profile_update.parse(raw);
+            const result = await api.request({
+                method: 'PATCH',
+                path: `/v1/access-profiles/${input.access_profile_id}`,
+                body: input.patch,
+            });
+            return asTextResult(result, maxChars);
+        },
+        access_profile_delete: async (raw) => {
+            const input = ToolInputs.access_profile_delete.parse(raw);
+            const result = await api.request({
+                method: 'DELETE',
+                path: `/v1/access-profiles/${input.access_profile_id}`,
+            });
+            return asTextResult(result, maxChars);
+        },
+        access_profile_build_session: async (raw) => {
+            const input = ToolInputs.access_profile_build_session.parse(raw);
+            const body = input.start_url || input.hint ? { start_url: input.start_url, hint: input.hint } : undefined;
+            const result = await api.request({
+                method: 'POST',
+                path: `/v1/access-profiles/${input.access_profile_id}/build-session`,
+                body,
+            });
+            return asTextResult(result, maxChars);
+        },
+    };
+    return handlers;
+}

package/package.json ADDED Viewed

@@ -0,0 +1,45 @@
+{
+  "name": "@selextract/mcp-selextract",
+  "version": "0.4.0",
+  "description": "Selextract Cloud MCP server (local stdio) for calling the Selextract Worker API",
+  "license": "UNLICENSED",
+  "type": "module",
+  "files": [
+    "dist",
+    "README.md"
+  ],
+  "keywords": [
+    "mcp",
+    "model-context-protocol",
+    "selextract",
+    "scraping",
+    "web-scraping"
+  ],
+  "publishConfig": {
+    "access": "public"
+  },
+  "bin": {
+    "selextract-mcp": "dist/cli.js",
+    "mcp-selextract": "dist/cli.js"
+  },
+  "scripts": {
+    "dev": "tsx src/cli.ts",
+    "build": "rm -rf dist && tsc",
+    "prepublishOnly": "npm run build",
+    "test": "vitest run",
+    "type-check": "tsc --noEmit"
+  },
+  "dependencies": {
+    "@modelcontextprotocol/sdk": "^1.0.0",
+    "dotenv": "^16.3.1",
+    "zod": "^3.22.4"
+  },
+  "devDependencies": {
+    "@types/node": "^20.8.0",
+    "tsx": "^3.14.0",
+    "typescript": "^5.2.0"
+  },
+  "engines": {
+    "node": ">=18.0.0"
+  }
+}