npm - autoform-mcp-server - Versions diffs - 1.4.0 → 1.5.0 - Mend

autoform-mcp-server 1.4.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.js +6 -2
package/dist/services/pdfTextExtractor.d.ts +87 -0
package/dist/services/pdfTextExtractor.js +224 -0
package/dist/tools/analyzeStaticPdf.d.ts +27 -0
package/dist/tools/analyzeStaticPdf.js +157 -0
package/dist/tools/detectFields.js +97 -20
package/package.json +4 -3

package/dist/index.js CHANGED Viewed

@@ -7,17 +7,19 @@ import { handleFillPdf, fillPdfSchema } from './tools/fillPdf.js';
 import { handleFillAtCoordinates, fillAtCoordinatesSchema, handleGetPdfInfo, getPdfInfoSchema } from './tools/fillAtCoordinates.js';
 import { handleFillBatchAtCoordinates, fillBatchAtCoordinatesSchema } from './tools/fillBatchAtCoordinates.js';
 import { handleFillBatchAcroForm, fillBatchAcroFormSchema } from './tools/fillBatchAcroForm.js';
+import { handleAnalyzeStaticPdf, analyzeStaticPdfSchema } from './tools/analyzeStaticPdf.js';
 import { handleSaveCoordinatesAsTemplate, saveCoordinatesAsTemplateSchema } from './tools/saveCoordinatesAsTemplate.js';
 import { handleGenerateBatch, generateBatchSchema } from './tools/generateBatch.js';
 import { handleListTemplates, listTemplatesSchema } from './tools/manageTemplates.js';
 import { handleImportTemplate, importTemplateSchema } from './tools/importTemplate.js';
-const server = new Server({ name: 'autoform', version: '1.4.0' }, { capabilities: { tools: {} } });
+const server = new Server({ name: 'autoform', version: '1.5.0' }, { capabilities: { tools: {} } });
 // Register all tools
 server.setRequestHandler(ListToolsRequestSchema, async () => ({
     tools: [
         // Info & detection
         getPdfInfoSchema,
         detectFieldsSchema,
+        analyzeStaticPdfSchema,
         // Single fill
         fillPdfSchema,
         fillAtCoordinatesSchema,
@@ -40,6 +42,8 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
                 return await handleGetPdfInfo(args);
             case 'autoform_detect_fields':
                 return await handleDetectFields(args);
+            case 'autoform_analyze_static_pdf':
+                return await handleAnalyzeStaticPdf(args);
             case 'autoform_fill_pdf':
                 return await handleFillPdf(args);
             case 'autoform_fill_at_coordinates':
@@ -70,4 +74,4 @@ server.setRequestHandler(CallToolRequestSchema, async (request) => {
 // Start server
 const transport = new StdioServerTransport();
 await server.connect(transport);
-console.error('[AutoForm MCP] Server v1.4.0 started — 10 tools registered');
+console.error('[AutoForm MCP] Server v1.5.0 started — 11 tools registered');

package/dist/services/pdfTextExtractor.d.ts ADDED Viewed

@@ -0,0 +1,87 @@
+/**
+ * Position of a text element in PDF coordinates (origin: bottom-left).
+ * Built by PdfTextExtractor.extractAllText, one per non-empty text run.
+ */
+export interface TextItem {
+    /** Text of the run with leading/trailing whitespace trimmed. */
+    text: string;
+    /** Horizontal translation component of the run's transform matrix. */
+    x: number;
+    /** Vertical translation component of the transform matrix (the extractor treats it as the baseline). */
+    y: number;
+    /** Absolute value of the width reported by pdfjs, or 0 when none is reported. */
+    width: number;
+    /** Absolute value of the height reported by pdfjs; falls back to fontSize when none is reported. */
+    height: number;
+    /** Absolute value of the vertical scale component of the transform matrix. */
+    fontSize: number;
+    /** 1-based page number the run was found on. */
+    page: number;
+}
+/**
+ * Horizontal line detected as a drawing operator.
+ * NOTE(review): nothing in the visible extractor code constructs or consumes this type —
+ * confirm whether line detection is implemented elsewhere or is still pending.
+ */
+export interface HorizontalLine {
+    x: number;
+    y: number;
+    width: number;
+    page: number;
+}
+/**
+ * A page's extracted information.
+ */
+export interface PageTextInfo {
+    /** 1-based page number. */
+    page: number;
+    /** Page width taken from the pdfjs viewport at scale 1. */
+    width: number;
+    /** Page height taken from the pdfjs viewport at scale 1. */
+    height: number;
+    /** Non-empty text runs found on the page, in the order pdfjs reported them. */
+    textItems: TextItem[];
+    /** Sum of the trimmed text lengths of all items in textItems. */
+    totalCharacters: number;
+}
+/**
+ * Classifies a PDF based on its extractable content.
+ * PdfTextExtractor.classifyContent only ever returns 'text_vectorial', 'mixed' or 'image_only';
+ * it never produces 'acroform'.
+ */
+export type PdfContentType = 'acroform' | 'text_vectorial' | 'image_only' | 'mixed';
+/**
+ * Extracts text content from a PDF using pdfjs-dist in Node.js-compatible mode.
+ * No native dependencies, works entirely in pure JavaScript.
+ * All members are static.
+ */
+export declare class PdfTextExtractor {
+    /**
+     * Extract all text items from all pages of a PDF.
+     * The file at pdfPath is read from disk; read and parse errors are not caught here.
+     */
+    static extractAllText(pdfPath: string): Promise<PageTextInfo[]>;
+    /**
+     * Classify a PDF based on extractable text density.
+     * Thresholds are empirical; can be tuned.
+     * An empty page list is classified as 'image_only'.
+     */
+    static classifyContent(pages: PageTextInfo[]): PdfContentType;
+    /**
+     * For a given point (x, y) on a page, find the text items within a certain distance.
+     * Distance is computed in PDF points (not pixels), from the point to the item's center.
+     * maxDistance defaults to 100 when omitted.
+     * relativePosition describes where the text item lies relative to the target point.
+     * Returns items sorted by distance (closest first).
+     */
+    static findNearbyText(pageInfo: PageTextInfo, targetX: number, targetY: number, maxDistance?: number): Array<TextItem & {
+        distance: number;
+        relativePosition: 'left' | 'right' | 'above' | 'below' | 'inside';
+    }>;
+    /**
+     * Find the most likely label for a field at a given position.
+     * Prefers text that:
+     * - Is to the LEFT of the field (same Y level, typical "Label: ___")
+     * - Is ABOVE the field (stacked layouts)
+     * - Ends with ':' (explicit label)
+     * - Is short (the implementation rewards texts of 1-4 words)
+     *
+     * label is null (with 'low' confidence) when no text lies within 150 points of the
+     * field's left edge at mid-height. The returned label has any trailing ':' removed.
+     * nearbyText holds up to 5 candidate texts, best score first.
+     */
+    static inferLabelForField(pageInfo: PageTextInfo, fieldX: number, fieldY: number, fieldWidth: number, fieldHeight: number): {
+        label: string | null;
+        confidence: 'high' | 'medium' | 'low';
+        nearbyText: string[];
+    };
+    /**
+     * Detect probable field locations in a static PDF (no AcroForm).
+     * Heuristics:
+     * 1. Text ending in ':' followed by horizontal space → likely label + input area
+     * 2. Text that looks like a label (short, ends with ':') positioned above a gap
+     *
+     * Suggested coordinates and sizes are rounded to whole PDF points.
+     */
+    static detectStaticFieldCandidates(pages: PageTextInfo[]): Array<{
+        page: number;
+        inferredLabel: string;
+        hintFromText: string;
+        suggestedX: number;
+        suggestedY: number;
+        suggestedWidth: number;
+        suggestedHeight: number;
+        confidence: 'high' | 'medium' | 'low';
+        reason: string;
+    }>;
+}

package/dist/services/pdfTextExtractor.js ADDED Viewed

@@ -0,0 +1,224 @@
+import * as fs from 'fs';
+// Use legacy build — works in Node without DOM/canvas
+// @ts-ignore — pdfjs-dist types are incomplete for this path
+import * as pdfjsLib from 'pdfjs-dist/legacy/build/pdf.mjs';
+/**
+ * Extracts text content from a PDF using pdfjs-dist in Node.js-compatible mode.
+ * No native dependencies, works entirely in pure JavaScript.
+ * All members are static; positions are in PDF points with a bottom-left origin.
+ */
+export class PdfTextExtractor {
+    /**
+     * Extract all non-empty text items from all pages of a PDF.
+     * @param pdfPath Path of the PDF file; it is read synchronously from disk.
+     * @returns One entry per page (1-based `page`) with the page size at scale 1.
+     * @throws Propagates any error raised while reading the file or parsing it with pdfjs.
+     */
+    static async extractAllText(pdfPath) {
+        const bytes = fs.readFileSync(pdfPath);
+        const loadingTask = pdfjsLib.getDocument({
+            data: new Uint8Array(bytes),
+            useSystemFonts: true,
+            // Silence pdfjs warnings on stderr when running in Node
+            verbosity: 0
+        });
+        try {
+            const pdf = await loadingTask.promise;
+            const pages = [];
+            for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
+                const page = await pdf.getPage(pageNum);
+                const viewport = page.getViewport({ scale: 1 });
+                const textContent = await page.getTextContent();
+                const items = [];
+                let totalChars = 0;
+                for (const raw of textContent.items) {
+                    const text = String(raw.str ?? '').trim();
+                    if (text === '')
+                        continue;
+                    // transform = [scaleX, skewX, skewY, scaleY, translateX, translateY]
+                    const [, , , scaleY, tx, ty] = raw.transform;
+                    const fontSize = Math.abs(scaleY);
+                    // pdfjs gives us the baseline; approximate the bounding box
+                    // Width is already computed by pdfjs; height ≈ fontSize
+                    const width = Math.abs(raw.width ?? 0);
+                    const height = Math.abs(raw.height ?? fontSize);
+                    items.push({
+                        text,
+                        x: tx,
+                        y: ty, // already in PDF coordinate system (bottom-left origin)
+                        width,
+                        height,
+                        fontSize,
+                        page: pageNum
+                    });
+                    totalChars += text.length;
+                }
+                pages.push({
+                    page: pageNum,
+                    width: viewport.width,
+                    height: viewport.height,
+                    textItems: items,
+                    totalCharacters: totalChars
+                });
+            }
+            return pages;
+        }
+        finally {
+            // Release the document and its worker resources. Only plain data objects
+            // are returned, so nothing handed to the caller depends on the open document.
+            await loadingTask.destroy();
+        }
+    }
+    /**
+     * Classify a PDF based on extractable text density.
+     * Thresholds are empirical; can be tuned.
+     * @param pages Output of extractAllText.
+     * @returns 'image_only', 'mixed' or 'text_vectorial' (never 'acroform').
+     */
+    static classifyContent(pages) {
+        if (pages.length === 0)
+            return 'image_only';
+        const totalChars = pages.reduce((sum, p) => sum + p.totalCharacters, 0);
+        const totalItems = pages.reduce((sum, p) => sum + p.textItems.length, 0);
+        const avgCharsPerPage = totalChars / pages.length;
+        // Empty or near-empty: scanned image
+        if (totalChars < 10 || totalItems < 3)
+            return 'image_only';
+        // Low density: possibly mixed (image + some labels)
+        if (avgCharsPerPage < 50)
+            return 'mixed';
+        return 'text_vectorial';
+    }
+    /**
+     * For a given point (x, y) on a page, find the text items within a certain distance.
+     * Distance is computed in PDF points (not pixels), from the point to the item's center.
+     * Returns items sorted by distance (closest first).
+     * @param pageInfo Page whose text items are searched.
+     * @param targetX X coordinate of the reference point.
+     * @param targetY Y coordinate of the reference point.
+     * @param maxDistance Items farther than this are dropped (default 100).
+     * @returns Copies of the matching items plus `distance` and `relativePosition`,
+     *          where `relativePosition` is where the item lies relative to the point.
+     */
+    static findNearbyText(pageInfo, targetX, targetY, maxDistance = 100) {
+        const results = [];
+        for (const item of pageInfo.textItems) {
+            // Use the center of the text item
+            const itemCenterX = item.x + item.width / 2;
+            const itemCenterY = item.y + item.height / 2;
+            const dx = targetX - itemCenterX;
+            const dy = targetY - itemCenterY;
+            const distance = Math.sqrt(dx * dx + dy * dy);
+            if (distance > maxDistance)
+                continue;
+            // Determine relative position
+            let relativePosition;
+            const absDx = Math.abs(dx);
+            const absDy = Math.abs(dy);
+            if (absDx < item.width / 2 && absDy < item.height / 2) {
+                // The target point falls within the item's own box
+                relativePosition = 'inside';
+            }
+            else if (absDx > absDy) {
+                // dx > 0 means the item's center is at a smaller X than the target
+                relativePosition = dx > 0 ? 'left' : 'right';
+            }
+            else {
+                // dy > 0 means the item's center is at a smaller Y, i.e. lower on the page
+                relativePosition = dy > 0 ? 'below' : 'above';
+            }
+            results.push({ ...item, distance, relativePosition });
+        }
+        return results.sort((a, b) => a.distance - b.distance);
+    }
+    /**
+     * Find the most likely label for a field at a given position.
+     * Prefers text that:
+     * - Is to the LEFT of the field (same Y level, typical "Label: ___")
+     * - Is ABOVE the field (stacked layouts)
+     * - Ends with ':' (explicit label)
+     * - Is short (1-4 words)
+     * @param pageInfo Page the field is on.
+     * @param fieldX Left edge of the field.
+     * @param fieldY Bottom edge of the field.
+     * @param fieldWidth Field width (currently not used by the scoring).
+     * @param fieldHeight Field height; used to locate the field's vertical middle.
+     * @returns `label` (trailing ':' removed, or null when no text lies within 150 points),
+     *          `confidence` ('high' for score >= 100, 'medium' for >= 60, else 'low') and
+     *          `nearbyText` with up to 5 candidate texts, best score first.
+     */
+    static inferLabelForField(pageInfo, fieldX, fieldY, fieldWidth, fieldHeight) {
+        // Target point: the field's left edge at its vertical middle
+        const targetX = fieldX; // left edge of field
+        const targetY = fieldY + fieldHeight / 2;
+        const nearby = this.findNearbyText(pageInfo, targetX, targetY, 150);
+        if (nearby.length === 0) {
+            return { label: null, confidence: 'low', nearbyText: [] };
+        }
+        // Score each candidate
+        const scored = nearby.map(item => {
+            let score = 0;
+            // Prefer text to the LEFT
+            if (item.relativePosition === 'left')
+                score += 50;
+            // Secondary: above the field
+            else if (item.relativePosition === 'above')
+                score += 30;
+            // right/below/inside get the smallest positional bonus
+            else
+                score += 10;
+            // Closer is better (inversely proportional)
+            score += Math.max(0, 100 - item.distance);
+            // Ends with ':' → almost certainly a label
+            if (item.text.trim().endsWith(':'))
+                score += 40;
+            // Short text is more label-like (1-4 words)
+            const wordCount = item.text.trim().split(/\s+/).length;
+            if (wordCount >= 1 && wordCount <= 4)
+                score += 20;
+            else if (wordCount > 10)
+                score -= 20;
+            // Penalize very generic text
+            if (/^(página|page|de|the|and|or)$/i.test(item.text.trim()))
+                score -= 30;
+            return { item, score };
+        });
+        scored.sort((a, b) => b.score - a.score);
+        const best = scored[0];
+        const cleanLabel = best.item.text.trim().replace(/:\s*$/, '');
+        // Confidence based on score
+        let confidence;
+        if (best.score >= 100)
+            confidence = 'high';
+        else if (best.score >= 60)
+            confidence = 'medium';
+        else
+            confidence = 'low';
+        // Return top 5 nearby text items for context
+        const nearbyText = scored.slice(0, 5).map(s => s.item.text.trim());
+        return { label: cleanLabel, confidence, nearbyText };
+    }
+    /**
+     * Detect probable field locations in a static PDF (no AcroForm).
+     * Heuristics:
+     * 1. Text ending in ':' followed by horizontal space → likely label + input area
+     * 2. Short text starting with an uppercase letter, with free space to its right
+     * @param pages Output of extractAllText.
+     * @returns One candidate per label-like text whose right-hand area is free of other
+     *          text. Confidence is 'high' when the source text ends with ':' and
+     *          'medium' otherwise. Coordinates and sizes are rounded to whole points.
+     */
+    static detectStaticFieldCandidates(pages) {
+        const candidates = [];
+        for (const pageInfo of pages) {
+            // Find all "label-like" text items (end with ':' or are short and capitalized)
+            const labels = pageInfo.textItems.filter(item => {
+                const t = item.text.trim();
+                if (t.length === 0 || t.length > 40)
+                    return false;
+                return t.endsWith(':') || /^[A-ZÁÉÍÓÚÑ][\wáéíóúñ\s]*$/.test(t);
+            });
+            for (const label of labels) {
+                const labelText = label.text.trim().replace(/:\s*$/, '');
+                if (labelText.length < 2)
+                    continue;
+                // Assume the input goes to the RIGHT of the label
+                // (most common "Label: ___" pattern)
+                const fieldX = label.x + label.width + 5;
+                const fieldY = label.y;
+                const fieldWidth = Math.max(150, (pageInfo.width - fieldX - 20) * 0.4);
+                const fieldHeight = label.fontSize * 1.5;
+                // Only suggest if there's space to the right
+                if (fieldX + fieldWidth > pageInfo.width)
+                    continue;
+                // Check: is there text already occupying that space?
+                const occupied = pageInfo.textItems.some(other => {
+                    if (other === label)
+                        return false;
+                    const otherCenterX = other.x + other.width / 2;
+                    const otherCenterY = other.y + other.height / 2;
+                    return (otherCenterX >= fieldX &&
+                        otherCenterX <= fieldX + fieldWidth &&
+                        Math.abs(otherCenterY - (fieldY + fieldHeight / 2)) < fieldHeight);
+                });
+                if (occupied)
+                    continue;
+                candidates.push({
+                    page: pageInfo.page,
+                    inferredLabel: labelText,
+                    hintFromText: label.text.trim(),
+                    suggestedX: Math.round(fieldX),
+                    suggestedY: Math.round(fieldY),
+                    suggestedWidth: Math.round(fieldWidth),
+                    suggestedHeight: Math.round(fieldHeight),
+                    // Test the ORIGINAL text: labelText has already had its trailing ':' stripped,
+                    // so checking it would never yield 'high' for an explicit "Label:".
+                    confidence: label.text.trim().endsWith(':') ? 'high' : 'medium',
+                    reason: `Label text "${label.text.trim()}" detected at (${Math.round(label.x)}, ${Math.round(label.y)}); suggested input area to its right.`
+                });
+            }
+        }
+        return candidates;
+    }
+}

package/dist/tools/analyzeStaticPdf.d.ts ADDED Viewed

@@ -0,0 +1,27 @@
+/**
+ * Tool descriptor for `autoform_analyze_static_pdf`: the tool name, its usage
+ * description, and an object input schema whose only (required) property is `pdf_path`.
+ */
+export declare const analyzeStaticPdfSchema: {
+    name: string;
+    description: string;
+    inputSchema: {
+        type: "object";
+        properties: {
+            pdf_path: {
+                type: string;
+                description: string;
+            };
+        };
+        required: string[];
+    };
+};
+/**
+ * Handler for the `autoform_analyze_static_pdf` tool.
+ * Reads `pdf_path` from `args`, extracts the PDF's text and resolves to a result holding
+ * a single text content item whose `text` is a pretty-printed JSON string.
+ * `isError` is true when the PDF cannot be read or yields no pages; it is absent on success.
+ */
+export declare function handleAnalyzeStaticPdf(args: any): Promise<{
+    content: {
+        type: "text";
+        text: string;
+    }[];
+    isError: boolean;
+} | {
+    content: {
+        type: "text";
+        text: string;
+    }[];
+    isError?: undefined;
+}>;

package/dist/tools/analyzeStaticPdf.js ADDED Viewed

@@ -0,0 +1,157 @@
+import { PdfTextExtractor } from '../services/pdfTextExtractor.js';
+/**
+ * Tool descriptor for `autoform_analyze_static_pdf`.
+ * `description` is the (Spanish) usage guidance shown to the model: when to use the tool,
+ * what it returns, and how to act on each `pdf_type`. `inputSchema` requires a single
+ * string property, `pdf_path`.
+ */
+export const analyzeStaticPdfSchema = {
+    name: 'autoform_analyze_static_pdf',
+    description: `Analiza un PDF estatico (sin AcroForm) para sugerir donde colocar los campos automaticamente. Extrae texto con posiciones y usa heuristicas para detectar labels y areas de input probables.
+CUANDO USAR:
+- El PDF no tiene AcroForm (autoform_detect_fields devolvio has_acroform=false).
+- Es un certificado, diploma, constancia, formulario diseñado en Canva/Word/Figma.
+- Quieres sugerencias automaticas de donde van los campos sin tener que calcular coordenadas visualmente.
+QUE DEVUELVE:
+- Clasificacion del PDF: "text_vectorial" (texto extraible), "mixed" (texto + imagenes), "image_only" (escaneado).
+- Para text_vectorial: "suggested_fields" con labels inferidos y coordenadas sugeridas.
+- Para image_only: "suggested_approaches" con opciones claras para el usuario (NO rechaza el PDF).
+- Texto completo con posiciones por pagina ("all_text_items").
+⚠️ CASO PDF-IMAGEN (pdf_type: "image_only"):
+Si recibes este tipo, NO intentes procesar el PDF con los otros tools. En su lugar, PRESENTA AL USUARIO las suggested_approaches de forma clara y amigable, y espera su eleccion. Las opciones tipicas son:
+1. Adjuntar el PDF al chat de Claude Desktop (Claude puede verlo visualmente)
+2. Usar la app web de AutoForm para definir campos manualmente
+3. Proporcionar coordenadas manualmente si ya las conoces
+⚠️ CASO text_vectorial:
+- suggested_fields son SUGERENCIAS heuristicas (asumen layout "Label: ___" a la derecha).
+- Para layouts atipicos (certificados con campo DEBAJO del label, diplomas con layout centrado, etc.) las sugerencias pueden estar en posiciones incorrectas.
+- USA all_text_items para ver TODOS los textos del PDF con sus posiciones exactas y razona sobre el layout real antes de decidir las coordenadas finales.
+- Los campos "CERTIFICADO", "Firma 1/2" que se detectan como sugerencias probablemente NO son campos editables — son etiquetas del certificado. Descarta sugerencias que claramente son texto decorativo.
+- Cuando no estes seguro, PREGUNTA al usuario donde va cada dato en lugar de adivinar.
+⚠️ CASO mixed:
+Hay algo de texto pero poca densidad. Puede haber campos que NO se detectaron automaticamente porque el label es una imagen. Usa suggested_fields como punto de partida pero verifica visualmente si faltan campos.`,
+    inputSchema: {
+        type: 'object',
+        properties: {
+            pdf_path: {
+                type: 'string',
+                description: 'Ruta absoluta al PDF estatico a analizar'
+            }
+        },
+        required: ['pdf_path']
+    }
+};
+/**
+ * Handler for the `autoform_analyze_static_pdf` tool.
+ *
+ * Extracts the PDF's text, classifies the document, and resolves to a result holding one
+ * text content item whose `text` is a pretty-printed JSON string:
+ * - invalid arguments, unreadable PDF, or zero pages → `{ error: true, message }` with `isError: true`;
+ * - 'image_only' → a list of `suggested_approaches` for the user to choose from;
+ * - otherwise → `suggested_fields`, up to 100 `all_text_items`, a `summary` and `guidance`.
+ *
+ * @param args Tool arguments; expected to carry a string `pdf_path`.
+ * @returns The tool result object; this function reports failures via `isError`
+ *          rather than throwing.
+ */
+export async function handleAnalyzeStaticPdf(args) {
+    // Tolerate a missing arguments object and reject a missing/non-string path up front,
+    // so the caller gets a structured error result instead of a thrown TypeError
+    // (a numeric value would otherwise be treated by fs as a file descriptor).
+    const { pdf_path } = args ?? {};
+    if (typeof pdf_path !== 'string' || pdf_path.trim() === '') {
+        return {
+            content: [{
+                    type: 'text',
+                    text: JSON.stringify({
+                        error: true,
+                        message: 'Se requiere "pdf_path" (ruta absoluta al PDF a analizar).'
+                    }, null, 2)
+                }],
+            isError: true
+        };
+    }
+    let pages;
+    try {
+        pages = await PdfTextExtractor.extractAllText(pdf_path);
+    }
+    catch (err) {
+        return {
+            content: [{
+                    type: 'text',
+                    text: JSON.stringify({
+                        error: true,
+                        message: `No se pudo leer el PDF: ${err instanceof Error ? err.message : String(err)}`
+                    }, null, 2)
+                }],
+            isError: true
+        };
+    }
+    if (pages.length === 0) {
+        return {
+            content: [{
+                    type: 'text',
+                    text: JSON.stringify({
+                        error: true,
+                        message: 'El PDF no contiene paginas procesables.'
+                    }, null, 2)
+                }],
+            isError: true
+        };
+    }
+    const contentType = PdfTextExtractor.classifyContent(pages);
+    // Case: image-only PDF — return actionable options, never reject
+    if (contentType === 'image_only') {
+        return {
+            content: [{
+                    type: 'text',
+                    text: JSON.stringify({
+                        pdf_type: 'image_only',
+                        pdf_info: {
+                            pages: pages.length,
+                            // Only the first page's size is reported in this branch
+                            dimensions_pt: {
+                                width: Math.round(pages[0].width),
+                                height: Math.round(pages[0].height)
+                            }
+                        },
+                        reason: 'Este PDF no contiene texto vectorial extraible. Probablemente es una imagen escaneada, un PDF aplanado, o un diseño donde todos los textos fueron convertidos a curvas.',
+                        message_for_user: 'Este PDF es tipo imagen — no puedo extraer texto automaticamente para detectar los campos. Sin embargo, tenemos varias opciones para procesarlo. Elige la que prefieras:',
+                        suggested_approaches: [
+                            {
+                                id: 'attach_to_chat',
+                                title: 'Adjuntar el PDF directamente al chat',
+                                description: 'Arrastra el PDF al chat de Claude Desktop. Podre verlo visualmente como imagen y estimar las coordenadas de los campos con buena precision. Luego usaremos autoform_fill_at_coordinates o autoform_fill_batch_at_coordinates para llenar los datos.',
+                                effort: 'bajo',
+                                precision: 'media-alta',
+                                recommended: true
+                            },
+                            {
+                                id: 'web_app',
+                                title: 'Usar la app web de AutoForm',
+                                description: 'Abre la aplicacion web de AutoForm en tu navegador, carga el PDF, dibuja los campos visualmente donde correspondan, exporta como JSON y luego usa autoform_import_template aqui. Es la opcion mas precisa para PDFs complejos.',
+                                effort: 'medio',
+                                precision: 'alta'
+                            },
+                            {
+                                id: 'manual_coordinates',
+                                title: 'Proporcionar coordenadas manualmente',
+                                description: 'Si ya conoces las coordenadas exactas de los campos (por ejemplo porque trabajaste con el PDF antes), puedes pasarlas directamente a autoform_fill_at_coordinates o autoform_fill_batch_at_coordinates.',
+                                effort: 'alto',
+                                precision: 'exacta'
+                            }
+                        ],
+                        next_instruction: 'Presenta estas opciones al usuario de forma clara (no copies el JSON literal, reformulalo conversacionalmente). Espera su eleccion antes de continuar. NO intentes procesar el PDF con otros tools hasta tener instrucciones del usuario.'
+                    }, null, 2)
+                }]
+        };
+    }
+    // Case: text_vectorial or mixed — extract field candidates
+    const suggestedFields = PdfTextExtractor.detectStaticFieldCandidates(pages);
+    // Flatten every page's items into a compact form (rounded position, font size to 0.1)
+    const allTextItems = pages.flatMap(p => p.textItems.map(item => ({
+        text: item.text,
+        page: item.page,
+        x: Math.round(item.x),
+        y: Math.round(item.y),
+        fontSize: Math.round(item.fontSize * 10) / 10
+    })));
+    const summary = suggestedFields.length > 0
+        ? `Se detectaron ${suggestedFields.length} campos candidatos automaticamente:\n${suggestedFields.map(f => `- "${f.inferredLabel}" en pag ${f.page}, pos (${f.suggestedX}, ${f.suggestedY}), ${f.suggestedWidth}x${f.suggestedHeight} [${f.confidence}]`).join('\n')}`
+        : 'No se detectaron campos candidatos automaticamente. El PDF tiene texto pero no hay patrones claros de "Label: ___" reconocibles.';
+    return {
+        content: [{
+                type: 'text',
+                text: JSON.stringify({
+                    pdf_type: contentType,
+                    pdf_info: {
+                        pages: pages.length,
+                        dimensions_pt: pages.map(p => ({
+                            page: p.page,
+                            width: Math.round(p.width),
+                            height: Math.round(p.height)
+                        }))
+                    },
+                    // Count of ALL items, even when all_text_items below is truncated
+                    total_text_items: allTextItems.length,
+                    suggested_fields: suggestedFields,
+                    all_text_items: allTextItems.slice(0, 100), // Limit to avoid huge responses
+                    text_items_truncated: allTextItems.length > 100,
+                    summary,
+                    guidance: contentType === 'mixed'
+                        ? 'El PDF tiene densidad baja de texto — puede tener elementos visuales (titulos como imagen, logos) que no se extraen. Usa los suggested_fields como punto de partida pero verifica si faltan campos.'
+                        : suggestedFields.length > 0
+                            ? 'Usa los suggested_fields directamente con autoform_fill_at_coordinates (un documento) o autoform_fill_batch_at_coordinates (multiples documentos).'
+                            : 'No hay campos obvios detectables. Pregunta al usuario donde quiere colocar los datos o pidele que abra el PDF en la app web de AutoForm.'
+                }, null, 2)
+            }]
+    };
+}

package/dist/tools/detectFields.js CHANGED Viewed

@@ -1,22 +1,35 @@
 import { PdfService } from '../services/pdfService.js';
+import { PdfTextExtractor } from '../services/pdfTextExtractor.js';
 export const detectFieldsSchema = {
     name: 'autoform_detect_fields',
-    description: `Detecta campos de formulario AcroForm en un PDF interactivo (creado con Adobe Acrobat, LibreOffice, etc).
+    description: `Detecta campos de formulario AcroForm en un PDF interactivo Y correlaciona cada campo con su label visible extraido del texto del PDF.
-CUANDO USAR: El PDF es un formulario donde puedes hacer click y escribir. Si el PDF es una imagen/diseño estatico (certificado, diploma), esta tool dira "sin campos" — en ese caso usa autoform_get_pdf_info + analisis visual.
+CUANDO USAR:
+- El PDF es un formulario donde puedes hacer click y escribir (AcroForm).
+- Quieres saber que campos tiene + como se llaman visualmente.
-⚠️ IMPORTANTE — Los nombres tecnicos NO son los labels visibles:
-- Esta tool devuelve nombres TECNICOS internos del PDF (ej: "Text1", "fld_001", "untitled_field"), que pueden NO coincidir con los labels visibles ("Nombre:", "Apellido:", "DNI:").
-- NO asumas el significado de cada campo por su nombre tecnico ni por su posicion.
-- DESPUES de detectar los campos, DEBES mirar el PDF visualmente (ya puedes ver PDFs como imagenes) y correlacionar cada campo tecnico con su label visible usando las coordenadas (x, y, page) que devuelve esta tool.
-- Solo DESPUES de esa correlacion visual procede a llenar con los datos correctos.
+QUE DEVUELVE:
+- Lista de campos AcroForm con nombre tecnico, tipo, pagina y coordenadas.
+- Para CADA campo, un "inferred_label" calculado automaticamente correlacionando la posicion del campo con el texto circundante del PDF.
+- "nearby_text": hasta 5 textos cercanos al campo, ordenados por relevancia.
+- "confidence": high/medium/low segun que tan claro es el label correlacionado.
-FLUJO RECOMENDADO:
-1. autoform_detect_fields → obtienes lista de campos tecnicos con coordenadas
-2. MIRAS EL PDF VISUALMENTE → asocias cada coordenada con el label visible cercano
-3. Decides que dato va en que campo tecnico
-4. Para UN documento: autoform_fill_pdf con field_values={"nombre_tecnico": "valor"}
-5. Para MULTIPLES documentos (batch): autoform_fill_batch_acroform con data_rows y field_map si las claves son logicas.`,
+⚠️ COMO USAR EL RESULTADO:
+- Si confidence es "high": confia en inferred_label y usalo directamente.
+- Si confidence es "medium": el label es probable pero verifica con nearby_text si tiene sentido.
+- Si confidence es "low": NO asumas — pregunta al usuario o mira el PDF visualmente si es posible.
+- NUNCA inventes el significado de un campo por su posicion o numero.
+SI EL PDF NO TIENE ACROFORM: esta tool devolvera has_acroform=false. En ese caso usa autoform_analyze_static_pdf.
+FLUJO RECOMENDADO PARA UN DOCUMENTO:
+1. autoform_detect_fields → obtienes campos + inferred_labels
+2. Usa autoform_fill_pdf con field_values={nombre_tecnico: valor}
+   O usa autoform_fill_pdf con field_map si prefieres claves logicas
+FLUJO RECOMENDADO PARA MULTIPLES DOCUMENTOS:
+1. autoform_detect_fields → obtienes campos + inferred_labels
+2. Usa autoform_fill_batch_acroform con data_rows y field_map si las claves son logicas.`,
     inputSchema: {
         type: 'object',
         properties: {
@@ -31,21 +44,85 @@ FLUJO RECOMENDADO:
 export async function handleDetectFields(args) {
     const { pdf_path } = args;
     const { fields, hasAcroform } = await PdfService.detectFields(pdf_path);
-    let summary;
-    if (fields.length === 0) {
-        summary = `El PDF "${pdf_path}" no contiene campos AcroForm.\n\nPara usar este PDF, defina los campos manualmente en la app web de AutoForm y exporte la plantilla como JSON. Luego impórtela con autoform_import_template.`;
+    // If no AcroForm, recommend the static analysis tool
+    if (!hasAcroform || fields.length === 0) {
+        return {
+            content: [{
+                    type: 'text',
+                    text: JSON.stringify({
+                        has_acroform: false,
+                        total_fields: 0,
+                        fields: [],
+                        message: `El PDF no contiene campos AcroForm interactivos. Use autoform_analyze_static_pdf para analizar PDFs estaticos (certificados, diplomas, formularios diseñados en Canva/Word).`,
+                        next_action: 'autoform_analyze_static_pdf'
+                    }, null, 2)
+                }]
+        };
+    }
+    // Extract text from the PDF to correlate with AcroForm fields
+    let pagesText;
+    try {
+        pagesText = await PdfTextExtractor.extractAllText(pdf_path);
+    }
+    catch (err) {
+        // If extraction fails, return fields without label inference
+        return {
+            content: [{
+                    type: 'text',
+                    text: JSON.stringify({
+                        has_acroform: true,
+                        total_fields: fields.length,
+                        fields,
+                        text_extraction_error: err instanceof Error ? err.message : String(err),
+                        warning: 'No se pudo extraer texto del PDF para inferir labels. Los nombres tecnicos se devuelven sin correlacion visual.'
+                    }, null, 2)
+                }]
+        };
+    }
+    // Enrich each field with inferred label and nearby text
+    const enrichedFields = fields.map(field => {
+        const pageInfo = pagesText.find(p => p.page === field.page);
+        if (!pageInfo) {
+            return { ...field, inferred_label: null, confidence: 'low', nearby_text: [] };
+        }
+        const { label, confidence, nearbyText } = PdfTextExtractor.inferLabelForField(pageInfo, field.x, field.y, field.width, field.height);
+        return {
+            ...field,
+            inferred_label: label,
+            confidence,
+            nearby_text: nearbyText
+        };
+    });
+    // Build a summary that emphasizes inferred labels
+    const labelSummary = enrichedFields
+        .map(f => {
+        const labelDisplay = f.inferred_label
+            ? `"${f.inferred_label}" (${f.confidence})`
+            : '[sin label inferido]';
+        return `- ${f.name}: ${labelDisplay} — pag ${f.page}, pos (${Math.round(f.x)}, ${Math.round(f.y)}), ${Math.round(f.width)}x${Math.round(f.height)}`;
+    })
+        .join('\n');
+    const highConfidenceCount = enrichedFields.filter(f => f.confidence === 'high').length;
+    const lowConfidenceCount = enrichedFields.filter(f => f.confidence === 'low').length;
+    let guidance;
+    if (lowConfidenceCount === 0) {
+        guidance = 'Todos los campos tienen labels correlacionados con alta confianza. Puedes proceder a llenar usando los inferred_label directamente.';
+    }
+    else if (lowConfidenceCount < enrichedFields.length) {
+        guidance = `${highConfidenceCount} campos tienen labels claros, pero ${lowConfidenceCount} tienen baja confianza. Verifica los nearby_text antes de llenar.`;
     }
     else {
-        summary = `Encontrados ${fields.length} campos AcroForm:\n\n${fields.map(f => `- ${f.name} (${f.type}) — Pag ${f.page}, pos: ${Math.round(f.x)},${Math.round(f.y)} ${Math.round(f.width)}x${Math.round(f.height)}${f.value ? ` = "${f.value}"` : ''}`).join('\n')}`;
+        guidance = 'Los labels inferidos tienen baja confianza. Revisa el nearby_text de cada campo o pregunta al usuario que represente cada uno antes de llenar.';
     }
     return {
         content: [{
                 type: 'text',
                 text: JSON.stringify({
-                    has_acroform: hasAcroform,
+                    has_acroform: true,
                     total_fields: fields.length,
-                    fields,
-                    summary
+                    fields: enrichedFields,
+                    summary: labelSummary,
+                    guidance
                 }, null, 2)
             }]
     };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "autoform-mcp-server",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "description": "MCP server for bulk PDF form filling. Detect fields, fill templates, and generate hundreds of PDFs from data — directly from Claude.",
   "type": "module",
   "main": "dist/index.js",
@@ -41,10 +41,11 @@
   "dependencies": {
     "@modelcontextprotocol/sdk": "^1.12.1",
     "jszip": "^3.10.1",
-    "pdf-lib": "^1.17.1"
+    "pdf-lib": "^1.17.1",
+    "pdfjs-dist": "^4.10.38"
   },
   "devDependencies": {
     "@types/node": "^20.0.0",
     "typescript": "^5.2.2"
   }
-}
+}