npm - paddleocr-skills - Versions diffs - 1.0.0 - Mend

paddleocr-skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/README.md +220 -0
package/bin/paddleocr-skills.js +20 -0
package/lib/copy.js +39 -0
package/lib/installer.js +70 -0
package/lib/prompts.js +67 -0
package/lib/python.js +75 -0
package/lib/verify.js +121 -0
package/package.json +42 -0
package/templates/.env.example +12 -0
package/templates/paddleocr-vl/references/paddleocr-vl/layout_schema.md +64 -0
package/templates/paddleocr-vl/references/paddleocr-vl/output_format.md +154 -0
package/templates/paddleocr-vl/references/paddleocr-vl/vl_model_spec.md +157 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/_lib.py +780 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/configure.py +270 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/optimize_file.py +226 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements-optimize.txt +8 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements.txt +7 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/smoke_test.py +199 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/vl_caller.py +232 -0
package/templates/paddleocr-vl/skills/paddleocr-vl/SKILL.md +481 -0
package/templates/ppocrv5/references/ppocrv5/agent_policy.md +258 -0
package/templates/ppocrv5/references/ppocrv5/normalized_schema.md +257 -0
package/templates/ppocrv5/references/ppocrv5/provider_api.md +140 -0
package/templates/ppocrv5/scripts/ppocrv5/_lib.py +635 -0
package/templates/ppocrv5/scripts/ppocrv5/configure.py +346 -0
package/templates/ppocrv5/scripts/ppocrv5/ocr_caller.py +684 -0
package/templates/ppocrv5/scripts/ppocrv5/requirements.txt +4 -0
package/templates/ppocrv5/scripts/ppocrv5/smoke_test.py +139 -0
package/templates/ppocrv5/skills/ppocrv5/SKILL.md +272 -0

package/templates/paddleocr-vl/scripts/paddleocr-vl/_lib.py ADDED Viewed

@@ -0,0 +1,780 @@
+"""
+Core library for PaddleOCR-VL API Skill
+- Config: Configuration manager for VL API
+- VLClient: HTTP client with retry logic and caching
+- QualityEvaluator: Confidence scoring and quality assessment
+- Utility functions for error handling
+"""
+import hashlib
+import json
+import logging
+import os
+import re
+import sys
+import time
+from pathlib import Path
+from typing import Any, Dict, Optional, Tuple
+import httpx
+# Module-level logger named after this module; setup_logging() adjusts its level.
+logger = logging.getLogger(__name__)
+# =============================================================================
+# Constants
+# =============================================================================
+# Fallback values used by the Config getters when the matching VL_*
+# environment variable is unset or is not an integer.
+DEFAULT_TIMEOUT_MS = 30000  # VL processing takes longer
+DEFAULT_MAX_RETRY = 2
+DEFAULT_CACHE_TTL_SEC = 600
+# Quality scoring thresholds: minimum average region confidence for each
+# level reported by QualityEvaluator; anything below ACCEPTABLE is "poor".
+QUALITY_EXCELLENT = 0.90
+QUALITY_GOOD = 0.75
+QUALITY_ACCEPTABLE = 0.60
+# Unified error codes (aligned with ppocrv5)
+# These strings are placed in the "code" field of error responses.
+ERROR_CONFIG = "CONFIG_ERROR"
+ERROR_AUTH = "PROVIDER_AUTH_ERROR"
+ERROR_QUOTA = "PROVIDER_QUOTA_EXCEEDED"
+ERROR_OVERLOADED = "PROVIDER_OVERLOADED"
+ERROR_TIMEOUT = "PROVIDER_TIMEOUT"
+ERROR_BAD_REQUEST = "PROVIDER_BAD_REQUEST"
+ERROR_PROVIDER = "PROVIDER_ERROR"
+ERROR_NETWORK = "NETWORK_ERROR"
+ERROR_PARSE = "PARSE_ERROR"
+# =============================================================================
+# Configuration
+# =============================================================================
+class Config:
+    """
+    Configuration manager for PaddleOCR-VL
+    Reads from:
+    1. Environment variables (highest priority)
+    2. .env file in project root
+    3. Raise error if not found
+    """
+    _env_loaded = False
+    @staticmethod
+    def load_env():
+        """Load .env file using python-dotenv"""
+        if Config._env_loaded:
+            return
+        try:
+            from dotenv import load_dotenv
+            # Find .env file (project root is 2 levels up)
+            project_root = Path(__file__).parent.parent.parent
+            env_file = project_root / ".env"
+            if env_file.exists():
+                load_dotenv(env_file)
+                logger.debug(f"Loaded .env from {env_file}")
+            else:
+                logger.debug(f".env file not found at {env_file}")
+            Config._env_loaded = True
+        except ImportError:
+            logger.warning("python-dotenv not installed")
+            logger.warning("Install with: pip install python-dotenv")
+            Config._env_loaded = True
+    @staticmethod
+    def get_vl_api_url() -> str:
+        """
+        Get PaddleOCR-VL API URL from environment
+        Returns:
+            Full API URL
+        Raises:
+            ValueError: If API_URL not configured
+        """
+        Config.load_env()
+        api_url = os.getenv("VL_API_URL", "").strip()
+        if not api_url:
+            raise ValueError(
+                "API not configured. Get your API at: https://aistudio.baidu.com/paddleocr/task"
+            )
+        # Validate URL format
+        if not api_url.startswith(("http://", "https://")):
+            api_url = f"https://{api_url}"
+        return api_url
+    @staticmethod
+    def get_vl_token() -> str:
+        """
+        Get PaddleOCR-VL access token from environment
+        Returns:
+            Access token
+        Raises:
+            ValueError: If token not configured
+        """
+        Config.load_env()
+        token = os.getenv("VL_TOKEN", "").strip()
+        if not token:
+            raise ValueError(
+                "TOKEN not configured. Get your API at: https://aistudio.baidu.com/paddleocr/task"
+            )
+        return token
+    @staticmethod
+    def is_configured() -> bool:
+        """Check if VL API is properly configured"""
+        Config.load_env()
+        api_url = os.getenv("VL_API_URL", "").strip()
+        token = os.getenv("VL_TOKEN", "").strip()
+        return bool(api_url and token)
+    @staticmethod
+    def get_timeout_ms() -> int:
+        """Get timeout in milliseconds from environment"""
+        Config.load_env()
+        timeout_str = os.getenv("VL_TIMEOUT_MS", str(DEFAULT_TIMEOUT_MS))
+        try:
+            return int(timeout_str)
+        except ValueError:
+            return DEFAULT_TIMEOUT_MS
+    @staticmethod
+    def get_max_retry() -> int:
+        """Get max retry count from environment"""
+        Config.load_env()
+        retry_str = os.getenv("VL_MAX_RETRY", str(DEFAULT_MAX_RETRY))
+        try:
+            return int(retry_str)
+        except ValueError:
+            return DEFAULT_MAX_RETRY
+    @staticmethod
+    def get_cache_ttl_sec() -> int:
+        """Get cache TTL in seconds from environment"""
+        Config.load_env()
+        ttl_str = os.getenv("VL_CACHE_TTL_SEC", str(DEFAULT_CACHE_TTL_SEC))
+        try:
+            return int(ttl_str)
+        except ValueError:
+            return DEFAULT_CACHE_TTL_SEC
+    @staticmethod
+    def get_max_file_size_mb() -> int:
+        """
+        Get maximum file size in MB from environment
+        Default: 0 (no limit)
+        Set VL_MAX_FILE_SIZE_MB in .env to enforce a limit if needed
+        """
+        Config.load_env()
+        size_str = os.getenv("VL_MAX_FILE_SIZE_MB", "0")
+        try:
+            return int(size_str)
+        except ValueError:
+            return 0
+# =============================================================================
+# Simple Cache
+# =============================================================================
+class SimpleCache:
+    """
+    Simple in-memory TTL cache
+    Caches API responses to avoid redundant requests.
+    """
+    def __init__(self, ttl_seconds: int = DEFAULT_CACHE_TTL_SEC):
+        self.cache: Dict[str, Tuple[Any, float]] = {}
+        self.ttl = ttl_seconds
+    def get(self, key: str) -> Optional[Any]:
+        """Get value from cache if not expired"""
+        if key not in self.cache:
+            return None
+        value, timestamp = self.cache[key]
+        if time.time() - timestamp > self.ttl:
+            del self.cache[key]
+            return None
+        logger.debug(f"Cache hit: {key[:16]}...")
+        return value
+    def set(self, key: str, value: Any):
+        """Set value in cache with current timestamp"""
+        self.cache[key] = (value, time.time())
+        logger.debug(f"Cache set: {key[:16]}...")
+    def clear(self):
+        """Clear all cache entries"""
+        self.cache.clear()
+        logger.debug("Cache cleared")
+    @staticmethod
+    def make_key(file_path: Optional[str] = None, file_url: Optional[str] = None) -> str:
+        """Generate cache key from request parameters"""
+        if file_url:
+            content = f"url:{file_url}"
+        elif file_path:
+            content = f"path:{file_path}"
+        else:
+            content = "empty"
+        return hashlib.sha256(content.encode()).hexdigest()
+# =============================================================================
+# Quality Evaluator
+# =============================================================================
+class QualityEvaluator:
+    """
+    Evaluates confidence and quality of VL parsing results
+    """
+    @staticmethod
+    def evaluate(api_response: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Evaluate quality of API response
+        Args:
+            api_response: Complete API response from VL
+        Returns:
+            Quality assessment dict with:
+            - overall_confidence: 0.0-1.0
+            - quality_level: "excellent" | "good" | "acceptable" | "poor"
+            - warnings: List of quality warnings
+            - region_stats: Statistics about regions
+        """
+        if not api_response.get("ok", False):
+            return {
+                "overall_confidence": 0.0,
+                "quality_level": "error",
+                "warnings": ["API request failed"],
+                "region_stats": {}
+            }
+        result = api_response.get("result", {})
+        layout = result.get("layout", {})
+        regions = layout.get("regions", [])
+        if not regions:
+            return {
+                "overall_confidence": 0.0,
+                "quality_level": "poor",
+                "warnings": ["No content detected in document"],
+                "region_stats": {"total_regions": 0}
+            }
+        # Calculate average confidence
+        confidences = []
+        region_types = {}
+        for region in regions:
+            confidence = region.get("confidence", 0.8)  # Default if not provided
+            confidences.append(confidence)
+            region_type = region.get("type", "unknown")
+            region_types[region_type] = region_types.get(region_type, 0) + 1
+        overall_confidence = sum(confidences) / len(confidences) if confidences else 0.0
+        # Determine quality level
+        if overall_confidence >= QUALITY_EXCELLENT:
+            quality_level = "excellent"
+        elif overall_confidence >= QUALITY_GOOD:
+            quality_level = "good"
+        elif overall_confidence >= QUALITY_ACCEPTABLE:
+            quality_level = "acceptable"
+        else:
+            quality_level = "poor"
+        # Generate warnings
+        warnings = []
+        if overall_confidence < QUALITY_ACCEPTABLE:
+            warnings.append(f"Low confidence score: {overall_confidence:.2f}")
+        low_confidence_regions = [r for r in regions if r.get("confidence", 0.8) < QUALITY_ACCEPTABLE]
+        if low_confidence_regions:
+            warnings.append(f"{len(low_confidence_regions)} regions have low confidence")
+        return {
+            "overall_confidence": overall_confidence,
+            "quality_level": quality_level,
+            "warnings": warnings,
+            "region_stats": {
+                "total_regions": len(regions),
+                "by_type": region_types,
+                "low_confidence_count": len(low_confidence_regions)
+            }
+        }
+# =============================================================================
+# VL API Client
+# =============================================================================
+class VLClient:
+    """
+    HTTP client for PaddleOCR-VL API with retry and caching
+    """
+    def __init__(
+        self,
+        api_url: Optional[str] = None,
+        token: Optional[str] = None,
+        timeout_ms: Optional[int] = None,
+        max_retry: Optional[int] = None,
+        enable_cache: bool = True
+    ):
+        """
+        Initialize VL client
+        Args:
+            api_url: API endpoint URL (auto-loaded from config if None)
+            token: Access token (auto-loaded from config if None)
+            timeout_ms: Request timeout in milliseconds
+            max_retry: Maximum retry count for transient errors
+            enable_cache: Enable response caching
+        """
+        self.api_url = api_url or Config.get_vl_api_url()
+        self.token = token or Config.get_vl_token()
+        self.timeout_ms = timeout_ms or Config.get_timeout_ms()
+        self.max_retry = max_retry or Config.get_max_retry()
+        self.cache = SimpleCache(Config.get_cache_ttl_sec()) if enable_cache else None
+        logger.info(f"VLClient initialized: {self.api_url}")
+        logger.debug(f"Timeout: {self.timeout_ms}ms, Max retry: {self.max_retry}")
+    def call_api(
+        self,
+        file_path: Optional[str] = None,
+        file_url: Optional[str] = None,
+        use_cache: bool = True
+    ) -> Dict[str, Any]:
+        """
+        Call PaddleOCR-VL API with automatic retry
+        Args:
+            file_path: Local file path
+            file_url: URL to file
+            use_cache: Use cached response if available
+        Returns:
+            Complete API response with standardized error codes
+        Raises:
+            ValueError: If neither file_path nor file_url provided
+        """
+        # Validate input parameters
+        is_valid, error_msg = validate_file_input(file_path, file_url)
+        if not is_valid:
+            return self._make_error_response(ERROR_BAD_REQUEST, error_msg)
+        # Check cache
+        if use_cache and self.cache:
+            cache_key = SimpleCache.make_key(file_path, file_url)
+            cached = self.cache.get(cache_key)
+            if cached:
+                logger.info("Returning cached response")
+                return cached
+        # Build request payload
+        payload = {
+            "parse_all": True,
+            "include_layout": True,
+            "include_all_elements": True
+        }
+        if file_url:
+            payload["file_url"] = file_url
+            # Make request with retry (JSON mode)
+            result = self._request_with_retry(payload)
+        elif file_path:
+            # Make request with retry (multipart file upload mode)
+            result = self._request_with_retry(payload, upload_file_path=str(Path(file_path).absolute()))
+        else:
+            result = self._request_with_retry(payload)
+        # Cache successful response
+        if use_cache and self.cache and result.get("ok", False):
+            self.cache.set(cache_key, result)
+        return result
+    def _request_with_retry(self, payload: Dict[str, Any], upload_file_path: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Make HTTP request with automatic retry on transient failures
+        Args:
+            payload: Request payload
+            upload_file_path: If provided, upload this file as multipart form-data
+        Returns:
+            API response dict
+        """
+        timeout_sec = self.timeout_ms / 1000.0
+        attempt = 0
+        while attempt <= self.max_retry:
+            attempt += 1
+            try:
+                logger.debug(f"API request attempt {attempt}/{self.max_retry + 1}")
+                with httpx.Client(timeout=timeout_sec) as client:
+                    if upload_file_path:
+                        # Encode local file as base64 and send as JSON
+                        import base64
+                        headers = {
+                            "Content-Type": "application/json",
+                            "Authorization": f"Bearer {self.token}"
+                        }
+                        ext = Path(upload_file_path).suffix.lower()
+                        with open(upload_file_path, 'rb') as f:
+                            file_data = base64.b64encode(f.read()).decode('utf-8')
+                        file_payload = {
+                            "file": file_data,
+                            "fileType": 0 if ext == '.pdf' else 1,
+                        }
+                        response = client.post(
+                            self.api_url,
+                            json=file_payload,
+                            headers=headers
+                        )
+                    else:
+                        # JSON request for URL-based input
+                        headers = {
+                            "Content-Type": "application/json",
+                            "Authorization": f"Bearer {self.token}"
+                        }
+                        response = client.post(
+                            self.api_url,
+                            json=payload,
+                            headers=headers
+                        )
+                    # Handle different status codes
+                    status = response.status_code
+                    # Success
+                    if status == 200:
+                        try:
+                            result = response.json()
+                            logger.info(f"API request successful (attempt {attempt})")
+                            return self._wrap_success(result)
+                        except json.JSONDecodeError as e:
+                            return self._make_error_response(
+                                ERROR_PARSE,
+                                f"Invalid JSON response: {str(e)}"
+                            )
+                    # Authentication errors (no retry)
+                    if status in (401, 403):
+                        logger.error(f"Authentication failed: HTTP {status}")
+                        return self._make_error_response(
+                            ERROR_AUTH,
+                            f"Authentication failed (HTTP {status}). Check your VL_TOKEN."
+                        )
+                    # Quota exceeded (no retry)
+                    if status == 429:
+                        logger.error("API quota exceeded")
+                        return self._make_error_response(
+                            ERROR_QUOTA,
+                            "API quota exceeded. Please wait or upgrade your plan."
+                        )
+                    # Transient errors (retry)
+                    if status in (503, 504):
+                        error_code = ERROR_OVERLOADED if status == 503 else ERROR_TIMEOUT
+                        if attempt <= self.max_retry:
+                            wait_time = 2 ** (attempt - 1)  # Exponential backoff
+                            logger.warning(f"Transient error HTTP {status}, retrying in {wait_time}s...")
+                            time.sleep(wait_time)
+                            continue
+                        else:
+                            logger.error(f"Max retries exceeded for HTTP {status}")
+                            return self._make_error_response(
+                                error_code,
+                                f"Service unavailable (HTTP {status}) after {self.max_retry} retries"
+                            )
+                    # Other errors
+                    logger.error(f"API request failed: HTTP {status}")
+                    return self._make_error_response(
+                        ERROR_PROVIDER,
+                        f"API request failed (HTTP {status}): {response.text[:200]}"
+                    )
+            except httpx.TimeoutException:
+                logger.error(f"Request timeout after {timeout_sec}s")
+                if attempt <= self.max_retry:
+                    logger.warning(f"Retrying after timeout (attempt {attempt})...")
+                    continue
+                return self._make_error_response(
+                    ERROR_TIMEOUT,
+                    f"Request timeout after {timeout_sec}s. Document may be too large."
+                )
+            except httpx.RequestError as e:
+                logger.error(f"Network error: {str(e)}")
+                return self._make_error_response(
+                    ERROR_NETWORK,
+                    f"Network error: {str(e)}"
+                )
+        # Should not reach here
+        return self._make_error_response(
+            ERROR_PROVIDER,
+            "Unexpected error: max retries logic failed"
+        )
+    def _wrap_success(self, api_response: Dict[str, Any]) -> Dict[str, Any]:
+        """Wrap successful API response in standard format"""
+        # If response already has 'ok' field, return as-is
+        if "ok" in api_response:
+            return api_response
+        # Otherwise wrap it
+        return {
+            "ok": True,
+            "result": api_response,
+            "error": None
+        }
+    def _make_error_response(self, error_code: str, message: str) -> Dict[str, Any]:
+        """Create standardized error response"""
+        return {
+            "ok": False,
+            "result": None,
+            "error": {
+                "code": error_code,
+                "message": message
+            }
+        }
+# =============================================================================
+# High-level API Functions
+# =============================================================================
+def make_api_request(
+    file_path: Optional[str] = None,
+    file_url: Optional[str] = None,
+    timeout_ms: Optional[int] = None,
+    use_cache: bool = True
+) -> Dict[str, Any]:
+    """
+    High-level function to call PaddleOCR-VL API
+    Args:
+        file_path: Local file path
+        file_url: URL to file
+        timeout_ms: Request timeout in milliseconds
+        use_cache: Use cached response if available
+    Returns:
+        Complete API response dict with standardized format
+    Raises:
+        ValueError: If configuration is invalid
+    """
+    # Create client
+    client = VLClient(timeout_ms=timeout_ms)
+    # Call API
+    return client.call_api(
+        file_path=file_path,
+        file_url=file_url,
+        use_cache=use_cache
+    )
+# =============================================================================
+# File Type Detection
+# =============================================================================
+def detect_file_format(file_input: str) -> Tuple[bool, str]:
+    """
+    Detect and validate file format from URL or path.
+    Args:
+        file_input: File URL or path
+    Returns:
+        Tuple of (is_valid, format_type)
+        - is_valid: True if format is supported
+        - format_type: "pdf", "png", "jpg", "jpeg", etc. or "unknown"
+    Supported formats:
+    - PDF: .pdf
+    - Images: .png, .jpg, .jpeg, .bmp, .tiff, .tif, .webp
+    """
+    import re
+    from urllib.parse import urlparse, unquote
+    # Clean up the input
+    file_str = file_input.lower().strip()
+    # If it's a URL, extract the path
+    if file_str.startswith(('http://', 'https://')):
+        try:
+            parsed = urlparse(file_str)
+            file_str = unquote(parsed.path)
+        except:
+            pass
+    # Remove query parameters and fragments
+    file_str = re.sub(r'[?#].*$', '', file_str)
+    # Supported formats
+    supported_formats = {
+        '.pdf': 'pdf',
+        '.png': 'png',
+        '.jpg': 'jpg',
+        '.jpeg': 'jpeg',
+        '.bmp': 'bmp',
+        '.tiff': 'tiff',
+        '.tif': 'tif',
+        '.webp': 'webp'
+    }
+    # Check each extension
+    for ext, format_name in supported_formats.items():
+        if file_str.endswith(ext):
+            return (True, format_name)
+    # Unsupported format
+    return (False, "unknown")
+def validate_file_input(
+    file_path: Optional[str] = None,
+    file_url: Optional[str] = None
+) -> Tuple[bool, str]:
+    """
+    Validate file input parameters
+    Args:
+        file_path: Local file path
+        file_url: File URL
+    Returns:
+        Tuple of (is_valid, error_message)
+    """
+    if not file_path and not file_url:
+        return (False, "Either file_path or file_url must be provided")
+    file_input = file_url if file_url else file_path
+    # Validate format
+    is_valid, format_type = detect_file_format(file_input)
+    if not is_valid:
+        return (False, f"Unsupported file format. Supported: PDF, PNG, JPG, JPEG, BMP, TIFF, WEBP")
+    # Validate local file exists
+    if file_path:
+        if not Path(file_path).exists():
+            return (False, f"File not found: {file_path}")
+        # Check file size (configurable via VL_MAX_FILE_SIZE_MB)
+        file_size = Path(file_path).stat().st_size
+        max_size_mb = Config.get_max_file_size_mb()
+        max_size_bytes = max_size_mb * 1024 * 1024
+        if file_size > max_size_bytes:
+            file_size_mb = file_size / 1024 / 1024
+            return (False, f"File too large: {file_size_mb:.1f}MB (max {max_size_mb}MB). Use --file-url or increase VL_MAX_FILE_SIZE_MB")
+    return (True, "")
+# =============================================================================
+# Utility Functions
+# =============================================================================
+def format_error_output(error: Exception, error_code: str = ERROR_PROVIDER) -> Dict[str, Any]:
+    """
+    Turn an exception into the standard error response dict
+    Args:
+        error: Exception to report
+        error_code: Unified error code stored under error["code"]
+    Returns:
+        Dict with ok=False, result=None and an "error" entry holding the
+        code, the exception class name and the exception message
+    """
+    error_details = {
+        "code": error_code,
+        "type": type(error).__name__,
+        "message": str(error)
+    }
+    return {"ok": False, "result": None, "error": error_details}
+def wrap_success_output(result: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Wrap successful result in standard format
+    Args:
+        result: API response
+    Returns:
+        Wrapped output with ok=True
+    """
+    # If API response already has 'ok' field, return as-is
+    if "ok" in result:
+        return result
+    # Otherwise wrap it
+    return {
+        "ok": True,
+        "result": result,
+        "error": None
+    }
+def setup_logging(level: Optional[str] = None):
+    """
+    Setup logging configuration
+    Args:
+        level: Log level (DEBUG, INFO, WARNING, ERROR)
+               If None, reads from VL_LOG_LEVEL environment variable
+    """
+    if level is None:
+        level = os.getenv("VL_LOG_LEVEL", "INFO").upper()
+    numeric_level = getattr(logging, level, logging.INFO)
+    logging.basicConfig(
+        level=numeric_level,
+        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S"
+    )
+    logger.setLevel(numeric_level)