npm - paddleocr-skills - Versions diffs - 1.0.0 - Mend

paddleocr-skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/README.md +220 -0
package/bin/paddleocr-skills.js +20 -0
package/lib/copy.js +39 -0
package/lib/installer.js +70 -0
package/lib/prompts.js +67 -0
package/lib/python.js +75 -0
package/lib/verify.js +121 -0
package/package.json +42 -0
package/templates/.env.example +12 -0
package/templates/paddleocr-vl/references/paddleocr-vl/layout_schema.md +64 -0
package/templates/paddleocr-vl/references/paddleocr-vl/output_format.md +154 -0
package/templates/paddleocr-vl/references/paddleocr-vl/vl_model_spec.md +157 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/_lib.py +780 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/configure.py +270 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/optimize_file.py +226 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements-optimize.txt +8 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements.txt +7 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/smoke_test.py +199 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/vl_caller.py +232 -0
package/templates/paddleocr-vl/skills/paddleocr-vl/SKILL.md +481 -0
package/templates/ppocrv5/references/ppocrv5/agent_policy.md +258 -0
package/templates/ppocrv5/references/ppocrv5/normalized_schema.md +257 -0
package/templates/ppocrv5/references/ppocrv5/provider_api.md +140 -0
package/templates/ppocrv5/scripts/ppocrv5/_lib.py +635 -0
package/templates/ppocrv5/scripts/ppocrv5/configure.py +346 -0
package/templates/ppocrv5/scripts/ppocrv5/ocr_caller.py +684 -0
package/templates/ppocrv5/scripts/ppocrv5/requirements.txt +4 -0
package/templates/ppocrv5/scripts/ppocrv5/smoke_test.py +139 -0
package/templates/ppocrv5/skills/ppocrv5/SKILL.md +272 -0

package/templates/ppocrv5/scripts/ppocrv5/ocr_caller.py ADDED Viewed

@@ -0,0 +1,684 @@
+#!/usr/bin/env python3
+"""
+PP-OCRv5 API Caller
+Calls Paddle AI Studio PP-OCRv5 /ocr API with user-provided host/token.
+Supports fast/quality/auto modes with agent-based retry and quality scoring.
+"""
+import argparse
+import base64
+import json
+import logging
+import os
+import sys
+import time
+import uuid
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+# Add scripts dir to path for imports
+sys.path.insert(0, str(Path(__file__).parent))
+from _lib import (
+    AgentPolicy,
+    Config,
+    Mapper,
+    Normalizer,
+    ProviderClient,
+    QualityEvaluator,
+    SimpleCache
+)
# Configure logging.
# The level name is read from PADDLE_OCR_LOG_LEVEL (default "INFO"). An empty
# or unrecognised value falls back to INFO rather than raising AttributeError
# while the module is being imported.
log_level = os.getenv("PADDLE_OCR_LOG_LEVEL", "INFO").upper()
_numeric_log_level = getattr(logging, log_level, None)
if not isinstance(_numeric_log_level, int):
    # Covers unknown names as well as non-level attributes of the logging
    # module that happen to be upper-case (e.g. BASIC_FORMAT, a string).
    _numeric_log_level = logging.INFO
logging.basicConfig(
    level=_numeric_log_level,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
+# =============================================================================
+# Main OCR Logic
+# =============================================================================
def detect_file_type(file_input: str) -> int:
    """
    Detect file type from a URL or a local path, based on its extension.

    Matching is case-insensitive. For http(s) URLs only the (percent-decoded)
    path component is inspected, so query strings and fragments are ignored.

    Args:
        file_input: URL or filesystem path. May be empty.

    Returns:
        0 for PDF, 1 for Image. Anything without a recognised extension
        (including an empty string) is treated as an image.

    Supported extensions:
    - PDF: .pdf
    - Image: .png, .jpg, .jpeg, .bmp, .tiff, .tif, .webp, .gif
    """
    import re
    from urllib.parse import unquote, urlparse

    pdf_extensions = ('.pdf',)
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tiff', '.tif', '.webp', '.gif')

    file_str = file_input.lower().strip()

    # If it's a URL, keep only the decoded path component.
    if file_str.startswith(('http://', 'https://')):
        try:
            file_str = unquote(urlparse(file_str).path)
        except ValueError:
            # urlparse rejects some malformed URLs (e.g. an unbalanced IPv6
            # bracket). Fall back to inspecting the raw string.
            pass

    # Strip anything that looks like a query string or fragment. This is what
    # makes scheme-less URLs such as "host/a.pdf?x=1" work.
    stripped = re.sub(r'[?#].*$', '', file_str)

    # Try the stripped form first, then the unstripped one. The second pass
    # only matters when stripping removed the extension, i.e. when '?' or '#'
    # is a literal part of the file name ("scan#2.pdf", ".../scan%232.pdf").
    for candidate in (stripped, file_str):
        if candidate.endswith(pdf_extensions):
            return 0
        if candidate.endswith(image_extensions):
            return 1

    # Default to image if uncertain
    return 1
def load_file_as_base64(file_path: str) -> str:
    """Read a local file in binary mode and return its content as base64 text.

    The returned string is the bare base64 payload; no ``data:`` URI prefix
    is added.
    """
    with open(file_path, mode="rb") as handle:
        encoded = base64.b64encode(handle.read())
    return str(encoded, "utf-8")
def build_payload(
    file_input: str,
    file_type: int,
    visualize: bool,
    options: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Assemble the request body for the provider API.

    The payload is built with snake_case keys and then passed through
    Mapper.dict_to_camel before being returned.

    Args:
        file_input: URL or base64 string
        file_type: 0=PDF, 1=Image
        visualize: Whether to return visualization
        options: OCR options (snake_case); any option that is missing falls
            back to the default listed below
    Returns:
        Whatever Mapper.dict_to_camel produces for the assembled dict
        (camelCase keys, per the conversion comment in the original code)
    """
    # Fallback value for every tunable OCR option, in payload order.
    option_defaults = {
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "use_textline_orientation": False,
        "text_det_limit_side_len": 736,
        "text_det_limit_type": "max",
        "text_det_thresh": 0.3,
        "text_det_box_thresh": 0.6,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
    }

    snake_payload = {
        "file": file_input,
        "file_type": file_type,
        "visualize": visualize,
    }
    for option_name, fallback in option_defaults.items():
        snake_payload[option_name] = options.get(option_name, fallback)

    return Mapper.dict_to_camel(snake_payload)
def analyze_image(
    client: ProviderClient,
    file_input: str,
    file_type: int,
    visualize: bool = False
) -> Dict[str, Any]:
    """
    Probe the input with a single fast-mode OCR call and recommend a mode.

    All three preprocessing switches (orientation classify, unwarping,
    textline orientation) are turned off for the probe call. The recognised
    texts and scores of every page are then scored with
    QualityEvaluator.evaluate and mapped to a recommendation:

        score >= 0.65            -> "fast"
        0.45 <= score < 0.65     -> "auto"
        score < 0.45             -> "quality"

    Args:
        client: Provider client; client.call(payload) must return a
            (response_dict, status_code, elapsed_ms) triple.
        file_input: URL or base64 string
        file_type: 0=PDF, 1=Image
        visualize: Whether to request visualization from the provider

    Returns:
        {
            "quality_score": float,
            "text_items": int,
            "avg_confidence": float,
            "recommended_mode": str,
            "reason": str,
            "fast_result": dict or None,  # Full provider response; None on error
            "analysis_time_ms": ...,      # Elapsed time reported by client.call
            "error": str                  # Present ONLY when the probe failed
        }
    """
    logger.info("Analyzing image to recommend mode...")

    # Use fast mode (cheapest) for analysis
    fast_options = {
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "use_textline_orientation": False
    }
    payload = build_payload(file_input, file_type, visualize, fast_options)
    provider_resp, _status_code, elapsed_ms = client.call(payload)

    # A missing errorCode is treated as a failure (-1), same as a non-zero one.
    if provider_resp.get("errorCode", -1) != 0:
        return {
            "quality_score": 0.0,
            "text_items": 0,
            "avg_confidence": 0.0,
            "recommended_mode": "quality",
            "reason": f"Analysis failed (error {provider_resp.get('errorCode')}), recommend quality mode for robustness",
            "fast_result": None,
            # Reported on failure too, so both return shapes carry the timing.
            "analysis_time_ms": elapsed_ms,
            "error": provider_resp.get("errorMsg", "Unknown error")
        }

    # Flatten texts/scores across all pages. `or` (rather than a .get default)
    # also covers keys that are present but explicitly null in the JSON.
    all_rec_texts: List[Any] = []
    all_rec_scores: List[Any] = []
    result = provider_resp.get("result") or {}
    for ocr_res in result.get("ocrResults") or []:
        pruned = ocr_res.get("prunedResult") or {}
        all_rec_texts.extend(pruned.get("rec_texts") or [])
        all_rec_scores.extend(pruned.get("rec_scores") or [])

    quality = QualityEvaluator.evaluate(all_rec_texts, all_rec_scores)

    # Recommend mode based on quality score
    score = quality["quality_score"]
    text_items = quality["text_items"]
    if score >= 0.80:
        recommended_mode = "fast"
        reason = f"Excellent quality (score: {score:.2f}). Fast mode is sufficient."
    elif score >= 0.65:
        recommended_mode = "fast"
        reason = f"Good quality (score: {score:.2f}). Fast mode recommended, but auto mode can optimize further."
    elif score >= 0.45:
        recommended_mode = "auto"
        reason = f"Medium quality (score: {score:.2f}). Auto mode recommended for adaptive retry."
    elif text_items > 0:
        recommended_mode = "quality"
        reason = f"Low quality (score: {score:.2f}). Quality mode recommended for better preprocessing."
    else:
        recommended_mode = "quality"
        reason = "No text detected. Quality mode recommended to enable all corrections."

    return {
        "quality_score": score,
        "text_items": text_items,
        "avg_confidence": quality["avg_rec_score"],
        "recommended_mode": recommended_mode,
        "reason": reason,
        "fast_result": provider_resp,
        "analysis_time_ms": elapsed_ms
    }
def _collect_recognition(provider_resp: Dict[str, Any]) -> tuple:
    """Flatten rec_texts and rec_scores across every page of a provider response."""
    rec_texts: List[Any] = []
    rec_scores: List[Any] = []
    # `or` also covers keys that are present but explicitly null in the JSON.
    result = provider_resp.get("result") or {}
    for ocr_res in result.get("ocrResults") or []:
        pruned = ocr_res.get("prunedResult") or {}
        rec_texts.extend(pruned.get("rec_texts") or [])
        rec_scores.extend(pruned.get("rec_scores") or [])
    return rec_texts, rec_scores


def _no_successful_attempt_response(
    request_id: str,
    api_url: Any,
    mode: str,
    attempts_history: List[Dict[str, Any]]
) -> Dict[str, Any]:
    """Build the error envelope returned when no attempt produced a result."""
    return {
        "ok": False,
        "request_id": request_id,
        "provider": {
            "api_url": api_url,
            "status_code": 500,
            "log_id": None
        },
        "result": None,
        "quality": None,
        "agent_trace": {
            "mode": mode,
            "selected_attempt": 0,
            "attempts": attempts_history
        },
        "raw_provider": None,
        "error": {
            "code": "PROVIDER_ERROR",
            "message": "All attempts failed",
            "details": {}
        }
    }


def run_ocr(
    client: ProviderClient,
    file_input: str,
    file_type: int,
    mode: str,
    max_attempts: int,
    budget_ms: int,
    quality_target: float,
    visualize: bool,
    return_raw: bool,
    cache: Optional[SimpleCache] = None
) -> Dict[str, Any]:
    """
    Run OCR with auto mode support.

    The attempt plan comes from AgentPolicy.get_attempts_config(mode,
    max_attempts). Attempts are executed in order until one of these happens:
    the plan is exhausted, the time budget is used up, the provider reports an
    error, or (auto mode only) an attempt reaches ``quality_target``. The
    attempt with the highest quality score is normalized and returned.

    Args:
        client: ProviderClient instance
        file_input: URL or base64 string
        file_type: 0=PDF, 1=Image
        mode: 'fast', 'quality', or 'auto'
        max_attempts: Max attempts for auto mode
        budget_ms: Max total time budget. It is checked before each attempt
            starts; an attempt already in flight is not interrupted.
        quality_target: Target quality score for auto mode
        visualize: Whether to return visualization
        return_raw: Whether to include raw provider response
        cache: Optional cache instance. Only consulted and filled when mode is
            not 'auto'; the key is derived from the input and the first
            attempt's options.
    Returns:
        Normalized output dict. If no attempt could be made at all (empty
        plan or budget already spent) an ``ok: False`` envelope with error
        code PROVIDER_ERROR is returned instead.
    """
    request_id = f"req_{uuid.uuid4().hex[:12]}"
    start_time = time.time()

    # Get attempt configurations
    attempts_config = AgentPolicy.get_attempts_config(mode, max_attempts)

    # Cache check (only for fast/quality mode, not auto). The key is computed
    # once and reused when storing the result. An empty plan yields no key
    # instead of an IndexError on attempts_config[0].
    cache_key = None
    if cache and mode != "auto" and attempts_config:
        cache_key = SimpleCache.make_key(file_input, attempts_config[0])
        cached = cache.get(cache_key)
        if cached:
            logger.info("Cache hit for request %s", request_id)
            return cached

    attempts_history: List[Dict[str, Any]] = []
    best_attempt = None
    best_quality = -1.0

    for attempt_num, options in enumerate(attempts_config, start=1):
        # Check budget
        elapsed_ms = (time.time() - start_time) * 1000
        if elapsed_ms >= budget_ms:
            logger.warning("Budget exceeded after %d attempts", attempt_num - 1)
            break

        logger.info(
            "Attempt %d/%d with options: %s",
            attempt_num, len(attempts_config), options
        )

        payload = build_payload(file_input, file_type, visualize, options)
        provider_resp, status_code, provider_time_ms = client.call(payload)

        # A missing errorCode is treated as a failure (-1).
        error_code = provider_resp.get("errorCode", -1)
        if error_code != 0:
            logger.error("Attempt %d failed with errorCode=%s", attempt_num, error_code)
            attempts_history.append({
                "attempt": attempt_num,
                "provider_time_ms": round(provider_time_ms, 2),
                "quality_score": 0.0,
                "avg_rec_score": 0.0,
                "text_items": 0,
                "warnings": [f"Provider error: {provider_resp.get('errorMsg', 'Unknown')}"],
                "options_effective": options
            })
            # NOTE(review): a provider error ends the run immediately, even
            # when an earlier attempt succeeded. That earlier result is
            # discarded. Confirm this fail-fast behaviour is intended for
            # auto mode.
            return Normalizer.normalize_response(
                provider_resp,
                request_id,
                client.api_url,
                status_code,
                mode,
                attempt_num,
                attempts_history,
                return_raw
            )

        # Success - evaluate quality over all pages
        all_rec_texts, all_rec_scores = _collect_recognition(provider_resp)
        quality = QualityEvaluator.evaluate(all_rec_texts, all_rec_scores)
        attempts_history.append({
            "attempt": attempt_num,
            "provider_time_ms": round(provider_time_ms, 2),
            "quality_score": quality["quality_score"],
            "avg_rec_score": quality["avg_rec_score"],
            "text_items": quality["text_items"],
            "warnings": quality["warnings"],
            "options_effective": options
        })

        # Track best. Strict '>' keeps the earliest attempt on ties.
        if quality["quality_score"] > best_quality:
            best_quality = quality["quality_score"]
            best_attempt = {
                "attempt_num": attempt_num,
                "provider_resp": provider_resp,
                "status_code": status_code
            }

        logger.info("Attempt %d quality_score: %.4f", attempt_num, quality["quality_score"])

        # Stop if quality target met (only for auto mode)
        if mode == "auto" and quality["quality_score"] >= quality_target:
            logger.info("Quality target %s met, stopping early", quality_target)
            break

    if best_attempt is None:
        # No attempt was made (empty plan or budget already exhausted).
        return _no_successful_attempt_response(
            request_id, client.api_url, mode, attempts_history
        )

    # Normalize best result
    normalized = Normalizer.normalize_response(
        best_attempt["provider_resp"],
        request_id,
        client.api_url,
        best_attempt["status_code"],
        mode,
        best_attempt["attempt_num"],
        attempts_history,
        return_raw
    )

    # Cache result (only for fast/quality mode; cache_key is None otherwise)
    if cache_key is not None and normalized["ok"]:
        cache.set(cache_key, normalized)

    return normalized
+# =============================================================================
+# CLI
+# =============================================================================
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the OCR caller."""
    parser = argparse.ArgumentParser(
        description="PP-OCRv5 API Caller - OCR images/PDFs via Paddle AI Studio",
        # Keep the epilog's line breaks and indentation; the default formatter
        # would reflow it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Configuration:
  Configuration is read from .env file in project root.
  First-time setup:
    python scripts/configure.py
  Or manually create .env file:
    API_URL=https://your-subdomain.aistudio-app.com/ocr
    PADDLE_OCR_TOKEN=your_token_here
        """
    )
    # Note: Configuration is handled via .env file
    # Run: python scripts/configure.py to set up

    # Input (one of these required)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--file-url", help="URL to image or PDF")
    input_group.add_argument("--file-path", help="Local path to image or PDF")
    input_group.add_argument("--file-base64", help="Base64-encoded file (no data URI prefix)")

    # File type
    parser.add_argument(
        "--file-type",
        choices=["auto", "pdf", "image"],
        default="auto",
        help="File type (default: auto)"
    )

    # Mode
    mode_group = parser.add_argument_group('mode selection')
    mode_group.add_argument(
        "--mode",
        choices=["fast", "quality", "auto"],
        default="auto",
        help="OCR mode: fast (single quick call), quality (single high-quality call), auto (adaptive retry)"
    )
    mode_group.add_argument(
        "--analyze",
        action="store_true",
        help="Analyze image and recommend mode (low cost, no full OCR)"
    )
    mode_group.add_argument(
        "--interactive",
        action="store_true",
        help="Analyze first, then prompt for mode selection"
    )

    # Auto mode options
    parser.add_argument(
        "--max-attempts",
        type=int,
        default=3,
        help="Max attempts for auto mode (default: 3)"
    )
    parser.add_argument(
        "--budget-ms",
        type=int,
        default=25000,
        help="Max total time budget in ms (default: 25000)"
    )
    parser.add_argument(
        "--quality-target",
        type=float,
        default=0.72,
        help="Target quality score for auto mode (default: 0.72)"
    )

    # Output options
    parser.add_argument(
        "--visualize",
        action="store_true",
        help="Request visualization from provider (may increase response size/time)"
    )
    parser.add_argument(
        "--return-raw-provider",
        action="store_true",
        help="Include raw provider response in output"
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output"
    )
    parser.add_argument(
        "--output", "-o",
        metavar="FILE",
        help="Save result to JSON file (absolute or relative path)"
    )
    return parser


def _print_analysis_metrics(analysis: Dict[str, Any]) -> None:
    """Print the score, item count and confidence of a successful analysis."""
    print(f"\nQuality Score:  {analysis['quality_score']:.2f} / 1.00")
    print(f"Text Items:     {analysis['text_items']}")
    print(f"Avg Confidence: {analysis['avg_confidence']:.2f}")


def _prompt_for_mode(recommended_mode: str) -> Optional[str]:
    """Show the mode menu and return the chosen mode, or None on cancel.

    Raises EOFError (from input()) when stdin has no data.
    """
    mode_by_choice = {
        "1": recommended_mode,
        "2": "fast",
        "3": "quality",
        "4": "auto",
    }
    print("\nOptions:")
    print(f"  1. Use recommended mode ({recommended_mode})")
    print("  2. Use fast mode (fastest)")
    print("  3. Use quality mode (most accurate)")
    print("  4. Use auto mode (adaptive)")
    print("  5. Cancel")
    while True:
        choice = input("\nYour choice [1-5]: ").strip()
        if choice == "5":
            return None
        if choice in mode_by_choice:
            return mode_by_choice[choice]
        print("Invalid choice. Please enter 1-5.")


def main():
    """Command-line entry point.

    Exit codes:
        0  success (or the user cancelled interactive mode)
        2  bad input, unreadable input file, or configuration error
        3  provider auth / quota error
        4  any other OCR or analysis failure
        5  the --output file could not be written
    """
    args = _build_arg_parser().parse_args()

    # Determine file input. The final branch is reachable when an input
    # option was given an empty string.
    if args.file_url:
        file_input = args.file_url
    elif args.file_path:
        try:
            file_input = load_file_as_base64(args.file_path)
        except OSError as exc:
            # Missing/unreadable file: report it instead of a raw traceback.
            print(f"Error: Cannot read file {args.file_path}: {exc}", file=sys.stderr)
            sys.exit(2)
    elif args.file_base64:
        file_input = args.file_base64
    else:
        print("Error: No file input provided", file=sys.stderr)
        sys.exit(2)

    # Determine file type. For base64 input there is no name to inspect, so
    # auto-detection falls back to its default.
    if args.file_type == "auto":
        file_type = detect_file_type(args.file_url or args.file_path or "")
    elif args.file_type == "pdf":
        file_type = 0
    else:
        file_type = 1

    # Load config from .env file
    try:
        api_url = Config.get_api_url()
        token = Config.get_token()
        timeout_ms = Config.get_timeout_ms()
        max_retry = Config.get_max_retry()
        cache_ttl_sec = Config.get_cache_ttl_sec()
    except ValueError as e:
        print(f"\nConfiguration error: {e}", file=sys.stderr)
        sys.exit(2)

    client = ProviderClient(api_url, token, timeout_ms, max_retry)
    cache = SimpleCache(cache_ttl_sec)
    banner = "=" * 60

    # sys.exit() raises SystemExit, so the finally clause below still closes
    # the client on every exit path inside this try block.
    try:
        # Mode 1: Analyze only (no full OCR)
        if args.analyze:
            print(banner)
            print("Image Analysis (Fast Mode Test)")
            print(banner)
            analysis = analyze_image(client, file_input, file_type, args.visualize)
            if "error" in analysis:
                print(f"\nAnalysis Error: {analysis['error']}")
                print(f"Recommendation: {analysis['recommended_mode']}")
                print(f"Reason: {analysis['reason']}\n")
                sys.exit(4)
            _print_analysis_metrics(analysis)
            print(f"Analysis Time:  {analysis['analysis_time_ms']:.0f} ms")
            print(f"\nRecommendation: --mode {analysis['recommended_mode']}")
            print(f"Reason: {analysis['reason']}")
            print("\n" + banner)
            print("To run OCR with recommended mode:")
            print(f"  python ocr_caller.py --mode {analysis['recommended_mode']} \\")
            if args.file_url:
                print(f"    --file-url \"{args.file_url}\"")
            elif args.file_path:
                print(f"    --file-path \"{args.file_path}\"")
            else:
                # Avoid leaving the command ending in a dangling backslash.
                print("    --file-base64 \"<base64-data>\"")
            print(banner + "\n")
            sys.exit(0)

        mode = args.mode

        # Mode 2: Interactive mode (analyze + prompt for mode)
        if args.interactive:
            print("\n" + banner)
            print("Interactive Mode Selection")
            print(banner)
            print("\nStep 1: Analyzing image (fast mode test)...")
            analysis = analyze_image(client, file_input, file_type, args.visualize)
            if "error" in analysis:
                print(f"\nAnalysis Error: {analysis['error']}")
                print(f"Recommendation: {analysis['recommended_mode']}")
            else:
                _print_analysis_metrics(analysis)
            print(f"\nRecommended Mode: {analysis['recommended_mode']}")
            print(f"Reason: {analysis['reason']}")
            print("\n" + banner)
            print("Step 2: Select mode to use")
            print(banner)
            try:
                selected_mode = _prompt_for_mode(analysis['recommended_mode'])
            except EOFError:
                # stdin is closed or empty (e.g. run from a pipe or a script).
                print("\nError: --interactive needs input on stdin", file=sys.stderr)
                sys.exit(2)
            if selected_mode is None:
                print("Cancelled.")
                sys.exit(0)
            print(f"\nSelected mode: {selected_mode}")
            print("Running OCR...\n")
            mode = selected_mode

        # Mode 3: Normal mode (run OCR directly)
        result = run_ocr(
            client,
            file_input,
            file_type,
            mode,
            args.max_attempts,
            args.budget_ms,
            args.quality_target,
            args.visualize,
            args.return_raw_provider,
            cache
        )

        # Prepare JSON output
        indent = 2 if args.pretty else None
        json_output = json.dumps(result, indent=indent, ensure_ascii=False)

        if args.output:
            # Bind the name before the try so the error messages below can
            # always refer to it, even if resolve() itself fails.
            output_path = Path(args.output)
            try:
                output_path = output_path.resolve()
                # Create directory if not exists
                output_path.parent.mkdir(parents=True, exist_ok=True)
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(json_output)
                # Status goes to stderr so stdout stays free for JSON output
                print(f"Result saved to: {output_path}", file=sys.stderr)
            except PermissionError:
                print(f"Error: Permission denied to write to {output_path}", file=sys.stderr)
                sys.exit(5)
            except OSError as e:
                print(f"Error: Cannot write to {output_path}: {e}", file=sys.stderr)
                sys.exit(5)
        else:
            # No --output: print to stdout
            print(json_output)

        # Exit code
        if result["ok"]:
            sys.exit(0)
        error_code = result["error"]["code"]
        if error_code in ("PROVIDER_AUTH_ERROR", "PROVIDER_QUOTA_EXCEEDED"):
            sys.exit(3)
        # Overload, timeout and every other failure share exit code 4.
        sys.exit(4)
    finally:
        client.close()


if __name__ == "__main__":
    main()

package/templates/ppocrv5/scripts/ppocrv5/requirements.txt ADDED Viewed

@@ -0,0 +1,4 @@
+# Runtime dependencies for PP-OCRv5 API Skill
+httpx>=0.24.0
+python-dotenv>=0.19.0