npm - paddleocr-skills - Versions diffs - 1.0.0 - Mend

paddleocr-skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/README.md +220 -0
package/bin/paddleocr-skills.js +20 -0
package/lib/copy.js +39 -0
package/lib/installer.js +70 -0
package/lib/prompts.js +67 -0
package/lib/python.js +75 -0
package/lib/verify.js +121 -0
package/package.json +42 -0
package/templates/.env.example +12 -0
package/templates/paddleocr-vl/references/paddleocr-vl/layout_schema.md +64 -0
package/templates/paddleocr-vl/references/paddleocr-vl/output_format.md +154 -0
package/templates/paddleocr-vl/references/paddleocr-vl/vl_model_spec.md +157 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/_lib.py +780 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/configure.py +270 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/optimize_file.py +226 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements-optimize.txt +8 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements.txt +7 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/smoke_test.py +199 -0
package/templates/paddleocr-vl/scripts/paddleocr-vl/vl_caller.py +232 -0
package/templates/paddleocr-vl/skills/paddleocr-vl/SKILL.md +481 -0
package/templates/ppocrv5/references/ppocrv5/agent_policy.md +258 -0
package/templates/ppocrv5/references/ppocrv5/normalized_schema.md +257 -0
package/templates/ppocrv5/references/ppocrv5/provider_api.md +140 -0
package/templates/ppocrv5/scripts/ppocrv5/_lib.py +635 -0
package/templates/ppocrv5/scripts/ppocrv5/configure.py +346 -0
package/templates/ppocrv5/scripts/ppocrv5/ocr_caller.py +684 -0
package/templates/ppocrv5/scripts/ppocrv5/requirements.txt +4 -0
package/templates/ppocrv5/scripts/ppocrv5/smoke_test.py +139 -0
package/templates/ppocrv5/skills/ppocrv5/SKILL.md +272 -0

package/templates/ppocrv5/scripts/ppocrv5/smoke_test.py ADDED Viewed

@@ -0,0 +1,139 @@
+#!/usr/bin/env python3
+"""
+Smoke Test for PP-OCRv5 API Skill
+Verifies that AISTUDIO_HOST and PADDLE_OCR_TOKEN are correctly configured
+and that the provider API is accessible.
+"""
+import json
+import os
+import subprocess
+import sys
+from pathlib import Path
+def main():
+    print("=" * 60)
+    print("PP-OCRv5 API Skill - Smoke Test")
+    print("=" * 60)
+    # Check configuration (all sources)
+    print("\n[1/3] Checking configuration...")
+    # Add scripts dir to path for imports
+    script_dir = Path(__file__).parent
+    sys.path.insert(0, str(script_dir))
+    from _lib import Config
+    try:
+        # Try to get config from .env file
+        api_url = Config.get_api_url()
+        token = Config.get_token()
+    except ValueError as e:
+        print(f"\nConfiguration error: {e}")
+        sys.exit(1)
+    test_file_url = os.getenv("TEST_FILE_URL", "").strip()
+    if not test_file_url:
+        print("WARNING: TEST_FILE_URL is not set, using default test image")
+        # Use a default public test image (Chinese text)
+        test_file_url = "https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.7/doc/imgs/11.jpg"
+    print(f"  API_URL: {api_url}")
+    print(f"  PADDLE_OCR_TOKEN: {'*' * 8}{token[-4:] if len(token) > 4 else '****'}")
+    print(f"  TEST_FILE_URL: {test_file_url}")
+    # Run ocr_caller.py
+    print("\n[2/3] Running OCR on test file...")
+    script_dir = Path(__file__).parent
+    ocr_caller = script_dir / "ocr_caller.py"
+    cmd = [
+        sys.executable,
+        str(ocr_caller),
+        "--mode", "auto",
+        "--file-url", test_file_url,
+        "--max-attempts", "2",
+        "--budget-ms", "20000"
+    ]
+    try:
+        result = subprocess.run(
+            cmd,
+            capture_output=True,
+            text=True,
+            timeout=30
+        )
+    except subprocess.TimeoutExpired:
+        print("ERROR: OCR call timed out after 30 seconds")
+        sys.exit(4)
+    except Exception as e:
+        print(f"ERROR: Failed to run ocr_caller.py: {e}")
+        sys.exit(4)
+    # Parse output
+    print("\n[3/3] Validating response...")
+    if result.returncode != 0:
+        print(f"ERROR: ocr_caller.py exited with code {result.returncode}")
+        print("\nStderr:")
+        print(result.stderr)
+        print("\nStdout:")
+        print(result.stdout)
+        sys.exit(result.returncode)
+    try:
+        response = json.loads(result.stdout)
+    except json.JSONDecodeError as e:
+        print(f"ERROR: Failed to parse JSON response: {e}")
+        print("\nStdout:")
+        print(result.stdout)
+        sys.exit(4)
+    # Validate response structure
+    if not response.get("ok"):
+        error = response.get("error", {})
+        print(f"ERROR: OCR failed with error code: {error.get('code')}")
+        print(f"Message: {error.get('message')}")
+        print(f"\nFull response:\n{json.dumps(response, indent=2, ensure_ascii=False)}")
+        sys.exit(3)
+    # Check for content
+    result_data = response.get("result", {})
+    full_text = result_data.get("full_text", "")
+    pages = result_data.get("pages", [])
+    total_items = sum(len(page.get("items", [])) for page in pages)
+    if not full_text and total_items == 0:
+        print("WARNING: OCR succeeded but returned no text. This may indicate:")
+        print("  - The test image is blank or unreadable")
+        print("  - Provider API is working but returned empty results")
+        print(f"\nFull response:\n{json.dumps(response, indent=2, ensure_ascii=False)}")
+        # Still pass, as API is working
+    else:
+        print(f"SUCCESS: OCR completed")
+        print(f"  - Total text items: {total_items}")
+        print(f"  - Quality score: {response.get('quality', {}).get('quality_score', 0):.4f}")
+        print(f"  - Avg confidence: {response.get('quality', {}).get('avg_rec_score', 0):.4f}")
+        print(f"  - Mode: {response.get('agent_trace', {}).get('mode')}")
+        print(f"  - Attempts: {len(response.get('agent_trace', {}).get('attempts', []))}")
+        # Print first 200 chars of text
+        if full_text:
+            preview = full_text[:200].replace("\n", " ")
+            if len(full_text) > 200:
+                preview += "..."
+            print(f"\n  Preview: {preview}")
+    print("\n" + "=" * 60)
+    print("Smoke test PASSED")
+    print("=" * 60)
+    sys.exit(0)
+if __name__ == "__main__":  # run the smoke test only when executed as a script
+    main()

package/templates/ppocrv5/skills/ppocrv5/SKILL.md ADDED Viewed

@@ -0,0 +1,272 @@
+---
+name: ppocrv5
+description: >
+  Use this skill when users need to extract text from images, PDFs, or documents. Supports URLs and local files,
+  with adaptive quality modes. Returns structured JSON containing recognized text, confidence scores, and quality metrics.
+---
+# PP-OCRv5 API Skill
+## When to Use This Skill
+Invoke this skill in the following situations:
+- Extract text from images (screenshots, photos, scans, charts)
+- Read text from PDF or document images
+- Perform OCR on any visual content containing text
+- Parse structured documents (invoices, receipts, forms, tables)
+- Recognize text in photos taken by mobile phones
+- Extract text from URLs pointing to images or PDFs
+Do not use this skill in the following situations:
+- Plain text files that can be read directly with the Read tool
+- Code files or markdown documents
+- Tasks that do not involve image-to-text conversion
+## How to Use This Skill
+**⛔ MANDATORY RESTRICTIONS - DO NOT VIOLATE ⛔**
+1. **ONLY use PP-OCRv5 API** - Execute the script `python scripts/ppocrv5/ocr_caller.py`
+2. **NEVER use Claude's built-in vision** - Do NOT read images yourself
+3. **NEVER offer alternatives** - Do NOT suggest "I can try to read it" or similar
+4. **IF API fails** - Display the error message and STOP immediately
+5. **NO fallback methods** - Do NOT attempt OCR any other way
+If the script execution fails (API not configured, network error, etc.):
+- Show the error message to the user
+- Do NOT offer to help using your vision capabilities
+- Do NOT ask "Would you like me to try reading it?"
+- Simply stop and wait for the user to fix the configuration
+### Basic Workflow
+1. **Identify the input source**:
+   - User provides URL: Use the `--file-url` parameter
+   - User provides local file path: Use the `--file-path` parameter
+   - User uploads image: Save it first, then use `--file-path`
+2. **Execute OCR**:
+   ```bash
+   python scripts/ppocrv5/ocr_caller.py --file-url "URL provided by user" --pretty
+   ```
+   Or for local files:
+   ```bash
+   python scripts/ppocrv5/ocr_caller.py --file-path "file path" --pretty
+   ```
+   **Save result to file** (recommended):
+   ```bash
+   python scripts/ppocrv5/ocr_caller.py --file-url "URL" --output result.json --pretty
+   ```
+   - The script will display: `Result saved to: /absolute/path/to/result.json`
+   - This message appears on stderr; the JSON itself is saved to the file
+   - **Tell the user the file path** shown in the message
+3. **Parse JSON response**:
+   - Check the `ok` field: `true` means success, `false` means error
+   - Extract text: `result.full_text` contains all recognized text
+   - Get quality: `quality.quality_score` indicates recognition confidence (0.0-1.0)
+   - Handle errors: If `ok` is false, display `error.message`
+4. **Present results to user**:
+   - Display extracted text in a readable format
+   - If quality score is low (<0.5), alert the user
+   - If structured output is needed, use `result.pages[].items[]` to get line-by-line data
+### IMPORTANT: Complete Output Display
+**CRITICAL**: Always display the COMPLETE recognized text to the user. Do NOT truncate or summarize the OCR results.
+- The script returns the full JSON with complete text content in `result.full_text`
+- **You MUST display the entire `full_text` content to the user**, no matter how long it is
+- Do NOT use phrases like "Here's a summary" or "The text begins with..."
+- Do NOT truncate with "..." unless the text truly exceeds reasonable display limits
+- The user expects to see ALL the recognized text, not a preview or excerpt
+**Correct approach**:
+```
+I've extracted the text from the image. Here's the complete content:
+[Display the entire result.full_text here]
+Quality Score: 0.85 / 1.00 (Good quality recognition)
+```
+**Incorrect approach** ❌:
+```
+I found some text in the image. Here's a preview:
+"The quick brown fox..." (truncated)
+```
+### Mode Selection
+Always use `--mode auto` (default) unless the user explicitly requests otherwise:
+| User Request | Use Mode | Command Flag |
+|--------------|----------|--------------|
+| Default/unspecified | Auto (adaptive) | `--mode auto` (or omit) |
+| "Quick recognition" / "fast" | Fast | `--mode fast` |
+| "High precision" / "accurate" | Quality | `--mode quality` |
+**Auto mode** (recommended): Automatically makes 1-3 attempts, progressively increasing correction levels, and returns the best result.
+### Usage Examples
+**Example 1: Simple URL OCR**
+```bash
+python scripts/ppocrv5/ocr_caller.py --file-url "https://example.com/invoice.jpg" --pretty
+```
+**Example 2: Local File OCR**
+```bash
+python scripts/ppocrv5/ocr_caller.py --file-path "./document.pdf" --pretty
+```
+**Example 3: Fast Mode for Clear Images**
+```bash
+python scripts/ppocrv5/ocr_caller.py --file-url "URL" --mode fast --pretty
+```
+### Understanding the Output
+The script outputs a JSON structure like the following:
+```json
+{
+  "ok": true,
+  "result": {
+    "full_text": "All recognized text here...",
+    "pages": [...]
+  },
+  "quality": {
+    "quality_score": 0.85,
+    "text_items": 42
+  }
+}
+```
+**Key fields to extract**:
+- `result.full_text`: Complete text for the user
+- `quality.quality_score`: 0.72+ is good, <0.5 is poor
+- `error.message`: If `ok` is false, provides error description
+### First-Time Configuration
+**When API is not configured**:
+The error will show:
+```
+Configuration error: API not configured. Get your API at: https://aistudio.baidu.com/paddleocr/task
+```
+**Auto-configuration workflow**:
+1. **Show the exact error message** to user (including the URL)
+2. **Tell user to provide credentials**:
+   ```
+   Please visit the URL above to get your API_URL and TOKEN.
+   Once you have them, send them to me and I'll configure it automatically.
+   ```
+3. **When user provides credentials** (accept any format):
+   - `API_URL=https://xxx.aistudio-app.com/ocr, TOKEN=abc123...`
+   - `Here's my API: https://xxx and token: abc123`
+   - Copy-pasted code format
+   - Any other reasonable format
+4. **Parse credentials from user's message**:
+   - Extract API_URL value (look for URLs with aistudio-app.com or similar)
+   - Extract TOKEN value (long alphanumeric string, usually 40+ chars)
+5. **Configure automatically**:
+   ```bash
+   python scripts/ppocrv5/configure.py --api-url "PARSED_URL" --token "PARSED_TOKEN"
+   ```
+6. **If configuration succeeds**:
+   - Inform user: "Configuration complete! Running OCR now..."
+   - Retry the original OCR task
+7. **If configuration fails**:
+   - Show the error
+   - Ask user to verify the credentials
+**IMPORTANT**: The error message format is STRICT and must be shown exactly as provided by the script. Do not modify or paraphrase it.
+**Authentication failed (403)**:
+```
+error_code: PROVIDER_AUTH_ERROR
+```
+→ Token is invalid, reconfigure with correct credentials
+**Quota exceeded (429)**:
+```
+error_code: PROVIDER_QUOTA_EXCEEDED
+```
+→ Daily API quota exhausted, inform user to wait or upgrade
+**No text detected**:
+```
+quality_score: 0.0, text_items: 0
+```
+→ Image may be blank, corrupted, or contain no text
+## Quality Interpretation
+When presenting results to users, consider the quality score:
+| Quality Score | Explanation to User |
+|---------------|---------------------|
+| 0.90 - 1.00 | Excellent recognition quality |
+| 0.72 - 0.89 | Good recognition quality (default target) |
+| 0.50 - 0.71 | Fair recognition quality, may have some errors |
+| 0.00 - 0.49 | Poor recognition quality or no text detected |
+If the quality score is below 0.5, mention this to the user and suggest:
+- Try using `--mode quality` for better accuracy
+- Check if the image is clear and contains text
+- Provide a higher resolution image if possible
+## Advanced Options
+Use only when explicitly requested by the user:
+**Include raw provider response** (for debugging):
+```bash
+python scripts/ppocrv5/ocr_caller.py --file-url "URL" --return-raw-provider
+```
+**Request visualization** (show detection regions):
+```bash
+python scripts/ppocrv5/ocr_caller.py --file-url "URL" --visualize
+```
+**Adjust auto mode parameters**:
+```bash
+python scripts/ppocrv5/ocr_caller.py --file-url "URL" \
+  --max-attempts 2 \
+  --quality-target 0.80 \
+  --budget-ms 20000
+```
+## Reference Documentation
+For in-depth understanding of the OCR system, refer to:
+- `references/ppocrv5/agent_policy.md` - Auto mode strategy and quality scoring
+- `references/ppocrv5/normalized_schema.md` - Complete output schema specification
+- `references/ppocrv5/provider_api.md` - Provider API contract details
+Load these reference documents into context when:
+- Debugging complex issues
+- User asks about quality scoring algorithm
+- Need to understand adaptive retry mechanism
+- Customizing auto mode parameters
+## Testing the Skill
+To verify the skill is working properly:
+```bash
+python scripts/ppocrv5/smoke_test.py
+```
+This tests configuration and API connectivity.