paddleocr-skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. package/README.md +220 -0
  2. package/bin/paddleocr-skills.js +20 -0
  3. package/lib/copy.js +39 -0
  4. package/lib/installer.js +70 -0
  5. package/lib/prompts.js +67 -0
  6. package/lib/python.js +75 -0
  7. package/lib/verify.js +121 -0
  8. package/package.json +42 -0
  9. package/templates/.env.example +12 -0
  10. package/templates/paddleocr-vl/references/paddleocr-vl/layout_schema.md +64 -0
  11. package/templates/paddleocr-vl/references/paddleocr-vl/output_format.md +154 -0
  12. package/templates/paddleocr-vl/references/paddleocr-vl/vl_model_spec.md +157 -0
  13. package/templates/paddleocr-vl/scripts/paddleocr-vl/_lib.py +780 -0
  14. package/templates/paddleocr-vl/scripts/paddleocr-vl/configure.py +270 -0
  15. package/templates/paddleocr-vl/scripts/paddleocr-vl/optimize_file.py +226 -0
  16. package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements-optimize.txt +8 -0
  17. package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements.txt +7 -0
  18. package/templates/paddleocr-vl/scripts/paddleocr-vl/smoke_test.py +199 -0
  19. package/templates/paddleocr-vl/scripts/paddleocr-vl/vl_caller.py +232 -0
  20. package/templates/paddleocr-vl/skills/paddleocr-vl/SKILL.md +481 -0
  21. package/templates/ppocrv5/references/ppocrv5/agent_policy.md +258 -0
  22. package/templates/ppocrv5/references/ppocrv5/normalized_schema.md +257 -0
  23. package/templates/ppocrv5/references/ppocrv5/provider_api.md +140 -0
  24. package/templates/ppocrv5/scripts/ppocrv5/_lib.py +635 -0
  25. package/templates/ppocrv5/scripts/ppocrv5/configure.py +346 -0
  26. package/templates/ppocrv5/scripts/ppocrv5/ocr_caller.py +684 -0
  27. package/templates/ppocrv5/scripts/ppocrv5/requirements.txt +4 -0
  28. package/templates/ppocrv5/scripts/ppocrv5/smoke_test.py +139 -0
  29. package/templates/ppocrv5/skills/ppocrv5/SKILL.md +272 -0
@@ -0,0 +1,139 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Smoke Test for PP-OCRv5 API Skill
4
+ Verifies that the API URL and PADDLE_OCR_TOKEN are correctly configured
5
+ and that the provider API is accessible.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import subprocess
11
+ import sys
12
+ from pathlib import Path
13
+
14
+
15
def _section(mapping, key):
    """Return ``mapping[key]`` when it is a dict, otherwise an empty dict.

    Guards against keys that are present in the JSON response but hold
    ``null`` (or another non-object value), which would otherwise break
    chained ``.get()`` calls.
    """
    value = mapping.get(key)
    return value if isinstance(value, dict) else {}


def _score(value):
    """Return *value* as a float, or 0.0 when it is missing or not numeric."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return 0.0
    return float(value)


def main():
    """Run the PP-OCRv5 smoke test and terminate the process via ``sys.exit``.

    Steps:
      1. Load the API URL and token through ``_lib.Config``.
      2. Run ``ocr_caller.py`` (located next to this script) on a test image.
      3. Validate the JSON document that ``ocr_caller.py`` prints on stdout.

    Environment:
      TEST_FILE_URL - optional URL of the file to OCR; a public sample image
      is used when it is unset or blank.

    Exit codes:
      0 - the OCR call succeeded (an empty result only produces a warning)
      1 - configuration error (``Config`` raised ``ValueError``)
      3 - the response was parsed but its ``ok`` field is falsy
      4 - ocr_caller.py could not be run, timed out, or printed output that
          is not a JSON object
      other - a non-zero exit code propagated from ocr_caller.py
    """
    print("=" * 60)
    print("PP-OCRv5 API Skill - Smoke Test")
    print("=" * 60)

    # Check configuration (all sources)
    print("\n[1/3] Checking configuration...")

    # Add scripts dir to path so the sibling _lib module can be imported
    script_dir = Path(__file__).parent
    sys.path.insert(0, str(script_dir))

    from _lib import Config

    try:
        api_url = Config.get_api_url()
        token = Config.get_token()
    except ValueError as e:
        print(f"\nConfiguration error: {e}")
        sys.exit(1)

    test_file_url = os.getenv("TEST_FILE_URL", "").strip()

    if not test_file_url:
        print("WARNING: TEST_FILE_URL is not set, using default test image")
        # Use a default public test image (Chinese text)
        test_file_url = "https://raw.githubusercontent.com/PaddlePaddle/PaddleOCR/release/2.7/doc/imgs/11.jpg"

    print(f" API_URL: {api_url}")
    # Only the last four characters of the token are ever echoed.
    print(f" PADDLE_OCR_TOKEN: {'*' * 8}{token[-4:] if len(token) > 4 else '****'}")
    print(f" TEST_FILE_URL: {test_file_url}")

    # Run ocr_caller.py
    print("\n[2/3] Running OCR on test file...")

    ocr_caller = script_dir / "ocr_caller.py"

    cmd = [
        sys.executable,
        str(ocr_caller),
        "--mode", "auto",
        "--file-url", test_file_url,
        "--max-attempts", "2",
        "--budget-ms", "20000",
    ]

    # The recognized text may contain non-ASCII characters (the default test
    # image is Chinese). Force UTF-8 on both ends of the pipe so decoding does
    # not depend on the platform's locale encoding.
    child_env = dict(os.environ, PYTHONIOENCODING="utf-8")

    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            encoding="utf-8",
            errors="replace",
            env=child_env,
            timeout=30,
        )
    except subprocess.TimeoutExpired:
        print("ERROR: OCR call timed out after 30 seconds")
        sys.exit(4)
    except Exception as e:
        # Top-level boundary: report any launch failure and exit cleanly.
        print(f"ERROR: Failed to run ocr_caller.py: {e}")
        sys.exit(4)

    # Parse output
    print("\n[3/3] Validating response...")

    if result.returncode != 0:
        print(f"ERROR: ocr_caller.py exited with code {result.returncode}")
        print("\nStderr:")
        print(result.stderr)
        print("\nStdout:")
        print(result.stdout)
        sys.exit(result.returncode)

    try:
        response = json.loads(result.stdout)
    except json.JSONDecodeError as e:
        print(f"ERROR: Failed to parse JSON response: {e}")
        print("\nStdout:")
        print(result.stdout)
        sys.exit(4)

    # Valid JSON that is not an object (e.g. a list or a bare string) cannot
    # be inspected with .get(); treat it like unparseable output.
    if not isinstance(response, dict):
        print("ERROR: Failed to parse JSON response: top-level value is not an object")
        print("\nStdout:")
        print(result.stdout)
        sys.exit(4)

    # Validate response structure
    if not response.get("ok"):
        error = _section(response, "error")
        print(f"ERROR: OCR failed with error code: {error.get('code')}")
        print(f"Message: {error.get('message')}")
        print(f"\nFull response:\n{json.dumps(response, indent=2, ensure_ascii=False)}")
        sys.exit(3)

    # Check for content
    result_data = _section(response, "result")
    full_text = result_data.get("full_text") or ""
    pages = result_data.get("pages") or []

    total_items = sum(
        len(page.get("items") or [])
        for page in pages
        if isinstance(page, dict)
    )

    if not full_text and total_items == 0:
        print("WARNING: OCR succeeded but returned no text. This may indicate:")
        print(" - The test image is blank or unreadable")
        print(" - Provider API is working but returned empty results")
        print(f"\nFull response:\n{json.dumps(response, indent=2, ensure_ascii=False)}")
        # Still pass, as API is working
    else:
        quality = _section(response, "quality")
        trace = _section(response, "agent_trace")

        print("SUCCESS: OCR completed")
        print(f" - Total text items: {total_items}")
        print(f" - Quality score: {_score(quality.get('quality_score')):.4f}")
        print(f" - Avg confidence: {_score(quality.get('avg_rec_score')):.4f}")
        print(f" - Mode: {trace.get('mode')}")
        print(f" - Attempts: {len(trace.get('attempts') or [])}")

        # Print first 200 chars of text
        if full_text:
            preview = full_text[:200].replace("\n", " ")
            if len(full_text) > 200:
                preview += "..."
            print(f"\n Preview: {preview}")

    print("\n" + "=" * 60)
    print("Smoke test PASSED")
    print("=" * 60)
    sys.exit(0)


if __name__ == "__main__":
    main()
@@ -0,0 +1,272 @@
1
+ ---
2
+ name: ppocrv5
3
+ description: >
4
+ Use this skill when users need to extract text from images, PDFs, or documents. Supports URLs and local files,
5
+ with adaptive quality modes. Returns structured JSON containing recognized text, confidence scores, and quality metrics.
6
+ ---
7
+
8
+ # PP-OCRv5 API Skill
9
+
10
+ ## When to Use This Skill
11
+
12
+ Invoke this skill in the following situations:
13
+ - Extract text from images (screenshots, photos, scans, charts)
14
+ - Read text from PDF or document images
15
+ - Perform OCR on any visual content containing text
16
+ - Parse structured documents (invoices, receipts, forms, tables)
17
+ - Recognize text in photos taken by mobile phones
18
+ - Extract text from URLs pointing to images or PDFs
19
+
20
+ Do not use this skill in the following situations:
21
+ - Plain text files that can be read directly with the Read tool
22
+ - Code files or markdown documents
23
+ - Tasks that do not involve image-to-text conversion
24
+
25
+ ## How to Use This Skill
26
+
27
+ **⛔ MANDATORY RESTRICTIONS - DO NOT VIOLATE ⛔**
28
+
29
+ 1. **ONLY use PP-OCRv5 API** - Execute the script `python scripts/ppocrv5/ocr_caller.py`
30
+ 2. **NEVER use Claude's built-in vision** - Do NOT read images yourself
31
+ 3. **NEVER offer alternatives** - Do NOT suggest "I can try to read it" or similar
32
+ 4. **IF API fails** - Display the error message and STOP immediately
33
+ 5. **NO fallback methods** - Do NOT attempt OCR any other way
34
+
35
+ If the script execution fails (API not configured, network error, etc.):
36
+ - Show the error message to the user
37
+ - Do NOT offer to help using your vision capabilities
38
+ - Do NOT ask "Would you like me to try reading it?"
39
+ - Simply stop and wait for the user to fix the configuration
40
+
41
+ ### Basic Workflow
42
+
43
+ 1. **Identify the input source**:
44
+ - User provides URL: Use the `--file-url` parameter
45
+ - User provides local file path: Use the `--file-path` parameter
46
+ - User uploads image: Save it first, then use `--file-path`
47
+
48
+ 2. **Execute OCR**:
49
+ ```bash
50
+ python scripts/ppocrv5/ocr_caller.py --file-url "URL provided by user" --pretty
51
+ ```
52
+ Or for local files:
53
+ ```bash
54
+ python scripts/ppocrv5/ocr_caller.py --file-path "file path" --pretty
55
+ ```
56
+
57
+ **Save result to file** (recommended):
58
+ ```bash
59
+ python scripts/ppocrv5/ocr_caller.py --file-url "URL" --output result.json --pretty
60
+ ```
61
+ - The script will display: `Result saved to: /absolute/path/to/result.json`
62
+ - This message appears on stderr; the JSON itself is written to the file
63
+ - **Tell the user the file path** shown in the message
64
+
65
+ 3. **Parse JSON response**:
66
+ - Check the `ok` field: `true` means success, `false` means error
67
+ - Extract text: `result.full_text` contains all recognized text
68
+ - Get quality: `quality.quality_score` indicates recognition confidence (0.0-1.0)
69
+ - Handle errors: If `ok` is false, display `error.message`
70
+
71
+ 4. **Present results to user**:
72
+ - Display extracted text in a readable format
73
+ - If quality score is low (<0.5), alert the user
74
+ - If structured output is needed, use `result.pages[].items[]` to get line-by-line data
75
+
76
+ ### IMPORTANT: Complete Output Display
77
+
78
+ **CRITICAL**: Always display the COMPLETE recognized text to the user. Do NOT truncate or summarize the OCR results.
79
+
80
+ - The script returns the full JSON with complete text content in `result.full_text`
81
+ - **You MUST display the entire `full_text` content to the user**, no matter how long it is
82
+ - Do NOT use phrases like "Here's a summary" or "The text begins with..."
83
+ - Do NOT truncate with "..." unless the text truly exceeds reasonable display limits
84
+ - The user expects to see ALL the recognized text, not a preview or excerpt
85
+
86
+ **Correct approach**:
87
+ ```
88
+ I've extracted the text from the image. Here's the complete content:
89
+
90
+ [Display the entire result.full_text here]
91
+
92
+ Quality Score: 0.85 / 1.00 (Good quality recognition)
93
+ ```
94
+
95
+ **Incorrect approach** ❌:
96
+ ```
97
+ I found some text in the image. Here's a preview:
98
+ "The quick brown fox..." (truncated)
99
+ ```
100
+
101
+ ### Mode Selection
102
+
103
+ Always use `--mode auto` (default) unless the user explicitly requests otherwise:
104
+
105
+ | User Request | Use Mode | Command Flag |
106
+ |--------------|----------|--------------|
107
+ | Default/unspecified | Auto (adaptive) | `--mode auto` (or omit) |
108
+ | "Quick recognition" / "fast" | Fast | `--mode fast` |
109
+ | "High precision" / "accurate" | Quality | `--mode quality` |
110
+
111
+ **Auto mode** (recommended): Automatically tries 1-3 times, progressively increasing correction levels, returning the best result.
112
+
113
+ ### Usage Mode Examples
114
+
115
+ **Mode 1: Simple URL OCR**
116
+ ```bash
117
+ python scripts/ppocrv5/ocr_caller.py --file-url "https://example.com/invoice.jpg" --pretty
118
+ ```
119
+
120
+ **Mode 2: Local File OCR**
121
+ ```bash
122
+ python scripts/ppocrv5/ocr_caller.py --file-path "./document.pdf" --pretty
123
+ ```
124
+
125
+ **Mode 3: Fast Mode for Clear Images**
126
+ ```bash
127
+ python scripts/ppocrv5/ocr_caller.py --file-url "URL" --mode fast --pretty
128
+ ```
129
+
130
+ ### Understanding the Output
131
+
132
+ The script outputs JSON with the following structure:
133
+ ```json
134
+ {
135
+ "ok": true,
136
+ "result": {
137
+ "full_text": "All recognized text here...",
138
+ "pages": [...]
139
+ },
140
+ "quality": {
141
+ "quality_score": 0.85,
142
+ "text_items": 42
143
+ }
144
+ }
145
+ ```
146
+
147
+ **Key fields to extract**:
148
+ - `result.full_text`: Complete text for the user
149
+ - `quality.quality_score`: 0.72+ is good, <0.5 is poor
150
+ - `error.message`: If `ok` is false, provides error description
151
+
152
+ ### First-Time Configuration
153
+
154
+ **When API is not configured**:
155
+
156
+ The error will show:
157
+ ```
158
+ Configuration error: API not configured. Get your API at: https://aistudio.baidu.com/paddleocr/task
159
+ ```
160
+
161
+ **Auto-configuration workflow**:
162
+
163
+ 1. **Show the exact error message** to user (including the URL)
164
+
165
+ 2. **Tell user to provide credentials**:
166
+ ```
167
+ Please visit the URL above to get your API_URL and TOKEN.
168
+ Once you have them, send them to me and I'll configure it automatically.
169
+ ```
170
+
171
+ 3. **When user provides credentials** (accept any format):
172
+ - `API_URL=https://xxx.aistudio-app.com/ocr, TOKEN=abc123...`
173
+ - `Here's my API: https://xxx and token: abc123`
174
+ - Copy-pasted code format
175
+ - Any other reasonable format
176
+
177
+ 4. **Parse credentials from user's message**:
178
+ - Extract API_URL value (look for URLs with aistudio-app.com or similar)
179
+ - Extract TOKEN value (long alphanumeric string, usually 40+ chars)
180
+
181
+ 5. **Configure automatically**:
182
+ ```bash
183
+ python scripts/ppocrv5/configure.py --api-url "PARSED_URL" --token "PARSED_TOKEN"
184
+ ```
185
+
186
+ 6. **If configuration succeeds**:
187
+ - Inform user: "Configuration complete! Running OCR now..."
188
+ - Retry the original OCR task
189
+
190
+ 7. **If configuration fails**:
191
+ - Show the error
192
+ - Ask user to verify the credentials
193
+
194
+ **IMPORTANT**: The error message format is STRICT and must be shown exactly as provided by the script. Do not modify or paraphrase it.
195
+
196
+ **Authentication failed (403)**:
197
+ ```
198
+ error_code: PROVIDER_AUTH_ERROR
199
+ ```
200
+ → Token is invalid, reconfigure with correct credentials
201
+
202
+ **Quota exceeded (429)**:
203
+ ```
204
+ error_code: PROVIDER_QUOTA_EXCEEDED
205
+ ```
206
+ → Daily API quota exhausted, inform user to wait or upgrade
207
+
208
+ **No text detected**:
209
+ ```
210
+ quality_score: 0.0, text_items: 0
211
+ ```
212
+ → Image may be blank, corrupted, or contain no text
213
+
214
+ ## Quality Interpretation
215
+
216
+ When presenting results to users, consider the quality score:
217
+
218
+ | Quality Score | Explanation to User |
219
+ |---------------|---------------------|
220
+ | 0.90 - 1.00 | Excellent recognition quality |
221
+ | 0.72 - 0.89 | Good recognition quality (default target) |
222
+ | 0.50 - 0.71 | Fair recognition quality, may have some errors |
223
+ | 0.00 - 0.49 | Poor recognition quality or no text detected |
224
+
225
+ If the quality score is below 0.5, tell the user and suggest:
226
+ - Try using `--mode quality` for better accuracy
227
+ - Check if the image is clear and contains text
228
+ - Provide a higher resolution image if possible
229
+
230
+ ## Advanced Options
231
+
232
+ Use only when explicitly requested by the user:
233
+
234
+ **Include raw provider response** (for debugging):
235
+ ```bash
236
+ python scripts/ppocrv5/ocr_caller.py --file-url "URL" --return-raw-provider
237
+ ```
238
+
239
+ **Request visualization** (show detection regions):
240
+ ```bash
241
+ python scripts/ppocrv5/ocr_caller.py --file-url "URL" --visualize
242
+ ```
243
+
244
+ **Adjust auto mode parameters**:
245
+ ```bash
246
+ python scripts/ppocrv5/ocr_caller.py --file-url "URL" \
247
+ --max-attempts 2 \
248
+ --quality-target 0.80 \
249
+ --budget-ms 20000
250
+ ```
251
+
252
+ ## Reference Documentation
253
+
254
+ For in-depth understanding of the OCR system, refer to:
255
+ - `references/ppocrv5/agent_policy.md` - Auto mode strategy and quality scoring
256
+ - `references/ppocrv5/normalized_schema.md` - Complete output schema specification
257
+ - `references/ppocrv5/provider_api.md` - Provider API contract details
258
+
259
+ Load these reference documents into context when:
260
+ - Debugging complex issues
261
+ - The user asks about the quality scoring algorithm
262
+ - You need to understand the adaptive retry mechanism
263
+ - Customizing auto mode parameters
264
+
265
+ ## Testing the Skill
266
+
267
+ To verify the skill is working properly:
268
+ ```bash
269
+ python scripts/ppocrv5/smoke_test.py
270
+ ```
271
+
272
+ This tests configuration and API connectivity.