paddleocr-skills 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29):
  1. package/README.md +220 -0
  2. package/bin/paddleocr-skills.js +20 -0
  3. package/lib/copy.js +39 -0
  4. package/lib/installer.js +70 -0
  5. package/lib/prompts.js +67 -0
  6. package/lib/python.js +75 -0
  7. package/lib/verify.js +121 -0
  8. package/package.json +42 -0
  9. package/templates/.env.example +12 -0
  10. package/templates/paddleocr-vl/references/paddleocr-vl/layout_schema.md +64 -0
  11. package/templates/paddleocr-vl/references/paddleocr-vl/output_format.md +154 -0
  12. package/templates/paddleocr-vl/references/paddleocr-vl/vl_model_spec.md +157 -0
  13. package/templates/paddleocr-vl/scripts/paddleocr-vl/_lib.py +780 -0
  14. package/templates/paddleocr-vl/scripts/paddleocr-vl/configure.py +270 -0
  15. package/templates/paddleocr-vl/scripts/paddleocr-vl/optimize_file.py +226 -0
  16. package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements-optimize.txt +8 -0
  17. package/templates/paddleocr-vl/scripts/paddleocr-vl/requirements.txt +7 -0
  18. package/templates/paddleocr-vl/scripts/paddleocr-vl/smoke_test.py +199 -0
  19. package/templates/paddleocr-vl/scripts/paddleocr-vl/vl_caller.py +232 -0
  20. package/templates/paddleocr-vl/skills/paddleocr-vl/SKILL.md +481 -0
  21. package/templates/ppocrv5/references/ppocrv5/agent_policy.md +258 -0
  22. package/templates/ppocrv5/references/ppocrv5/normalized_schema.md +257 -0
  23. package/templates/ppocrv5/references/ppocrv5/provider_api.md +140 -0
  24. package/templates/ppocrv5/scripts/ppocrv5/_lib.py +635 -0
  25. package/templates/ppocrv5/scripts/ppocrv5/configure.py +346 -0
  26. package/templates/ppocrv5/scripts/ppocrv5/ocr_caller.py +684 -0
  27. package/templates/ppocrv5/scripts/ppocrv5/requirements.txt +4 -0
  28. package/templates/ppocrv5/scripts/ppocrv5/smoke_test.py +139 -0
  29. package/templates/ppocrv5/skills/ppocrv5/SKILL.md +272 -0
@@ -0,0 +1,684 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ PP-OCRv5 API Caller
4
+ Calls Paddle AI Studio PP-OCRv5 /ocr API with user-provided host/token.
5
+ Supports fast/quality/auto modes with agent-based retry and quality scoring.
6
+ """
7
+
8
+ import argparse
9
+ import base64
10
+ import json
11
+ import logging
12
+ import os
13
+ import sys
14
+ import time
15
+ import uuid
16
+ from pathlib import Path
17
+ from typing import Any, Dict, List, Optional
18
+
19
+ # Add scripts dir to path for imports
20
+ sys.path.insert(0, str(Path(__file__).parent))
21
+
22
+ from _lib import (
23
+ AgentPolicy,
24
+ Config,
25
+ Mapper,
26
+ Normalizer,
27
+ ProviderClient,
28
+ QualityEvaluator,
29
+ SimpleCache
30
+ )
31
+
32
# Configure logging
# PADDLE_OCR_LOG_LEVEL selects the verbosity by level name (DEBUG, INFO, ...).
# A value that is not a numeric logging level falls back to INFO instead of
# raising at import time.
log_level = os.getenv("PADDLE_OCR_LOG_LEVEL", "INFO").strip().upper()
_resolved_log_level = getattr(logging, log_level, None)
if not isinstance(_resolved_log_level, int):
    _resolved_log_level = logging.INFO
logging.basicConfig(
    level=_resolved_log_level,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)
39
+
40
+
41
+ # =============================================================================
42
+ # Main OCR Logic
43
+ # =============================================================================
44
+
45
def detect_file_type(file_input: str) -> int:
    """
    Detect file type from URL or path.

    Matching is case-insensitive and based only on the file extension.

    Args:
        file_input: http(s) URL or local path (may be empty)

    Returns:
        0 for PDF, 1 for Image

    Supported extensions:
    - PDF: .pdf
    - Image: .png, .jpg, .jpeg, .bmp, .tiff, .tif, .webp, .gif

    Anything that is not recognised as a PDF (including unknown or missing
    extensions) is reported as an image.
    """
    import re
    from urllib.parse import unquote, urlparse

    pdf_extensions = (".pdf",)

    # Clean up the input
    file_str = file_input.lower().strip()

    if file_str.startswith(("http://", "https://")):
        try:
            # urlparse already drops the query string and fragment, so a
            # percent-encoded '#' or '?' stays part of the file name.
            file_str = unquote(urlparse(file_str).path)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): fall back to a
            # textual strip of the query/fragment.
            file_str = re.sub(r"[?#].*$", "", file_str)
        return 0 if file_str.endswith(pdf_extensions) else 1

    # Local path: '#' and '?' are legal file-name characters, so test the
    # name as given first.
    if file_str.endswith(pdf_extensions):
        return 0

    # Scheme-less URL such as "host/doc.pdf?download=1"
    without_query = re.sub(r"[?#].*$", "", file_str)
    if without_query.endswith(pdf_extensions):
        return 0

    # Known image extensions and anything uncertain default to image
    return 1
85
+
86
+
87
def load_file_as_base64(file_path: str) -> str:
    """Read a local file in binary mode and return its bytes as a base64 string (no data URI prefix)."""
    with open(file_path, mode="rb") as stream:
        encoded = base64.b64encode(stream.read())
    return encoded.decode("utf-8")
92
+
93
+
94
def build_payload(
    file_input: str,
    file_type: int,
    visualize: bool,
    options: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Assemble the request body sent to the provider.

    Only the option keys listed below are read from ``options``; a missing
    key takes the default shown here.

    Args:
        file_input: URL or base64 string
        file_type: 0=PDF, 1=Image
        visualize: Whether to return visualization
        options: OCR options keyed in snake_case

    Returns:
        The snake_case payload after conversion by Mapper.dict_to_camel
    """
    # Recognised OCR options and their fallback values
    option_defaults = {
        "use_doc_orientation_classify": False,
        "use_doc_unwarping": False,
        "use_textline_orientation": False,
        "text_det_limit_side_len": 736,
        "text_det_limit_type": "max",
        "text_det_thresh": 0.3,
        "text_det_box_thresh": 0.6,
        "text_det_unclip_ratio": 1.5,
        "text_rec_score_thresh": 0.0,
    }

    snake_payload = {
        "file": file_input,
        "file_type": file_type,
        "visualize": visualize,
    }
    for option_name, fallback in option_defaults.items():
        snake_payload[option_name] = options.get(option_name, fallback)

    # The provider expects camelCase keys
    return Mapper.dict_to_camel(snake_payload)
134
+
135
+
136
def analyze_image(
    client: ProviderClient,
    file_input: str,
    file_type: int,
    visualize: bool = False
) -> Dict[str, Any]:
    """
    Probe the input with a single provider call and suggest an OCR mode.

    The probe runs with every optional preprocessing step switched off, then
    scores the recognised text with QualityEvaluator.evaluate.

    Returns:
        On success:
        {
            "quality_score": float,
            "text_items": int,
            "avg_confidence": float,
            "recommended_mode": str,
            "reason": str,
            "fast_result": dict,        # Full provider response of the probe
            "analysis_time_ms": ...     # Elapsed time reported by client.call
        }
        When the provider reports a non-zero errorCode the scores are zero,
        "fast_result" is None, "recommended_mode" is "quality", an "error"
        message is added and "analysis_time_ms" is absent.
    """
    logger.info("Analyzing image to recommend mode...")

    # Probe with all optional corrections disabled (cheapest configuration)
    probe_options = dict.fromkeys(
        (
            "use_doc_orientation_classify",
            "use_doc_unwarping",
            "use_textline_orientation",
        ),
        False,
    )
    probe_payload = build_payload(file_input, file_type, visualize, probe_options)
    response, _status_code, elapsed_ms = client.call(probe_payload)

    # Guard clause: provider-side failure
    if response.get("errorCode", -1) != 0:
        failure_code = response.get('errorCode')
        return {
            "quality_score": 0.0,
            "text_items": 0,
            "avg_confidence": 0.0,
            "recommended_mode": "quality",
            "reason": f"Analysis failed (error {failure_code}), recommend quality mode for robustness",
            "fast_result": None,
            "error": response.get("errorMsg", "Unknown error")
        }

    # Flatten recognised texts and scores across every returned page
    pruned_pages = [
        page.get("prunedResult", {})
        for page in response.get("result", {}).get("ocrResults", [])
    ]
    texts = [text for pruned in pruned_pages for text in pruned.get("rec_texts", [])]
    scores = [value for pruned in pruned_pages for value in pruned.get("rec_scores", [])]

    quality = QualityEvaluator.evaluate(texts, scores)
    score = quality["quality_score"]
    text_items = quality["text_items"]

    # (minimum score, mode, reason template), checked from best to worst
    tiers = (
        (0.80, "fast", "Excellent quality (score: {score:.2f}). Fast mode is sufficient."),
        (0.65, "fast", "Good quality (score: {score:.2f}). Fast mode recommended, but auto mode can optimize further."),
        (0.45, "auto", "Medium quality (score: {score:.2f}). Auto mode recommended for adaptive retry."),
    )
    for minimum, tier_mode, template in tiers:
        if score >= minimum:
            recommended_mode = tier_mode
            reason = template.format(score=score)
            break
    else:
        # Below every tier: quality mode, with the reason depending on
        # whether any text was found at all
        recommended_mode = "quality"
        if text_items > 0:
            reason = f"Low quality (score: {score:.2f}). Quality mode recommended for better preprocessing."
        else:
            reason = "No text detected. Quality mode recommended to enable all corrections."

    return {
        "quality_score": score,
        "text_items": text_items,
        "avg_confidence": quality["avg_rec_score"],
        "recommended_mode": recommended_mode,
        "reason": reason,
        "fast_result": response,
        "analysis_time_ms": elapsed_ms
    }
221
+
222
+
223
def run_ocr(
    client: ProviderClient,
    file_input: str,
    file_type: int,
    mode: str,
    max_attempts: int,
    budget_ms: int,
    quality_target: float,
    visualize: bool,
    return_raw: bool,
    cache: Optional[SimpleCache] = None
) -> Dict[str, Any]:
    """
    Run OCR with auto mode support.

    The attempt configurations come from AgentPolicy.get_attempts_config and
    are tried in order. Each successful attempt is scored with
    QualityEvaluator.evaluate and the highest-scoring one is returned.

    Args:
        client: ProviderClient instance
        file_input: URL or base64 string
        file_type: 0=PDF, 1=Image
        mode: 'fast', 'quality', or 'auto'
        max_attempts: Max attempts for auto mode
        budget_ms: Max total time budget; checked before each attempt starts
        quality_target: Target quality score for auto mode
        visualize: Whether to return visualization
        return_raw: Whether to include raw provider response
        cache: Optional cache instance (consulted for non-auto modes only)

    Returns:
        Normalized output dict. Possible outcomes:
        - a cached result, for non-auto modes on a cache hit;
        - the best successful attempt, normalized;
        - the normalized provider error, when an attempt fails and no earlier
          attempt succeeded;
        - an ``ok: False`` dict with code PROVIDER_ERROR when no attempt ran
          at all (budget already exhausted or no attempt configuration).
    """
    request_id = f"req_{uuid.uuid4().hex[:12]}"
    # Monotonic clock: the budget must not be affected by wall-clock changes
    start_time = time.monotonic()

    # Get attempt configurations
    attempts_config = AgentPolicy.get_attempts_config(mode, max_attempts)

    # Cache check (only for fast/quality mode, not auto). The key is built
    # once and reused when storing the result below.
    cache_key = None
    if cache and mode != "auto" and attempts_config:
        cache_key = SimpleCache.make_key(file_input, attempts_config[0])
        cached = cache.get(cache_key)
        if cached:
            logger.info(f"Cache hit for request {request_id}")
            return cached

    attempts_history = []
    best_attempt = None
    best_quality = -1.0

    for attempt_num, options in enumerate(attempts_config, start=1):
        # Check budget
        elapsed_ms = (time.monotonic() - start_time) * 1000
        if elapsed_ms >= budget_ms:
            logger.warning(f"Budget exceeded after {attempt_num - 1} attempts")
            break

        logger.info(f"Attempt {attempt_num}/{len(attempts_config)} with options: {options}")

        # Build payload and call provider
        payload = build_payload(file_input, file_type, visualize, options)
        provider_resp, status_code, provider_time_ms = client.call(payload)

        # Parse response to get quality
        error_code = provider_resp.get("errorCode", -1)
        if error_code != 0:
            # Error - record and stop
            logger.error(f"Attempt {attempt_num} failed with errorCode={error_code}")
            attempts_history.append({
                "attempt": attempt_num,
                "provider_time_ms": round(provider_time_ms, 2),
                "quality_score": 0.0,
                "avg_rec_score": 0.0,
                "text_items": 0,
                "warnings": [f"Provider error: {provider_resp.get('errorMsg', 'Unknown')}"],
                "options_effective": options
            })
            if best_attempt is not None:
                # A failed retry must not discard a usable earlier result:
                # stop retrying and fall through to return the best attempt.
                logger.warning(
                    f"Attempt {attempt_num} failed; returning best earlier attempt "
                    f"{best_attempt['attempt_num']}"
                )
                break
            # Nothing usable so far: return the error immediately
            return Normalizer.normalize_response(
                provider_resp,
                request_id,
                client.api_url,
                status_code,
                mode,
                attempt_num,
                attempts_history,
                return_raw
            )

        # Success - gather rec_texts and rec_scores from all pages
        result = provider_resp.get("result", {})
        ocr_results = result.get("ocrResults", [])

        all_rec_texts = []
        all_rec_scores = []
        for ocr_res in ocr_results:
            pruned = ocr_res.get("prunedResult", {})
            all_rec_texts.extend(pruned.get("rec_texts", []))
            all_rec_scores.extend(pruned.get("rec_scores", []))

        quality = QualityEvaluator.evaluate(all_rec_texts, all_rec_scores)

        attempts_history.append({
            "attempt": attempt_num,
            "provider_time_ms": round(provider_time_ms, 2),
            "quality_score": quality["quality_score"],
            "avg_rec_score": quality["avg_rec_score"],
            "text_items": quality["text_items"],
            "warnings": quality["warnings"],
            "options_effective": options
        })

        # Track best (strict '>' keeps the earliest attempt on ties)
        if quality["quality_score"] > best_quality:
            best_quality = quality["quality_score"]
            best_attempt = {
                "attempt_num": attempt_num,
                "provider_resp": provider_resp,
                "status_code": status_code
            }

        logger.info(f"Attempt {attempt_num} quality_score: {quality['quality_score']:.4f}")

        # Stop if quality target met (only for auto mode)
        if mode == "auto" and quality["quality_score"] >= quality_target:
            logger.info(f"Quality target {quality_target} met, stopping early")
            break

    # Select best attempt
    if best_attempt is None:
        # No successful attempts
        return {
            "ok": False,
            "request_id": request_id,
            "provider": {
                "api_url": client.api_url,
                "status_code": 500,
                "log_id": None
            },
            "result": None,
            "quality": None,
            "agent_trace": {
                "mode": mode,
                "selected_attempt": 0,
                "attempts": attempts_history
            },
            "raw_provider": None,
            "error": {
                "code": "PROVIDER_ERROR",
                "message": "All attempts failed",
                "details": {}
            }
        }

    # Normalize best result
    normalized = Normalizer.normalize_response(
        best_attempt["provider_resp"],
        request_id,
        client.api_url,
        best_attempt["status_code"],
        mode,
        best_attempt["attempt_num"],
        attempts_history,
        return_raw
    )

    # Cache result (only for fast/quality mode; cache_key is None otherwise)
    if cache_key is not None and normalized["ok"]:
        cache.set(cache_key, normalized)

    return normalized
400
+
401
+
402
+ # =============================================================================
403
+ # CLI
404
+ # =============================================================================
405
+
406
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the OCR caller."""
    parser = argparse.ArgumentParser(
        description="PP-OCRv5 API Caller - OCR images/PDFs via Paddle AI Studio",
        # The epilog is hand-formatted; the default formatter would re-wrap
        # it into a single paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Configuration:
  Configuration is read from .env file in project root.

First-time setup:
  python scripts/configure.py

Or manually create .env file:
  API_URL=https://your-subdomain.aistudio-app.com/ocr
  PADDLE_OCR_TOKEN=your_token_here
"""
    )

    # Note: Configuration is now handled via .env file
    # Run: python scripts/configure.py to set up

    # Input (one of these required)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument("--file-url", help="URL to image or PDF")
    input_group.add_argument("--file-path", help="Local path to image or PDF")
    input_group.add_argument("--file-base64", help="Base64-encoded file (no data URI prefix)")

    # File type
    parser.add_argument(
        "--file-type",
        choices=["auto", "pdf", "image"],
        default="auto",
        help="File type (default: auto)"
    )

    # Mode
    mode_group = parser.add_argument_group('mode selection')
    mode_group.add_argument(
        "--mode",
        choices=["fast", "quality", "auto"],
        default="auto",
        help="OCR mode: fast (single quick call), quality (single high-quality call), auto (adaptive retry)"
    )
    mode_group.add_argument(
        "--analyze",
        action="store_true",
        help="Analyze image and recommend mode (low cost, no full OCR)"
    )
    mode_group.add_argument(
        "--interactive",
        action="store_true",
        help="Analyze first, then prompt for mode selection"
    )

    # Auto mode options
    parser.add_argument(
        "--max-attempts",
        type=int,
        default=3,
        help="Max attempts for auto mode (default: 3)"
    )
    parser.add_argument(
        "--budget-ms",
        type=int,
        default=25000,
        help="Max total time budget in ms (default: 25000)"
    )
    parser.add_argument(
        "--quality-target",
        type=float,
        default=0.72,
        help="Target quality score for auto mode (default: 0.72)"
    )

    # Output options
    parser.add_argument(
        "--visualize",
        action="store_true",
        help="Request visualization from provider (may increase response size/time)"
    )
    parser.add_argument(
        "--return-raw-provider",
        action="store_true",
        help="Include raw provider response in output"
    )
    parser.add_argument(
        "--pretty",
        action="store_true",
        help="Pretty-print JSON output"
    )
    parser.add_argument(
        "--output", "-o",
        metavar="FILE",
        help="Save result to JSON file (absolute or relative path)"
    )
    return parser


def _prompt_for_mode(recommended_mode: str) -> Optional[str]:
    """Show the mode menu and return the chosen mode, or None if the user cancels."""
    print(f"\nOptions:")
    print(f" 1. Use recommended mode ({recommended_mode})")
    print(f" 2. Use fast mode (fastest)")
    print(f" 3. Use quality mode (most accurate)")
    print(f" 4. Use auto mode (adaptive)")
    print(f" 5. Cancel")

    modes_by_choice = {
        "1": recommended_mode,
        "2": "fast",
        "3": "quality",
        "4": "auto",
    }
    while True:
        choice = input("\nYour choice [1-5]: ").strip()
        if choice in modes_by_choice:
            return modes_by_choice[choice]
        if choice == "5":
            print("Cancelled.")
            return None
        print("Invalid choice. Please enter 1-5.")


def _write_result(json_output: str, output_arg: Optional[str]) -> None:
    """Print the JSON to stdout, or save it to ``output_arg`` (exits with 5 on write errors)."""
    if not output_arg:
        # No --output: print to stdout (original behavior)
        print(json_output)
        return

    # Bound before the try so the error messages can always name the target
    output_path = Path(output_arg)
    try:
        output_path = output_path.resolve()

        # Create directory if not exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(json_output)
    except PermissionError:
        print(f"Error: Permission denied to write to {output_path}", file=sys.stderr)
        sys.exit(5)
    except OSError as e:
        print(f"Error: Cannot write to {output_path}: {e}", file=sys.stderr)
        sys.exit(5)

    # Print success message to stderr (so it doesn't mix with JSON output)
    print(f"Result saved to: {output_path}", file=sys.stderr)


def main():
    """
    Command-line entry point.

    Parses arguments, loads the provider settings through Config, then either
    analyzes the input (--analyze), asks for a mode (--interactive) or runs
    OCR directly, and emits the result as JSON.

    Exit codes:
        0: success, analysis finished, or interactive selection cancelled
        2: missing/unreadable input or configuration error
        3: PROVIDER_AUTH_ERROR or PROVIDER_QUOTA_EXCEEDED
        4: any other failed result, or a failed --analyze call
        5: the --output file could not be written
    """
    args = _build_arg_parser().parse_args()

    # Determine file input
    if args.file_url:
        file_input = args.file_url
    elif args.file_path:
        try:
            file_input = load_file_as_base64(args.file_path)
        except OSError as e:
            # Missing or unreadable file: report it instead of a traceback
            print(f"Error: Cannot read {args.file_path}: {e}", file=sys.stderr)
            sys.exit(2)
    elif args.file_base64:
        file_input = args.file_base64
    else:
        # Reached when the chosen input option was given an empty value
        print("Error: No file input provided", file=sys.stderr)
        sys.exit(2)

    # Determine file type
    if args.file_type == "auto":
        file_type = detect_file_type(args.file_url or args.file_path or "")
    elif args.file_type == "pdf":
        file_type = 0
    else:
        file_type = 1

    # Load config from .env file
    try:
        api_url = Config.get_api_url()
        token = Config.get_token()
        timeout_ms = Config.get_timeout_ms()
        max_retry = Config.get_max_retry()
        cache_ttl_sec = Config.get_cache_ttl_sec()
    except ValueError as e:
        print(f"\nConfiguration error: {e}", file=sys.stderr)
        sys.exit(2)

    # Create client and cache
    client = ProviderClient(api_url, token, timeout_ms, max_retry)
    cache = SimpleCache(cache_ttl_sec)

    try:
        # Mode 1: Analyze only (no full OCR)
        if args.analyze:
            print("="*60)
            print("Image Analysis (Fast Mode Test)")
            print("="*60)

            analysis = analyze_image(client, file_input, file_type, args.visualize)

            if "error" in analysis:
                print(f"\nAnalysis Error: {analysis['error']}")
                print(f"Recommendation: {analysis['recommended_mode']}")
                print(f"Reason: {analysis['reason']}\n")
                sys.exit(4)

            print(f"\nQuality Score: {analysis['quality_score']:.2f} / 1.00")
            print(f"Text Items: {analysis['text_items']}")
            print(f"Avg Confidence: {analysis['avg_confidence']:.2f}")
            print(f"Analysis Time: {analysis['analysis_time_ms']:.0f} ms")
            print(f"\nRecommendation: --mode {analysis['recommended_mode']}")
            print(f"Reason: {analysis['reason']}")
            print("\n" + "="*60)
            print("To run OCR with recommended mode:")
            print(f" python ocr_caller.py --mode {analysis['recommended_mode']} \\")
            if args.file_url:
                print(f" --file-url \"{args.file_url}\"")
            elif args.file_path:
                print(f" --file-path \"{args.file_path}\"")
            else:
                # Do not echo the (potentially huge) base64 payload; a
                # placeholder keeps the printed command complete.
                print(" --file-base64 \"<base64 data>\"")
            print("="*60 + "\n")
            sys.exit(0)

        # Mode 2: Interactive mode (analyze + prompt for mode)
        if args.interactive:
            print("\n" + "="*60)
            print("Interactive Mode Selection")
            print("="*60)
            print("\nStep 1: Analyzing image (fast mode test)...")

            analysis = analyze_image(client, file_input, file_type, args.visualize)

            if "error" in analysis:
                print(f"\nAnalysis Error: {analysis['error']}")
                print(f"Recommendation: {analysis['recommended_mode']}")
            else:
                print(f"\nQuality Score: {analysis['quality_score']:.2f} / 1.00")
                print(f"Text Items: {analysis['text_items']}")
                print(f"Avg Confidence: {analysis['avg_confidence']:.2f}")

            print(f"\nRecommended Mode: {analysis['recommended_mode']}")
            print(f"Reason: {analysis['reason']}")
            print("\n" + "="*60)
            print("Step 2: Select mode to use")
            print("="*60)

            selected_mode = _prompt_for_mode(analysis['recommended_mode'])
            if selected_mode is None:
                sys.exit(0)

            print(f"\nSelected mode: {selected_mode}")
            print("Running OCR...\n")

            # Use selected mode
            args.mode = selected_mode

        # Mode 3: Normal mode (run OCR directly)
        result = run_ocr(
            client,
            file_input,
            file_type,
            args.mode,
            args.max_attempts,
            args.budget_ms,
            args.quality_target,
            args.visualize,
            args.return_raw_provider,
            cache
        )

        # Prepare and emit JSON output
        indent = 2 if args.pretty else None
        json_output = json.dumps(result, indent=indent, ensure_ascii=False)
        _write_result(json_output, args.output)

        # Exit code
        if result["ok"]:
            sys.exit(0)
        error_code = result["error"]["code"]
        if error_code in ["PROVIDER_AUTH_ERROR", "PROVIDER_QUOTA_EXCEEDED"]:
            sys.exit(3)
        # Overload, timeout and every other failure share exit code 4
        sys.exit(4)

    finally:
        # Runs on every path, including the sys.exit() calls above
        client.close()


if __name__ == "__main__":
    main()
@@ -0,0 +1,4 @@
1
+ # Runtime dependencies for PP-OCRv5 API Skill
2
+
3
+ httpx>=0.24.0
4
+ python-dotenv>=0.19.0