PyPI - paddleocr-api - Versions diffs - 0.0.1__py3-none-any.whl - Mend

paddleocr-api 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

paddleocr_api/__init__.py +7 -0
paddleocr_api/config.py +13 -0
paddleocr_api/constants.py +26 -0
paddleocr_api/exceptions.py +22 -0
paddleocr_api/models/__init__.py +16 -0
paddleocr_api/models/aistudio_client.py +175 -0
paddleocr_api/models/job.py +337 -0
paddleocr_api/models/model.py +18 -0
paddleocr_api/models/optional_payload.py +121 -0
paddleocr_api/models/result.py +210 -0
paddleocr_api/utils/__init__.py +0 -0
paddleocr_api/utils/enum.py +8 -0
paddleocr_api/utils/regex.py +9 -0
paddleocr_api-0.0.1.dist-info/METADATA +399 -0
paddleocr_api-0.0.1.dist-info/RECORD +18 -0
paddleocr_api-0.0.1.dist-info/WHEEL +5 -0
paddleocr_api-0.0.1.dist-info/licenses/LICENSE +201 -0
paddleocr_api-0.0.1.dist-info/top_level.txt +1 -0

paddleocr_api/__init__.py ADDED Viewed

@@ -0,0 +1,7 @@
+"""
+Paddle OCR API
+"""
+__version__ = "0.0.1"
+from .models import AistudioClient, Job, State, Result, Model, OptionalPayload

paddleocr_api/config.py ADDED Viewed

@@ -0,0 +1,13 @@
+"""
+constant::AISTUDIO_ACCESS_TOKEN
+"""
+import os
+from dotenv import load_dotenv
+# Load the .env file (runs as a side effect of importing this module)
+load_dotenv()
+# Token used to authenticate AI Studio users; None when the variable is not set
+AISTUDIO_ACCESS_TOKEN = os.getenv('AISTUDIO_ACCESS_TOKEN')

paddleocr_api/constants.py ADDED Viewed

@@ -0,0 +1,26 @@
+"""
+Constants.
+"""
+from urllib.parse import urljoin
+# The base URL of the Paddle OCR service
+BASE_URL = "https://paddleocr.aistudio-app.com/"
+# The version of the API
+API_VERSION = "v2"
+# The subpath of the API interface.
+# It starts with '/', so urljoin replaces any path already present in the base URL.
+API_PATH = f"/api/{API_VERSION}/"
+# The URL of the API interface
+# Like 'https://paddleocr.aistudio-app.com/api/v2/'
+API_URL = urljoin(BASE_URL, API_PATH)
+# The subpath of the OCR job interface.
+# It is relative (no leading '/'), so it is appended after the trailing '/' of API_URL.
+JOB_PATH = "ocr/jobs"
+# URL interface for creating and querying tasks
+# Like 'https://paddleocr.aistudio-app.com/api/v2/ocr/jobs'
+JOB_URL = urljoin(API_URL, JOB_PATH)

paddleocr_api/exceptions.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""
+errors
+"""
+class PaddleOCRError(Exception):
+    """Base class of every error related to the Paddle OCR API."""
+class AistudioClientError(PaddleOCRError):
+    """Error related to AistudioClient."""
+class JobError(PaddleOCRError):
+    """Error related to Job."""
+class JobCreationError(AistudioClientError, JobError):
+    """
+    The error that occurs when creating a job.
+    It derives from both AistudioClientError and JobError, so an `except`
+    clause for either of them also catches it.
+    """
+class JobStatusQueryError(JobError):
+    """The error that occurs when querying a job's status."""

paddleocr_api/models/__init__.py ADDED Viewed

@@ -0,0 +1,16 @@
+"""
+API wrappers
+"""
+from .aistudio_client import AistudioClient
+from .job import Job, State
+from .model import Model
+from .optional_payload import (
+	AuxiliaryLayoutElement, LayoutShapeMode, PromptLabel,
+	OptionalPayload,
+)
+from .result import (
+	PrunedResult, Markdown, LayoutParsingResult,
+	PageSizeInfo, DataInfo,
+	Result,
+)

paddleocr_api/models/aistudio_client.py ADDED Viewed

@@ -0,0 +1,175 @@
+"""
+class::AistudioClient
+"""
+import json
+from typing import Optional
+from urllib.parse import urljoin
+import aiofiles
+import httpx
+from typing_extensions import Self
+from ..exceptions import AistudioClientError, JobCreationError
+from ..constants import (
+    BASE_URL,
+    API_VERSION,
+    JOB_PATH,
+)
+from ..config import AISTUDIO_ACCESS_TOKEN
+from .model import Model
+from .optional_payload import OptionalPayload
+from .job import Job
+class AistudioClient:
+    """Client that requests APIs of Paddle OCR."""
+    def __init__(
+        self,
+        *,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        api_version: Optional[str] = None,
+        http_client: Optional[httpx.AsyncClient] = None,
+        **kwargs
+    ):
+        """
+        Args:
+            api_key (str): to obtain: https://aistudio.baidu.com/account/accessToken
+                It can be passed in through the environment variable `AISTUDIO_ACCESS_TOKEN`.
+            base_url (str): The base URL of the Paddle OCR service,
+                defaults to `"https://paddleocr.aistudio-app.com/"`.
+            api_version (str): The version of the API, defaults to `"v2"`.
+            http_client (httpx.AsyncClient): An HTTP client similar to `httpx.AsyncClient`, used for sending requests.
+            kwargs: The initialization parameters passed to `http_client`.
+        Raises:
+            AistudioClientError: If the api_key parameter is not received.
+        """
+        # Access Token
+        if api_key is None:
+            api_key = AISTUDIO_ACCESS_TOKEN
+        if api_key is None:
+            raise AistudioClientError(
+                "The api_key client option must be set either by passing api_key to the client "
+                "or by setting the AISTUDIO_ACCESS_TOKEN environment variable"
+            )
+        self.api_key = api_key
+        # URL
+        if base_url is None:
+            base_url = BASE_URL
+        if api_version is None:
+            api_version = API_VERSION
+        self.base_url = base_url
+        self.api_version = api_version
+        # HTTP Client
+        self._http_client_is_local = http_client is None
+        if self._http_client_is_local:
+            http_client = httpx.AsyncClient(**kwargs)
+        self._client = http_client
+        self._kwargs = kwargs
+    async def __aenter__(self) -> Self:
+        if self._http_client_is_local:
+            await self._client.__aenter__()
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._http_client_is_local:
+            await self._client.__aexit__(exc_type, exc_val, exc_tb)
+    async def aclose(self) -> None:
+        """Close the client within the instance."""
+        if self._http_client_is_local:
+            await self._client.aclose()
+    @property
+    def api_url(self) -> str:
+        """
+        The URL of API interface.
+        Returns like:
+            'https://paddleocr.aistudio-app.com/api/v2/'
+        """
+        api_path = f"/api/{self.api_version}/"
+        return urljoin(self.base_url, api_path)
+    @property
+    def job_url(self) -> str:
+        """
+        URL interface for creating and querying tasks
+        Returns like:
+            'https://paddleocr.aistudio-app.com/api/v2/ocr/jobs'
+        """
+        return urljoin(self.api_url, JOB_PATH)
+    async def create_job(
+        self,
+        model: Model = Model.DEFAULT,
+        file_bytes: Optional[bytes] = None,
+        *,
+        file_path: Optional[str] = None,
+        file_url: Optional[str] = None,
+        optional_payload: Optional[OptionalPayload] = None,
+        **kwargs
+    ) -> Job:
+        """
+        Suitable for PaddleOCR-VL series / PP-STuctureV3 model.
+        Args:
+            model (str): Model for processing documents.
+        """
+        if (file_path is None) and (file_bytes is None) and (file_url is None):
+            raise TypeError("At least one of `file_bytes`, `file_path` and `file_url` must be provided.")
+        kwargs["method"] = "POST"
+        kwargs["url"] = self.job_url
+        kwargs.setdefault("headers", {}).update({
+            "Authorization": f"bearer {self.api_key}"
+        })
+        if optional_payload is None:
+            optional_payload = {}
+        if (file_bytes is None) and (file_path is not None):
+            async with aiofiles.open(file_path, mode="rb") as file:
+                file_bytes = await file.read()
+        if file_bytes is None:
+            # file_url is enable
+            kwargs.setdefault("json", {}).update({
+                "fileUrl": file_url,
+                "model": model,
+                "optionalPayload": optional_payload,
+            })
+        else:
+            # file_bytes or file_path is enable
+            kwargs.setdefault("data", {}).update({
+                "model": model,
+                "optionalPayload": json.dumps(optional_payload),
+            })
+            kwargs.setdefault("files", {}).update({
+                "file": file_bytes
+            })
+        # Send request
+        response = await self._client.request(**kwargs)
+        if response.status_code != 200:
+            raise JobCreationError(response.text)
+        job_id = response.json()["data"]["jobId"]
+        return Job(job_id=job_id, aistudio_client=self)

paddleocr_api/models/job.py ADDED Viewed

@@ -0,0 +1,337 @@
+"""
+enum::State
+class::Job
+"""
+from __future__ import annotations
+import asyncio
+from datetime import datetime
+from functools import reduce
+import time
+import json
+from numbers import Number
+from typing import Dict, List, Optional, Self, TYPE_CHECKING
+from urllib.parse import urljoin
+import httpx
+from ..exceptions import JobStatusQueryError
+from .result import Result
+try:
+    from enum import StrEnum
+except ImportError:
+    from ..utils.enum import StrEnum
+if TYPE_CHECKING:
+    from .aistudio_client import AistudioClient
+    from .result import Markdown
+class State(StrEnum):
+    """
+    The processing state of the job.
+    Members are strings, so they compare equal to their plain string values.
+    """
+    PENDING = "pending"
+    RUNNING = "running"
+    DONE    = "done"
+    FAILED  = "failed"
+    # Used by Job.state as the default when the status carries no "state" key
+    UNKNOWN = "unknown"
+class Job:
+    """Track the progress of task execution."""
+    def __init__(
+        self,
+        job_id: str,
+        aistudio_client: AistudioClient,
+        *,
+        http_client: Optional[httpx.AsyncClient] = None,
+        status_update_interval: Number = 2,
+        **kwargs
+    ):
+        """
+        Args:
+            job_id (str): The identifier of the job.
+            aistudio_client (AistudioClient): The AistudiClient object creating this job.
+            http_client (httpx.AsyncClient): An HTTP client similar to `httpx.AsyncClient`, used for sending requests.
+            status_update_interval (Number): The minimum time interval (in seconds) between two status queries.
+            kwargs: The initialization parameters passed to `http_client`.
+        """
+        self.id = job_id
+        self._aistudio_client = aistudio_client
+        # HTTP Client
+        self._http_client_is_local = http_client is None
+        if self._http_client_is_local:
+            http_client = httpx.AsyncClient(**kwargs)
+        self._client = http_client
+        # Status query management
+        self.status_update_interval = max(status_update_interval, 0)
+        self._last_update_status_time = 0
+        self._status_query_task = None
+        self._status_query_lock = asyncio.Lock()  # 并发锁
+        # Status cache
+        self._status: Dict[str, str | Dict[str, str | int]] = {}
+        # Result cache
+        self._result: Optional[Result] = None
+    def __str__(self) -> str:
+        return f"{type(self).__name__}<{self.id}>"
+    async def __aenter__(self) -> Self:
+        if self._http_client_is_local:
+            await self._client.__aenter__()
+        return self
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        if self._http_client_is_local:
+            await self._client.__aexit__(exc_type, exc_val, exc_tb)
+    async def aclose(self) -> None:
+        """Close the client within the instance."""
+        if self._http_client_is_local:
+            await self._client.aclose()
+    @property
+    def api_key(self) -> str:
+        """Aistudio Access Token"""
+        return self._aistudio_client.api_key
+    @property
+    def status_url(self) -> str:
+        """The URL used to query the status of this job."""
+        return urljoin(f"{self._aistudio_client.job_url}/", self.id)
+    async def query_status(self, **kwargs) -> Dict[str, str | Dict[str, str | int]]:
+        """
+        Query the processing status of the task.
+        Args:
+            kwargs: The keyword arguments passed to `self._client.request`.
+        Returns like:
+        - If processing:
+            {
+                'extractProgress': {
+                    'extractedPages': 3,
+                    'startTime': '2026-05-30 14:27:39',
+                    'totalPages': 8
+                },
+               'jobId': '540***8',
+               'state': 'running'
+            }
+        - If done:
+            {
+                'extractProgress': {
+                    'endTime': '2026-05-30 14:30:29',
+                    'extractedPages': 8,
+                    'startTime': '2026-05-30 14:30:18',
+                    'totalPages': 8
+                },
+                'jobId': '540***8',
+                'resultUrl': {
+                    'jsonUrl': 'https://bj.bcebos.com/v1/paddleocr-store/job/b5e...6/json/788...f.json?authorization=bce-auth-v1%2F...'
+                },
+                'state': 'done'
+            }
+        - If failed:
+            {
+                'errorMsg': '系统错误-聚合',
+                'jobId': '540***8',
+                'state': 'failed'
+            }
+        Raises:
+            JobStatusQueryError: If the response is not 200 OK or JSON parsing fails.
+        """
+        kwargs["method"] = "GET"
+        kwargs["url"] = self.status_url
+        kwargs.setdefault("headers", {}).update({
+            "Authorization": f"bearer {self.api_key}"
+        })
+        response = await self._client.request(**kwargs)
+        try:
+            resp_json = response.json()
+            json_parse_success = True
+        except json.JSONDecodeError:
+            resp_json = {}
+            json_parse_success = False
+        if (response.status_code != 200) or (not json_parse_success):
+            raise JobStatusQueryError(resp_json.get("msg", response.text))
+        return resp_json.get("data", {})
+    async def query_status_safe(self, **kwargs) -> Dict[str, str | Dict[str, str | int]]:
+        """
+        Same as `query_status`.
+        But when encountering an exception, it will not throw, but return the cached object.
+        """
+        try:
+            return await self.query_status(**kwargs)
+        except JobStatusQueryError:
+            return self._status.copy()
+    @property
+    async def status(self) -> Dict[str, str | Dict[str, str | int]]:
+        """
+        Get the real-time status of the task (with self.status_update_interval seconds of cache).
+        """
+        async with self._status_query_lock:
+            need_query = (
+                self._status_query_task is None
+                and (time.time() - self._last_update_status_time) > self.status_update_interval
+            )
+            if need_query:
+                self._status_query_task = asyncio.create_task(self.query_status_safe())
+        if self._status_query_task is not None:
+            query_result = await self._status_query_task
+            self._last_update_status_time = time.time()
+            self._status_query_task = None
+            self._status.update(query_result)
+        return self._status
+    @property
+    async def state(self) -> State:
+        """The processing state"""
+        return State((await self.status).get("state", State.UNKNOWN))
+    @property
+    async def result_json_url(self) -> Optional[str]:
+        """JSON URL for processing results"""
+        return (await self.status).get("resultUrl", {}).get("jsonUrl", None)
+    @property
+    async def extract_progress(self) -> Dict[str, str | int]:
+        """
+        The progress of page extraction for this job.
+        Returns like:
+        - If extracting:
+            {
+                'extractedPages': 3,
+                'startTime': '2026-05-30 14:27:39',
+                'totalPages': 8
+            }
+        - If extracted:
+            {
+                'endTime': '2026-05-30 14:30:29',
+                'extractedPages': 8,
+                'startTime': '2026-05-30 14:30:18',
+                'totalPages': 8
+            }
+        """
+        return (await self.status).get("extractProgress", {})
+    @property
+    async def extracted_pages(self) -> Optional[int]:
+        """The number of extracted pages."""
+        return (await self.extract_progress).get("extractedPages")
+    @property
+    async def total_pages(self) -> Optional[int]:
+        """The number of total pages."""
+        return (await self.extract_progress).get("totalPages")
+    @property
+    async def start_time(self) -> Optional[datetime]:
+        """The start processing time of the job."""
+        time_str = (await self.extract_progress).get("startTime")
+        if time_str is None:
+            return None
+        return datetime.fromisoformat(time_str)
+    @property
+    async def end_time(self) -> Optional[datetime]:
+        """The completion time of the job."""
+        time_str = (await self.extract_progress).get("endTime")
+        if time_str is None:
+            return None
+        return datetime.fromisoformat(time_str)
+    @property
+    async def error_message(self) -> Optional[str]:
+        """errorMsg returned when processing failure"""
+        return (await self.status).get("errorMsg")
+    @property
+    async def result(self) -> Optional[Result]:
+        """
+        Return the complete parsing result of the job.
+        """
+        # try to fetch cache
+        if self._result is not None:
+            return self._result
+        # get result url
+        result_json_url = await self.result_json_url
+        if result_json_url is None:
+            return None
+        # fetch result
+        jsonl_response = await self._client.get(result_json_url)
+        jsonl_response.raise_for_status()
+        # parse result
+        results: List[Result] = []
+        for line in jsonl_response.text.strip().split('\n'):
+            line = line.strip()
+            if not line:
+                continue
+            result = Result.from_json(json.loads(line)["result"])
+            results.append(result)
+        # cache result
+        self._result = reduce(Result.extend, results)
+        return self._result
+    @property
+    async def markdown(self) -> Optional[Markdown]:
+        """
+        Return the Markdown formatted parsing result.
+        """
+        # try to get the result
+        result = await self.result
+        if result is None:
+            return None
+        # extract markdown
+        return result.markdown

paddleocr_api/models/model.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""
+enum::Model
+"""
+try:
+    from enum import StrEnum
+except ImportError:
+    from ..utils.enum import StrEnum
+class Model(StrEnum):
+    """
+    Available models.
+    Members are strings, so they compare equal to their plain string values.
+    """
+    PP_OCR_V5 = "PP-OCRv5"
+    PADDLE_OCR = "PaddleOCR"
+    PADDLE_OCR_VL = "PaddleOCR-VL"
+    PADDLE_OCR_VL_1_5 = "PaddleOCR-VL-1.5" # expected to be retired on 2026/06/17
+    PADDLE_OCR_VL_1_6 = "PaddleOCR-VL-1.6"
+    # Alias of PADDLE_OCR_VL_1_6 (same value, so it is not a separate member)
+    DEFAULT = PADDLE_OCR_VL_1_6