paddleocr-api 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ """
2
+ Paddle OCR API
3
+ """
4
+
5
+ __version__ = "0.0.1"
6
+
7
+ from .models import AistudioClient, Job, State, Result, Model, OptionalPayload
@@ -0,0 +1,13 @@
1
+ """
2
+ constant::AISTUDIO_ACCESS_TOKEN
3
+ """
4
+
5
+ import os
6
+ from dotenv import load_dotenv
7
+
8
+
9
# Load the .env file. This call runs at import time of this module.
load_dotenv()

# Token used to authenticate AI Studio users, read from the
# `AISTUDIO_ACCESS_TOKEN` environment variable.
AISTUDIO_ACCESS_TOKEN = os.getenv('AISTUDIO_ACCESS_TOKEN')
@@ -0,0 +1,26 @@
1
+ """
2
+ Constants.
3
+ """
4
+
5
+ from urllib.parse import urljoin
6
+
7
+
8
# Root address of the Paddle OCR service.
BASE_URL = "https://paddleocr.aistudio-app.com/"

# API version segment used when building endpoint paths.
API_VERSION = "v2"

# Absolute path prefix of the API interface.
API_PATH = "/api/" + API_VERSION + "/"

# Fully qualified API root,
# e.g. 'https://paddleocr.aistudio-app.com/api/v2/'
API_URL = urljoin(BASE_URL, API_PATH)

# Path of the OCR job interface, relative to the API root.
JOB_PATH = "ocr/jobs"

# Endpoint for creating and querying jobs,
# e.g. 'https://paddleocr.aistudio-app.com/api/v2/ocr/jobs'
JOB_URL = urljoin(API_URL, JOB_PATH)
@@ -0,0 +1,22 @@
1
+ """
2
+ errors
3
+ """
4
+
5
class PaddleOCRError(Exception):
    """Root of the package's exception hierarchy (errors related to the Paddle OCR API)."""
7
+
8
+
9
class AistudioClientError(PaddleOCRError):
    """Raised for problems related to ``AistudioClient``."""
11
+
12
+
13
class JobError(PaddleOCRError):
    """Raised for problems related to a ``Job``."""
15
+
16
+
17
class JobCreationError(AistudioClientError, JobError):
    """Raised when creating a job fails; it is both a client error and a job error."""
19
+
20
+
21
class JobStatusQueryError(JobError):
    """Raised when querying a job's status fails."""
@@ -0,0 +1,16 @@
1
+ """
2
+ API wrappers
3
+ """
4
+
5
+ from .aistudio_client import AistudioClient
6
+ from .job import Job, State
7
+ from .model import Model
8
+ from .optional_payload import (
9
+ AuxiliaryLayoutElement, LayoutShapeMode, PromptLabel,
10
+ OptionalPayload,
11
+ )
12
+ from .result import (
13
+ PrunedResult, Markdown, LayoutParsingResult,
14
+ PageSizeInfo, DataInfo,
15
+ Result,
16
+ )
@@ -0,0 +1,175 @@
1
+ """
2
+ class::AistudioClient
3
+ """
4
+
5
+ import json
6
+ from typing import Optional
7
+ from urllib.parse import urljoin
8
+
9
+ import aiofiles
10
+ import httpx
11
+ from typing_extensions import Self
12
+
13
+ from ..exceptions import AistudioClientError, JobCreationError
14
+ from ..constants import (
15
+ BASE_URL,
16
+ API_VERSION,
17
+ JOB_PATH,
18
+ )
19
+ from ..config import AISTUDIO_ACCESS_TOKEN
20
+ from .model import Model
21
+ from .optional_payload import OptionalPayload
22
+ from .job import Job
23
+
24
+
25
+
26
class AistudioClient:
    """Client that requests APIs of Paddle OCR."""

    def __init__(
        self,
        *,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        api_version: Optional[str] = None,
        http_client: Optional[httpx.AsyncClient] = None,
        **kwargs
    ):
        """
        Args:
            api_key (str): to obtain: https://aistudio.baidu.com/account/accessToken
                It can be passed in through the environment variable `AISTUDIO_ACCESS_TOKEN`.
            base_url (str): The base URL of the Paddle OCR service,
                defaults to `"https://paddleocr.aistudio-app.com/"`.
            api_version (str): The version of the API, defaults to `"v2"`.
            http_client (httpx.AsyncClient): An HTTP client similar to `httpx.AsyncClient`,
                used for sending requests. When omitted, the client creates (and owns) one.
            kwargs: The initialization parameters passed to `httpx.AsyncClient`
                when no `http_client` is supplied.

        Raises:
            AistudioClientError: If the api_key parameter is not received.
        """
        # Access token: explicit argument first, then the environment fallback.
        if api_key is None:
            api_key = AISTUDIO_ACCESS_TOKEN
        if api_key is None:
            raise AistudioClientError(
                "The api_key client option must be set either by passing api_key to the client "
                "or by setting the AISTUDIO_ACCESS_TOKEN environment variable"
            )
        self.api_key = api_key

        # URL parts fall back to the package-level constants.
        self.base_url = BASE_URL if base_url is None else base_url
        self.api_version = API_VERSION if api_version is None else api_version

        # HTTP client: only a client created here is entered/closed by this instance.
        self._http_client_is_local = http_client is None
        if self._http_client_is_local:
            http_client = httpx.AsyncClient(**kwargs)
        self._client = http_client
        self._kwargs = kwargs

    async def __aenter__(self) -> Self:
        """Enter the owned HTTP client (if any) and return this instance."""
        if self._http_client_is_local:
            await self._client.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the owned HTTP client; an externally supplied client is left untouched."""
        if self._http_client_is_local:
            await self._client.__aexit__(exc_type, exc_val, exc_tb)

    async def aclose(self) -> None:
        """Close the client within the instance (only if this instance created it)."""
        if self._http_client_is_local:
            await self._client.aclose()

    @property
    def api_url(self) -> str:
        """
        The URL of API interface.

        Returns like:
            'https://paddleocr.aistudio-app.com/api/v2/'
        """
        api_path = f"/api/{self.api_version}/"
        return urljoin(self.base_url, api_path)

    @property
    def job_url(self) -> str:
        """
        URL interface for creating and querying tasks

        Returns like:
            'https://paddleocr.aistudio-app.com/api/v2/ocr/jobs'
        """
        return urljoin(self.api_url, JOB_PATH)

    async def create_job(
        self,
        model: Model = Model.DEFAULT,
        file_bytes: Optional[bytes] = None,
        *,
        file_path: Optional[str] = None,
        file_url: Optional[str] = None,
        optional_payload: Optional[OptionalPayload] = None,
        **kwargs
    ) -> Job:
        """
        Create an OCR job for a document.

        Suitable for PaddleOCR-VL series / PP-StructureV3 model.

        The document source is chosen in this order: `file_bytes`, then the
        content read from `file_path`, then `file_url`. Bytes are uploaded as a
        multipart form; a URL is sent in a JSON body.

        Args:
            model (Model): Model for processing documents.
            file_bytes (bytes): Raw content of the document to upload.
            file_path (str): Path of a local file, read when `file_bytes` is not given.
            file_url (str): URL of the document, used when neither bytes nor path is given.
            optional_payload (OptionalPayload): Extra options sent as `optionalPayload`;
                an empty mapping is sent when omitted.
            kwargs: Extra keyword arguments passed to the HTTP client's `request`.
                `method` and `url` are always overwritten; `headers`, `json`, `data`
                and `files` are copied and extended, never mutated in place.

        Returns:
            Job: A job bound to this client, identified by the `jobId` from the response.

        Raises:
            TypeError: If none of `file_bytes`, `file_path` and `file_url` is provided.
            JobCreationError: If the response status is not 200, or the response
                body does not contain `data.jobId`.
        """
        if (file_path is None) and (file_bytes is None) and (file_url is None):
            raise TypeError("At least one of `file_bytes`, `file_path` and `file_url` must be provided.")

        kwargs["method"] = "POST"
        kwargs["url"] = self.job_url

        # Work on a copy so the caller's headers object is not modified.
        headers = dict(kwargs.get("headers") or {})
        headers["Authorization"] = f"bearer {self.api_key}"
        kwargs["headers"] = headers

        if optional_payload is None:
            optional_payload = {}

        if (file_bytes is None) and (file_path is not None):
            async with aiofiles.open(file_path, mode="rb") as file:
                file_bytes = await file.read()

        if file_bytes is None:
            # Only file_url is available: send a JSON body.
            json_body = dict(kwargs.get("json") or {})
            json_body.update({
                "fileUrl": file_url,
                "model": model,
                "optionalPayload": optional_payload,
            })
            kwargs["json"] = json_body
        else:
            # file_bytes (given directly or read from file_path): multipart upload.
            form_data = dict(kwargs.get("data") or {})
            form_data.update({
                "model": model,
                "optionalPayload": json.dumps(optional_payload),
            })
            kwargs["data"] = form_data

            uploads = dict(kwargs.get("files") or {})
            uploads["file"] = file_bytes
            kwargs["files"] = uploads

        # Send request
        response = await self._client.request(**kwargs)
        if response.status_code != 200:
            raise JobCreationError(response.text)

        # A 200 response without the expected structure is still a creation failure.
        try:
            job_id = response.json()["data"]["jobId"]
        except (ValueError, KeyError, TypeError) as err:
            raise JobCreationError(
                f"Unexpected response when creating job: {response.text}"
            ) from err
        return Job(job_id=job_id, aistudio_client=self)
@@ -0,0 +1,337 @@
1
+ """
2
+ enum::State
3
+ class::Job
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import asyncio
9
+ from datetime import datetime
10
+ from functools import reduce
11
+ import time
12
+ import json
13
+ from numbers import Number
14
+ from typing import Dict, List, Optional, Self, TYPE_CHECKING
15
+ from urllib.parse import urljoin
16
+
17
+ import httpx
18
+
19
+ from ..exceptions import JobStatusQueryError
20
+ from .result import Result
21
+
22
+ try:
23
+ from enum import StrEnum
24
+ except ImportError:
25
+ from ..utils.enum import StrEnum
26
+
27
+ if TYPE_CHECKING:
28
+ from .aistudio_client import AistudioClient
29
+ from .result import Markdown
30
+
31
+
32
+
33
+ class State(StrEnum):
34
+ """The processing state of the job."""
35
+ PENDING = "pending"
36
+ RUNNING = "running"
37
+ DONE = "done"
38
+ FAILED = "failed"
39
+ UNKNOWN = "unknown"
40
+
41
+
42
+
43
class Job:
    """Track the progress of task execution and fetch its result.

    Status queries are cached for `status_update_interval` seconds; the
    parsing result is cached once it has been downloaded and parsed.
    """

    def __init__(
        self,
        job_id: str,
        aistudio_client: AistudioClient,
        *,
        http_client: Optional[httpx.AsyncClient] = None,
        status_update_interval: Number = 2,
        **kwargs
    ):
        """
        Args:
            job_id (str): The identifier of the job.
            aistudio_client (AistudioClient): The AistudioClient object creating this job.
            http_client (httpx.AsyncClient): An HTTP client similar to `httpx.AsyncClient`,
                used for sending requests. When omitted, the job creates (and owns) one.
            status_update_interval (Number): The minimum time interval (in seconds)
                between two status queries. Negative values are clamped to 0.
            kwargs: The initialization parameters passed to `httpx.AsyncClient`
                when no `http_client` is supplied.
        """
        self.id = job_id
        self._aistudio_client = aistudio_client

        # HTTP client: only a client created here is entered/closed by this instance.
        self._http_client_is_local = http_client is None
        if self._http_client_is_local:
            http_client = httpx.AsyncClient(**kwargs)
        self._client = http_client

        # Status query management
        self.status_update_interval = max(status_update_interval, 0)
        self._last_update_status_time = 0
        self._status_query_task = None
        self._status_query_lock = asyncio.Lock()  # serializes concurrent status refreshes

        # Status cache
        self._status: Dict[str, str | Dict[str, str | int]] = {}

        # Result cache
        self._result: Optional[Result] = None

    def __str__(self) -> str:
        return f"{type(self).__name__}<{self.id}>"

    async def __aenter__(self) -> Self:
        """Enter the owned HTTP client (if any) and return this instance."""
        if self._http_client_is_local:
            await self._client.__aenter__()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Exit the owned HTTP client; an externally supplied client is left untouched."""
        if self._http_client_is_local:
            await self._client.__aexit__(exc_type, exc_val, exc_tb)

    async def aclose(self) -> None:
        """Close the client within the instance (only if this instance created it)."""
        if self._http_client_is_local:
            await self._client.aclose()

    @property
    def api_key(self) -> str:
        """Aistudio Access Token"""
        return self._aistudio_client.api_key

    @property
    def status_url(self) -> str:
        """The URL used to query the status of this job."""
        return urljoin(f"{self._aistudio_client.job_url}/", self.id)

    async def query_status(self, **kwargs) -> Dict[str, str | Dict[str, str | int]]:
        """
        Query the processing status of the task.

        Args:
            kwargs: The keyword arguments passed to `self._client.request`.
                `method` and `url` are always overwritten; `headers` is copied
                and extended, never mutated in place.

        Returns like:

            - If processing:

                {
                    'extractProgress': {
                        'extractedPages': 3,
                        'startTime': '2026-05-30 14:27:39',
                        'totalPages': 8
                    },
                    'jobId': '540***8',
                    'state': 'running'
                }

            - If done:

                {
                    'extractProgress': {
                        'endTime': '2026-05-30 14:30:29',
                        'extractedPages': 8,
                        'startTime': '2026-05-30 14:30:18',
                        'totalPages': 8
                    },
                    'jobId': '540***8',
                    'resultUrl': {
                        'jsonUrl': 'https://bj.bcebos.com/v1/paddleocr-store/job/b5e...6/json/788...f.json?authorization=bce-auth-v1%2F...'
                    },
                    'state': 'done'
                }

            - If failed:

                {
                    'errorMsg': '系统错误-聚合',
                    'jobId': '540***8',
                    'state': 'failed'
                }

        Raises:
            JobStatusQueryError: If the response is not 200 OK or JSON parsing fails
                (including a body that is not a JSON object).
        """
        kwargs["method"] = "GET"
        kwargs["url"] = self.status_url

        # Work on a copy so the caller's headers object is not modified.
        headers = dict(kwargs.get("headers") or {})
        headers["Authorization"] = f"bearer {self.api_key}"
        kwargs["headers"] = headers

        response = await self._client.request(**kwargs)
        try:
            resp_json = response.json()
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            resp_json = None

        # Anything other than a JSON object cannot carry 'msg' or 'data'.
        json_parse_success = isinstance(resp_json, dict)
        if not json_parse_success:
            resp_json = {}

        if (response.status_code != 200) or (not json_parse_success):
            raise JobStatusQueryError(resp_json.get("msg", response.text))

        # A missing or null 'data' is reported as an empty status.
        return resp_json.get("data") or {}

    async def query_status_safe(self, **kwargs) -> Dict[str, str | Dict[str, str | int]]:
        """
        Same as `query_status`.
        But when encountering a `JobStatusQueryError`, it will not throw,
        but return a copy of the cached status.
        """
        try:
            return await self.query_status(**kwargs)
        except JobStatusQueryError:
            return self._status.copy()

    @property
    async def status(self) -> Dict[str, str | Dict[str, str | int]]:
        """
        Get the real-time status of the task (with self.status_update_interval seconds of cache).

        Returns the internal cache dict itself, updated in place by each refresh.
        Exceptions other than `JobStatusQueryError` raised by the query propagate
        to the caller; the next access then starts a fresh query.
        """
        async with self._status_query_lock:
            cache_expired = (
                (time.time() - self._last_update_status_time) > self.status_update_interval
            )
            if self._status_query_task is None and cache_expired:
                self._status_query_task = asyncio.create_task(self.query_status_safe())

            if self._status_query_task is not None:
                try:
                    query_result = await self._status_query_task
                finally:
                    # Always drop the finished task: a failed or cancelled task left
                    # here would be re-awaited, re-raising the same error forever.
                    self._status_query_task = None
                self._last_update_status_time = time.time()
                self._status.update(query_result)

        return self._status

    @property
    async def state(self) -> State:
        """The processing state; `State.UNKNOWN` when missing or not a known value."""
        raw_state = (await self.status).get("state", State.UNKNOWN)
        try:
            return State(raw_state)
        except ValueError:
            # The service reported a state this enum does not define.
            return State.UNKNOWN

    @property
    async def result_json_url(self) -> Optional[str]:
        """JSON URL for processing results, or None when the status has none."""
        result_url = (await self.status).get("resultUrl") or {}
        return result_url.get("jsonUrl", None)

    @property
    async def extract_progress(self) -> Dict[str, str | int]:
        """
        The progress of page extraction for this job.

        Returns like:

            - If extracting:

                {
                    'extractedPages': 3,
                    'startTime': '2026-05-30 14:27:39',
                    'totalPages': 8
                }

            - If extracted:

                {
                    'endTime': '2026-05-30 14:30:29',
                    'extractedPages': 8,
                    'startTime': '2026-05-30 14:30:18',
                    'totalPages': 8
                }

        An empty dict is returned when the status carries no progress.
        """
        return (await self.status).get("extractProgress") or {}

    @property
    async def extracted_pages(self) -> Optional[int]:
        """The number of extracted pages."""
        return (await self.extract_progress).get("extractedPages")

    @property
    async def total_pages(self) -> Optional[int]:
        """The number of total pages."""
        return (await self.extract_progress).get("totalPages")

    @property
    async def start_time(self) -> Optional[datetime]:
        """The start processing time of the job, or None when not reported."""
        time_str = (await self.extract_progress).get("startTime")
        if time_str is None:
            return None
        return datetime.fromisoformat(time_str)

    @property
    async def end_time(self) -> Optional[datetime]:
        """The completion time of the job, or None when not reported."""
        time_str = (await self.extract_progress).get("endTime")
        if time_str is None:
            return None
        return datetime.fromisoformat(time_str)

    @property
    async def error_message(self) -> Optional[str]:
        """errorMsg returned when processing failure"""
        return (await self.status).get("errorMsg")

    @property
    async def result(self) -> Optional[Result]:
        """
        Return the complete parsing result of the job.

        The result document is read line by line; each non-blank line is parsed
        as JSON and its `"result"` entry is turned into a `Result`. The per-line
        results are combined with `Result.extend` and cached.

        Returns:
            The combined result, or None when no result URL is available yet
            or the result document contains no lines.

        Raises:
            Whatever `raise_for_status()` of the HTTP response raises on a
            failed download.
        """
        # try to fetch cache
        if self._result is not None:
            return self._result

        # get result url
        result_json_url = await self.result_json_url
        if result_json_url is None:
            return None

        # fetch result
        jsonl_response = await self._client.get(result_json_url)
        jsonl_response.raise_for_status()

        # parse result: one JSON document per non-blank line
        results: List[Result] = []
        for line in jsonl_response.text.strip().split('\n'):
            line = line.strip()
            if not line:
                continue
            results.append(Result.from_json(json.loads(line)["result"]))

        # reduce() without an initial value fails on an empty sequence
        if not results:
            return None

        # cache result
        self._result = reduce(Result.extend, results)
        return self._result

    @property
    async def markdown(self) -> Optional[Markdown]:
        """
        Return the Markdown formatted parsing result, or None when no result is available.
        """
        result = await self.result
        if result is None:
            return None
        return result.markdown
@@ -0,0 +1,18 @@
1
+ """
2
+ enum::Model
3
+ """
4
+
5
+ try:
6
+ from enum import StrEnum
7
+ except ImportError:
8
+ from ..utils.enum import StrEnum
9
+
10
+
11
+ class Model(StrEnum):
12
+ """Available models."""
13
+ PP_OCR_V5 = "PP-OCRv5"
14
+ PADDLE_OCR = "PaddleOCR"
15
+ PADDLE_OCR_VL = "PaddleOCR-VL"
16
+ PADDLE_OCR_VL_1_5 = "PaddleOCR-VL-1.5" # expected to be abandoned on 2026/06/17
17
+ PADDLE_OCR_VL_1_6 = "PaddleOCR-VL-1.6"
18
+ DEFAULT = PADDLE_OCR_VL_1_6