paddleocr-api 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- paddleocr_api/__init__.py +7 -0
- paddleocr_api/config.py +13 -0
- paddleocr_api/constants.py +26 -0
- paddleocr_api/exceptions.py +22 -0
- paddleocr_api/models/__init__.py +16 -0
- paddleocr_api/models/aistudio_client.py +175 -0
- paddleocr_api/models/job.py +337 -0
- paddleocr_api/models/model.py +18 -0
- paddleocr_api/models/optional_payload.py +121 -0
- paddleocr_api/models/result.py +210 -0
- paddleocr_api/utils/__init__.py +0 -0
- paddleocr_api/utils/enum.py +8 -0
- paddleocr_api/utils/regex.py +9 -0
- paddleocr_api-0.0.1.dist-info/METADATA +399 -0
- paddleocr_api-0.0.1.dist-info/RECORD +18 -0
- paddleocr_api-0.0.1.dist-info/WHEEL +5 -0
- paddleocr_api-0.0.1.dist-info/licenses/LICENSE +201 -0
- paddleocr_api-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
enum::AuxiliaryLayoutElement
|
|
3
|
+
enum::LayoutShapeMode
|
|
4
|
+
enum::PromptLabel
|
|
5
|
+
json::OptionalPayload
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from numbers import Number
|
|
9
|
+
from typing import List, TypedDict
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
from enum import StrEnum
|
|
13
|
+
except ImportError:
|
|
14
|
+
from ..utils.enum import StrEnum
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AuxiliaryLayoutElement(StrEnum):
|
|
18
|
+
"""辅助版面元素"""
|
|
19
|
+
HEADER = "header" # 页眉
|
|
20
|
+
HEADER_IMAGE = "header_image" # 页眉图片
|
|
21
|
+
FOOTER = "footer" # 页脚
|
|
22
|
+
FOOTER_IMAGE = "footer_image" # 页脚图片
|
|
23
|
+
NUMBER = "number" # 页码
|
|
24
|
+
FOOTNOTE = "footnote" # 脚注
|
|
25
|
+
ASIDE_TEXT = "aside_text" # 旁注文本
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class LayoutShapeMode(StrEnum):
|
|
29
|
+
"""版面检测结果的几何形状"""
|
|
30
|
+
AUTO = "auto" # 自动
|
|
31
|
+
RECT = "rect" # 矩形
|
|
32
|
+
QUAD = "quad" # 四边形
|
|
33
|
+
POLY = "poly" # 多边形
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class PromptLabel(StrEnum):
|
|
37
|
+
"""prompt类型设置"""
|
|
38
|
+
OCR = "ocr" # 文本
|
|
39
|
+
FORMULA = "formula" # 公式
|
|
40
|
+
TABLE = "table" # 表格
|
|
41
|
+
CHART = "chart" # 图表
|
|
42
|
+
SEAL = "seal" # 印章
|
|
43
|
+
SPOTTING = "spotting" # 文本检测与识别
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class OptionalPayload(TypedDict, total=False):
    """Optional parameters when creating jobs.

    The type is declared with ``total=False`` so that any subset of the keys
    (including none of them) is a valid payload for type checkers. The
    defaults quoted in the comments below are carried over from the original
    field comments.
    """

    # Auxiliary content handling; defaults to every AuxiliaryLayoutElement member.
    # The model detects the listed auxiliary content and filters it out.
    markdownIgnoreLabels: List[AuxiliaryLayoutElement]

    # Image orientation correction, def. to `False`
    # When enabled, images rotated by 0, 90, 180 or 270 degrees are detected
    # and corrected automatically.
    useDocOrientationClassify: bool

    # Image unwarping, def. to `False`
    # When enabled, distorted images (e.g. wrinkled or skewed) are corrected
    # automatically.
    useDocUnwarping: bool

    # Layout analysis, def. to `True`
    # When enabled, the document is split into layout regions, ordered, and
    # recognized per region category.
    # If the document contains only one kind of element (text, table, formula
    # or chart), this can be turned off to run the matching recognition directly.
    useLayoutDetection: bool

    # Chart recognition, def. to `False`
    # When enabled, charts in the document (bar charts, pie charts, ...) are
    # parsed and converted to tables so the data is easier to view and edit.
    useChartRecognition: bool

    # Seal recognition, def. to `True`
    # When enabled, seal (stamp) content in the document is recognized and
    # extracted as editable text.
    useSealRecognition: bool

    # OCR for image blocks, def. to `False`
    # When enabled, text inside image layout elements is recognized.
    useOcrForImageBlock: bool

    # Cross-page table merging, def. to `True`
    # When enabled, tables that span pages are detected and merged into one.
    mergeTables: bool

    # Paragraph heading level recognition, def. to `True`
    # When enabled, the level of paragraph headings is recognized.
    relevelTitles: bool

    # Geometric shape of layout detection results, def. to `"auto"`
    # Determines how the boundary of detected regions (text blocks, images,
    # tables, ...) is computed and presented.
    # With the default "auto", the system picks the most suitable shape based
    # on the complexity and confidence of the detected target.
    layoutShapeMode: LayoutShapeMode

    # Prompt type, def. to `"ocr"`
    # Takes effect only when the layout analysis module is NOT used; sets the
    # region type of the input document. Recommended when the input document
    # contains a single region.
    promptLabel: PromptLabel

    # Repetition penalty, def. to `1.0`, recommended in [1.0, 1.2]
    # Raise it slightly when the result contains repeated text or table content.
    repetitionPenalty: Number

    # Recognition stability, def. to `0.0`, recommended in [0.0, 1.0]
    # Lower it when results are unstable or clearly hallucinated; raise it
    # slightly when content is missed or heavily repeated.
    temperature: Number

    # Trusted range of the result, def. to `1.0`
    # Lower it when results are divergent or unreliable, to make the model
    # more conservative.
    topP: Number

    # Minimum total pixel count of the image
    # Raise it when the input image is too small for the text to be legible;
    # usually no adjustment is needed.
    minPixels: int

    # Maximum total pixel count of the image
    # Lower it when the input image is very large, processing is slow, or GPU
    # memory is under pressure.
    maxPixels: int

    # NMS post-processing, def. to `True`
    # Takes effect only when the layout analysis module is used; when enabled,
    # duplicate or highly overlapping region boxes are removed automatically.
    layoutNms: bool

    # Restructure multi-page results
    # Restructures the parsing result of a multi-page PDF so that cross-page
    # table merging and heading level recognition can be applied; initialised
    # to False by default.
    restructurePages: bool
|
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""
|
|
2
|
+
data::Markdown
|
|
3
|
+
dataclass::Result
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations

from dataclasses import dataclass, replace
from typing import Any, Dict, List, Optional

from typing_extensions import Self

from ..utils.regex import MARKDOWN_TITLE_PATTERN
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@dataclass
class PrunedResult:
    """The recognition result of layout elements on a page."""

    page_count: Optional[int]
    width: int  # page width
    height: int  # page height
    model_settings: Dict[str, bool | List[str]]  # Similar configuration in `OptionalPayload`.
    parsing_res_list: List[Dict[str, Any]]  # Layout recognition results
    doc_preprocessor_res: Dict[str, int | Dict[str, bool]]
    layout_det_res: Dict[str, List[Dict[str, Any]]]

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> PrunedResult:
        """Construct an instance from a JSON object.

        Args:
            data: Mapping whose keys are exactly the field names of this class.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            TypeError: If ``data`` lacks a field or contains an unknown key,
                because the mapping is forwarded verbatim to the constructor.
        """
        return cls(**data)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
class Markdown:
    """The recognition result of one page's markdown format."""

    text: str
    images: Dict[str, str]  # image_path: image_url

    @classmethod
    def from_json(cls, data: Dict[str, str | Dict[str, str]]) -> Markdown:
        """Construct an instance from a JSON object.

        Args:
            data: Mapping with exactly the keys ``"text"`` and ``"images"``.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            TypeError: If ``data`` lacks a field or contains an unknown key,
                because the mapping is forwarded verbatim to the constructor.
        """
        return cls(**data)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class LayoutParsingResult:
    """One page layout recognition result."""

    pruned_result: PrunedResult
    markdown: Markdown
    output_images: Dict[str, str]  # output images when processing this page
    input_image: str  # The URL of the original image on this page

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> LayoutParsingResult:
        """Construct an instance from a JSON object.

        Args:
            data: Mapping with the camelCase keys ``"prunedResult"``,
                ``"markdown"``, ``"outputImages"`` and ``"inputImage"``.

        Returns:
            A new instance of ``cls`` whose nested ``pruned_result`` and
            ``markdown`` are built by their own ``from_json`` constructors.

        Raises:
            KeyError: If one of the four keys is missing.
        """
        return cls(
            pruned_result=PrunedResult.from_json(data["prunedResult"]),
            markdown=Markdown.from_json(data["markdown"]),
            output_images=data["outputImages"],
            input_image=data["inputImage"],
        )
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass
class PageSizeInfo:
    """One page size information."""

    width: int
    height: int

    @classmethod
    def from_json(cls, data: Dict[str, int]) -> PageSizeInfo:
        """Construct an instance from a JSON object.

        Args:
            data: Mapping with exactly the keys ``"width"`` and ``"height"``.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            TypeError: If ``data`` lacks a field or contains an unknown key,
                because the mapping is forwarded verbatim to the constructor.
        """
        return cls(**data)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@dataclass
class DataInfo:
    """Processing result data."""

    num_pages: int  # Number of pages
    pages: List[PageSizeInfo]  # Size of each image
    type: str  # Input file format, like `"pdf"`

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> DataInfo:
        """Construct an instance from a JSON object.

        Args:
            data: Mapping with the keys ``"numPages"``, ``"pages"`` (a list of
                page-size objects) and ``"type"``.

        Returns:
            A new instance of ``cls`` whose ``pages`` entries are built with
            ``PageSizeInfo.from_json``.

        Raises:
            KeyError: If one of the three keys is missing.
        """
        page_sizes = [PageSizeInfo.from_json(page) for page in data["pages"]]
        return cls(
            num_pages=data["numPages"],
            pages=page_sizes,
            type=data["type"],
        )
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
_SENTENCE_END_SYMBOLS = ".!?。!?>"
|
|
97
|
+
|
|
98
|
+
@dataclass
|
|
99
|
+
class Result:
|
|
100
|
+
"""Paddle OCR Processing Results for Documents"""
|
|
101
|
+
layout_parsing_results: List[LayoutParsingResult]
|
|
102
|
+
data_info: DataInfo
|
|
103
|
+
preprocessed_images: List[str] # Preprocessed Image URL List
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def __add__(self, other: Result) -> Result:
|
|
107
|
+
"""
|
|
108
|
+
Add the recognition results for the next section.
|
|
109
|
+
"""
|
|
110
|
+
layout_parsing_results = self.layout_parsing_results + other.layout_parsing_results
|
|
111
|
+
preprocessed_images = self.preprocessed_images + other.preprocessed_images
|
|
112
|
+
|
|
113
|
+
data_info = self.data_info
|
|
114
|
+
other_data_info = other.data_info
|
|
115
|
+
num_pages = data_info.num_pages + other_data_info.num_pages
|
|
116
|
+
pages = data_info.pages + other_data_info.pages
|
|
117
|
+
|
|
118
|
+
return Result(
|
|
119
|
+
layout_parsing_results=layout_parsing_results,
|
|
120
|
+
preprocessed_images=preprocessed_images,
|
|
121
|
+
data_info=DataInfo(
|
|
122
|
+
num_pages=num_pages,
|
|
123
|
+
pages=pages,
|
|
124
|
+
type=data_info["type"],
|
|
125
|
+
)
|
|
126
|
+
)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
@classmethod
|
|
130
|
+
def from_json(cls, data: Dict[str, Any]) -> Result:
|
|
131
|
+
"""Construct from a JSON object."""
|
|
132
|
+
layout_parsing_results = list(map(LayoutParsingResult.from_json, data["layoutParsingResults"]))
|
|
133
|
+
data_info = DataInfo.from_json(data["dataInfo"])
|
|
134
|
+
return Result(
|
|
135
|
+
layout_parsing_results=layout_parsing_results,
|
|
136
|
+
data_info=data_info,
|
|
137
|
+
preprocessed_images=data["preprocessedImages"],
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def extend(self, other: Result) -> Self:
|
|
142
|
+
"""
|
|
143
|
+
Add the recognition results for the next section.
|
|
144
|
+
"""
|
|
145
|
+
self.layout_parsing_results.extend(other.layout_parsing_results)
|
|
146
|
+
self.preprocessed_images.extend(other.preprocessed_images)
|
|
147
|
+
|
|
148
|
+
data_info = self.data_info
|
|
149
|
+
other_data_info = other.data_info
|
|
150
|
+
data_info.num_pages += other_data_info.num_pages
|
|
151
|
+
data_info.pages.extend(other_data_info.pages)
|
|
152
|
+
|
|
153
|
+
return self
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@property
|
|
157
|
+
def markdown_text(self) -> str:
|
|
158
|
+
"""
|
|
159
|
+
Return the parsing result of Markdown text format.
|
|
160
|
+
|
|
161
|
+
This will intelligently connect to the previous text.
|
|
162
|
+
"""
|
|
163
|
+
# extract markdown
|
|
164
|
+
markdown_texts = []
|
|
165
|
+
for layout_parsing_result in self.layout_parsing_results:
|
|
166
|
+
markdown_text = layout_parsing_result.markdown.text
|
|
167
|
+
if len(markdown_texts) == 0:
|
|
168
|
+
markdown_texts.append(markdown_text)
|
|
169
|
+
continue
|
|
170
|
+
|
|
171
|
+
# Take out the last line of the previous text
|
|
172
|
+
last_row = markdown_texts[-1].rsplit("\n", 1)[-1].strip()
|
|
173
|
+
if last_row and (
|
|
174
|
+
MARKDOWN_TITLE_PATTERN.fullmatch(last_row) or # The previous line is the title
|
|
175
|
+
last_row.strip()[-1] in _SENTENCE_END_SYMBOLS # The previous line is a complete sentence
|
|
176
|
+
):
|
|
177
|
+
markdown_texts.append(markdown_text)
|
|
178
|
+
else:
|
|
179
|
+
markdown_texts[-1] += ' ' + markdown_text
|
|
180
|
+
|
|
181
|
+
return "\n\n".join(markdown_texts)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@property
|
|
185
|
+
def markdown_images(self) -> Dict[str, str]:
|
|
186
|
+
"""
|
|
187
|
+
Return the image in Markdown format in the parsed result.
|
|
188
|
+
|
|
189
|
+
Returns:
|
|
190
|
+
dict[str, str]:
|
|
191
|
+
- the key is image_relative_path.
|
|
192
|
+
- the value is image_url
|
|
193
|
+
"""
|
|
194
|
+
# extract markdown
|
|
195
|
+
markdown_images = {}
|
|
196
|
+
for layout_parsing_result in self.layout_parsing_results:
|
|
197
|
+
markdown_images |= layout_parsing_result.markdown.images
|
|
198
|
+
|
|
199
|
+
return markdown_images
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@property
|
|
203
|
+
def markdown(self) -> Markdown:
|
|
204
|
+
"""
|
|
205
|
+
Return the Markdown formatted parsing result.
|
|
206
|
+
"""
|
|
207
|
+
return Markdown(
|
|
208
|
+
text=self.markdown_text,
|
|
209
|
+
images=self.markdown_images
|
|
210
|
+
)
|
|
File without changes
|