paddleocr-api 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,121 @@
1
+ """
2
+ enum::AuxiliaryLayoutElement
3
+ enum::LayoutShapeMode
4
+ enum::PromptLabel
5
+ json::OptionalPayload
6
+ """
7
+
8
+ from numbers import Number
9
+ from typing import List, TypedDict
10
+
11
+ try:
12
+ from enum import StrEnum
13
+ except ImportError:
14
+ from ..utils.enum import StrEnum
15
+
16
+
17
+ class AuxiliaryLayoutElement(StrEnum):
18
+ """辅助版面元素"""
19
+ HEADER = "header" # 页眉
20
+ HEADER_IMAGE = "header_image" # 页眉图片
21
+ FOOTER = "footer" # 页脚
22
+ FOOTER_IMAGE = "footer_image" # 页脚图片
23
+ NUMBER = "number" # 页码
24
+ FOOTNOTE = "footnote" # 脚注
25
+ ASIDE_TEXT = "aside_text" # 旁注文本
26
+
27
+
28
+ class LayoutShapeMode(StrEnum):
29
+ """版面检测结果的几何形状"""
30
+ AUTO = "auto" # 自动
31
+ RECT = "rect" # 矩形
32
+ QUAD = "quad" # 四边形
33
+ POLY = "poly" # 多边形
34
+
35
+
36
+ class PromptLabel(StrEnum):
37
+ """prompt类型设置"""
38
+ OCR = "ocr" # 文本
39
+ FORMULA = "formula" # 公式
40
+ TABLE = "table" # 表格
41
+ CHART = "chart" # 图表
42
+ SEAL = "seal" # 印章
43
+ SPOTTING = "spotting" # 文本检测与识别
44
+
45
+
46
class OptionalPayload(TypedDict, total=False):
    """Optional parameters when creating jobs.

    Every key may be omitted (``total=False``); the defaults quoted in the
    comments below are the ones documented for each parameter.

    Numeric tuning knobs are annotated as ``float`` rather than
    ``numbers.Number``: static type checkers do not treat ``int``/``float``
    literals as ``Number``, whereas ``float`` also accepts ``int``.
    """

    # Auxiliary content handling; defaults to every `AuxiliaryLayoutElement`
    # member. The model automatically detects and filters out the listed
    # auxiliary content.
    markdownIgnoreLabels: List[AuxiliaryLayoutElement]

    # Image orientation correction, defaults to `False`.
    # When enabled, images rotated by 0, 90, 180 or 270 degrees are detected
    # and corrected automatically.
    useDocOrientationClassify: bool

    # Image unwarping, defaults to `False`.
    # When enabled, distorted images (e.g. wrinkled or skewed) are corrected
    # automatically.
    useDocUnwarping: bool

    # Layout analysis, defaults to `True`.
    # When enabled, the document is split into layout regions, ordered, and
    # each region is recognized according to its category.
    # If the document contains only one kind of element among text, tables,
    # formulas or charts, this can be turned off to run the corresponding
    # recognition directly.
    useLayoutDetection: bool

    # Chart recognition, defaults to `False`.
    # When enabled, charts (bar charts, pie charts, ...) are parsed and
    # converted into tables so the data is easy to view and edit.
    useChartRecognition: bool

    # Seal recognition, defaults to `True`.
    # When enabled, seal content is recognized and extracted as editable text.
    useSealRecognition: bool

    # OCR for image blocks, defaults to `False`.
    # When enabled, text inside image layout elements is recognized.
    useOcrForImageBlock: bool

    # Cross-page table merging, defaults to `True`.
    # When enabled, tables spanning several pages are detected and merged.
    mergeTables: bool

    # Paragraph title level recognition, defaults to `True`.
    # When enabled, the level of paragraph titles is recognized.
    relevelTitles: bool

    # Geometric shape of layout detection results, defaults to `"auto"`.
    # Determines how the boundaries of detected regions (text blocks, images,
    # tables, ...) are computed and presented.
    # With the default, the system picks the most suitable shape based on the
    # complexity and confidence of the detected target.
    layoutShapeMode: LayoutShapeMode

    # Prompt type setting, defaults to `"ocr"`.
    # Takes effect only when the layout analysis module is NOT used; sets the
    # region type of the input document. Recommended when the input contains
    # a single region.
    promptLabel: PromptLabel

    # Repetition penalty, defaults to `1.0`, recommended in [1.0, 1.2].
    # Raise it slightly when results contain repeated text or table content.
    repetitionPenalty: float

    # Recognition stability, defaults to `0.0`, recommended in [0.0, 1.0].
    # Lower it when results are unstable or show obvious hallucination; raise
    # it slightly when content is missed or heavily repeated.
    temperature: float

    # Result confidence range, defaults to `1.0`.
    # Lower it to make the model more conservative when results diverge or
    # are not trustworthy enough.
    topP: float

    # Minimum total number of image pixels.
    # Raise it when the input image is too small and text is illegible;
    # usually no adjustment is needed.
    minPixels: int

    # Maximum total number of image pixels.
    # Lower it when the input image is very large, processing is slow, or
    # GPU memory pressure is high.
    maxPixels: int

    # NMS post-processing, defaults to `True`.
    # Takes effect only when the layout analysis module is used; when
    # enabled, duplicated or heavily overlapping region boxes are removed.
    layoutNms: bool

    # Restructure multi-page results.
    # Restructures the parsing results of a multi-page PDF so that cross-page
    # table merging and paragraph title level recognition can be applied;
    # initialized to `False` by default.
    restructurePages: bool
@@ -0,0 +1,210 @@
1
+ """
2
+ data::Markdown
3
+ dataclass::Result
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Optional
10
+ from typing_extensions import Self
11
+
12
+ from ..utils.regex import MARKDOWN_TITLE_PATTERN
13
+
14
+
15
@dataclass
class PrunedResult:
    """The recognition result of layout elements on a page."""

    page_count: Optional[int]
    width: int  # page width
    height: int  # page height
    # Similar configuration to `OptionalPayload`.
    model_settings: Dict[str, bool | List[str]]
    parsing_res_list: List[Dict[str, Any]]  # layout recognition results
    doc_preprocessor_res: Dict[str, int | Dict[str, bool]]
    layout_det_res: Dict[str, List[Dict[str, Any]]]

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> PrunedResult:
        """Construct an instance from a decoded JSON object.

        Args:
            data: Mapping whose keys are exactly the field names of this
                dataclass; it is expanded into keyword arguments.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            TypeError: If ``data`` lacks a field or carries an unknown key.
        """
        return cls(**data)
30
+
31
+
32
@dataclass
class Markdown:
    """The recognition result of one page in Markdown format."""

    text: str
    images: Dict[str, str]  # image_path: image_url

    @classmethod
    def from_json(cls, data: Dict[str, str | Dict[str, str]]) -> Markdown:
        """Construct an instance from a decoded JSON object.

        Args:
            data: Mapping with exactly the keys ``text`` and ``images``; it
                is expanded into keyword arguments.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            TypeError: If ``data`` lacks a field or carries an unknown key.
        """
        return cls(**data)
42
+
43
+
44
@dataclass
class LayoutParsingResult:
    """Layout recognition result for one page."""

    pruned_result: PrunedResult
    markdown: Markdown
    output_images: Dict[str, str]  # output images when processing this page
    input_image: str  # URL of the original image on this page

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> LayoutParsingResult:
        """Construct an instance from a decoded JSON object.

        Args:
            data: Mapping with the camelCase keys ``prunedResult``,
                ``markdown``, ``outputImages`` and ``inputImage``. The first
                two are converted with ``PrunedResult.from_json`` and
                ``Markdown.from_json``; the others are stored as given.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            KeyError: If one of the four keys is missing.
        """
        return cls(
            pruned_result=PrunedResult.from_json(data["prunedResult"]),
            markdown=Markdown.from_json(data["markdown"]),
            output_images=data["outputImages"],
            input_image=data["inputImage"],
        )
63
+
64
+
65
@dataclass
class PageSizeInfo:
    """Size information for one page."""

    width: int
    height: int

    @classmethod
    def from_json(cls, data: Dict[str, int]) -> PageSizeInfo:
        """Construct an instance from a decoded JSON object.

        Args:
            data: Mapping with exactly the keys ``width`` and ``height``; it
                is expanded into keyword arguments.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            TypeError: If ``data`` lacks a field or carries an unknown key.
        """
        return cls(**data)
75
+
76
+
77
@dataclass
class DataInfo:
    """Processing result data."""

    num_pages: int  # number of pages
    pages: List[PageSizeInfo]  # size of each image
    type: str  # input file format, like `"pdf"`

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> DataInfo:
        """Construct an instance from a decoded JSON object.

        Args:
            data: Mapping with the keys ``numPages``, ``pages`` and ``type``.
                Each entry of ``pages`` is converted with
                ``PageSizeInfo.from_json``.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            KeyError: If one of the three keys is missing.
        """
        return cls(
            num_pages=data["numPages"],
            pages=[PageSizeInfo.from_json(page) for page in data["pages"]],
            type=data["type"],
        )
93
+
94
+
95
+
96
# Characters that, when they end the last line of a page, mark that line as
# complete. The full-width forms are written as escapes so they cannot be
# folded into their ASCII look-alikes by text normalization:
#   \u3002 ideographic full stop, \uff01 full-width "!", \uff1f full-width "?".
# NOTE(review): ">" presumably covers lines ending with an inline HTML tag;
# confirm against real service output.
_SENTENCE_END_SYMBOLS = ".!?\u3002\uff01\uff1f>"


@dataclass
class Result:
    """PaddleOCR processing results for a document."""

    layout_parsing_results: List[LayoutParsingResult]
    data_info: DataInfo
    preprocessed_images: List[str]  # preprocessed image URL list

    def __add__(self, other: Result) -> Result:
        """Return a new result with the pages of ``other`` appended.

        Neither operand is modified: the lists of the returned object are
        fresh concatenations. The input file type is taken from ``self``.
        """
        merged_info = DataInfo(
            num_pages=self.data_info.num_pages + other.data_info.num_pages,
            pages=self.data_info.pages + other.data_info.pages,
            # DataInfo is a dataclass, so the field is read as an attribute.
            type=self.data_info.type,
        )
        return Result(
            layout_parsing_results=self.layout_parsing_results + other.layout_parsing_results,
            preprocessed_images=self.preprocessed_images + other.preprocessed_images,
            data_info=merged_info,
        )

    @classmethod
    def from_json(cls, data: Dict[str, Any]) -> Result:
        """Construct an instance from a decoded JSON object.

        Args:
            data: Mapping with the keys ``layoutParsingResults``,
                ``dataInfo`` and ``preprocessedImages``.

        Returns:
            A new instance of ``cls`` (so subclasses get their own type).

        Raises:
            KeyError: If one of the three keys is missing.
        """
        return cls(
            layout_parsing_results=[
                LayoutParsingResult.from_json(item)
                for item in data["layoutParsingResults"]
            ],
            data_info=DataInfo.from_json(data["dataInfo"]),
            preprocessed_images=data["preprocessedImages"],
        )

    def extend(self, other: Result) -> Self:
        """Append the pages of ``other`` to this result in place.

        Returns:
            ``self``, to allow chaining.
        """
        self.layout_parsing_results.extend(other.layout_parsing_results)
        self.preprocessed_images.extend(other.preprocessed_images)

        self.data_info.num_pages += other.data_info.num_pages
        self.data_info.pages.extend(other.data_info.pages)

        return self

    @property
    def markdown_text(self) -> str:
        """Return the Markdown text of all pages as one string.

        A page starts a new chunk when the last line of the text gathered so
        far ends with a sentence-end symbol or is a Markdown title.
        Otherwise (including when that last line is empty) the page is
        treated as a continuation and appended after a single space. Chunks
        are joined with a blank line.
        """
        chunks: List[str] = []
        for page in self.layout_parsing_results:
            text = page.markdown.text
            if not chunks:
                chunks.append(text)
                continue

            # Last line of the text gathered so far, without outer whitespace.
            last_row = chunks[-1].rsplit("\n", 1)[-1].strip()
            previous_is_complete = bool(last_row) and (
                # Cheap character test first; the regex only runs if needed.
                last_row[-1] in _SENTENCE_END_SYMBOLS
                or MARKDOWN_TITLE_PATTERN.fullmatch(last_row) is not None
            )
            if previous_is_complete:
                chunks.append(text)
            else:
                chunks[-1] += " " + text

        return "\n\n".join(chunks)

    @property
    def markdown_images(self) -> Dict[str, str]:
        """Return the images referenced by the Markdown of all pages.

        Returns:
            dict[str, str]:
                - the key is image_relative_path.
                - the value is image_url

            When several pages use the same key, the later page wins.
        """
        images: Dict[str, str] = {}
        for page in self.layout_parsing_results:
            images.update(page.markdown.images)
        return images

    @property
    def markdown(self) -> Markdown:
        """Return the combined Markdown text and images of all pages."""
        return Markdown(
            text=self.markdown_text,
            images=self.markdown_images,
        )
File without changes
@@ -0,0 +1,8 @@
1
+ """
2
+ type::StrEnum
3
+ """
4
+
5
+ # 兼容 Python 3.10
6
+ from enum import Enum
7
class StrEnum(str, Enum):
    """Enum where members are also (and must be) strings.

    Fallback for interpreters without ``enum.StrEnum`` (added in Python 3.11).
    """

    # Match the standard library's StrEnum: str() and format() return the
    # plain member value instead of "ClassName.MEMBER", so code behaves the
    # same whichever implementation gets imported.
    __str__ = str.__str__
    __format__ = str.__format__
@@ -0,0 +1,9 @@
1
+ """
2
+ Regular expression patterns.
3
+ """
4
+
5
+ import re
6
+
7
# Matches "http:" or "https:" followed by any run of characters up to a line
# break. The pattern itself is unanchored, so where it must start depends on
# whether callers use match/fullmatch or search.
URL_PATTERN = re.compile(r"https?:.*")

# Matches a Markdown title line: one or more "#", at least one whitespace
# character, then at least one more character.
MARKDOWN_TITLE_PATTERN = re.compile(r"\#+\s+.+")