recpdf 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recpdf/__init__.py +1 -0
- recpdf/models.py +29 -0
- recpdf/parser.py +257 -0
- recpdf/prompts.py +29 -0
- recpdf/utils.py +97 -0
- recpdf-0.1.0.dist-info/METADATA +166 -0
- recpdf-0.1.0.dist-info/RECORD +10 -0
- recpdf-0.1.0.dist-info/WHEEL +4 -0
- recpdf-0.1.0.dist-info/entry_points.txt +4 -0
- recpdf-0.1.0.dist-info/licenses/LICENSE +21 -0
recpdf/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .parser import parse_pdf, refine_markdown
|
recpdf/models.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""
|
|
2
|
+
模型初始化模块
|
|
3
|
+
解析模型:parse_model
|
|
4
|
+
"""
|
|
5
|
+
import os
|
|
6
|
+
from langchain.chat_models import init_chat_model
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _init_openai_chat_model(api_key, base_url, model, temperature, max_tokens):
    """Create a chat model via ``init_chat_model`` using the "openai" provider."""
    return init_chat_model(
        api_key=api_key,
        base_url=base_url,
        model=model,
        model_provider="openai",
        temperature=temperature,
        max_tokens=max_tokens,
    )


def init_vlm(api_key: str, base_url: str, model: str, temperature: float = 0.5, max_tokens: int = 96000):
    """
    Create the vision model used to parse page images.

    @param api_key: API key forwarded to the chat model
    @param base_url: base URL of the API endpoint
    @param model: model name
    @param temperature: sampling temperature (default 0.5)
    @param max_tokens: output token limit forwarded to the model (default 96000)
    @return: the object returned by ``init_chat_model``
    """
    return _init_openai_chat_model(api_key, base_url, model, temperature, max_tokens)


def init_llm(api_key: str, base_url: str, model: str, temperature: float = 0.5, max_tokens: int = 102400):
    """
    Create the text model used to refine generated markdown.

    Identical to ``init_vlm`` except for the larger default ``max_tokens``.

    @param api_key: API key forwarded to the chat model
    @param base_url: base URL of the API endpoint
    @param model: model name
    @param temperature: sampling temperature (default 0.5)
    @param max_tokens: output token limit forwarded to the model (default 102400)
    @return: the object returned by ``init_chat_model``
    """
    return _init_openai_chat_model(api_key, base_url, model, temperature, max_tokens)
|
recpdf/parser.py
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
"""
|
|
2
|
+
RecPDF包:
|
|
3
|
+
一个用于解析PDF文档内容的Python包,可识别PDF中的文本、图片、表格、公式等元素。
|
|
4
|
+
主接口:parse_pdf
|
|
5
|
+
@param pdf_path: PDF文件路径
|
|
6
|
+
@param output_dir: 输出目录
|
|
7
|
+
@param api_key: API密钥
|
|
8
|
+
@param base_url: API基础URL
|
|
9
|
+
@param model: 模型名称
|
|
10
|
+
@param workers: 工作线程数
|
|
11
|
+
@param prompt: 解析提示词
|
|
12
|
+
@param rect_prompt: 矩形解析提示词
|
|
13
|
+
@param sys_prompt: 系统提示词
|
|
14
|
+
@return: 解析后的markdown内容, 矩形图片路径列表
|
|
15
|
+
工作流程:
|
|
16
|
+
1. 读取PDF文件
|
|
17
|
+
2. 提取PDF页面中的图片区域和文本区域
|
|
18
|
+
3. 合并和吸附图片和文本区域
|
|
19
|
+
4. 页面矩形区域标注,保存页面图片和矩形区域图片
|
|
20
|
+
5. 创建页面解析线程池
|
|
21
|
+
6. 调用大模型解析页面图片
|
|
22
|
+
7. 保存解析结果到Markdown文件
|
|
23
|
+
"""
|
|
24
|
+
import logging
|
|
25
|
+
import fitz
|
|
26
|
+
import os
|
|
27
|
+
import base64
|
|
28
|
+
import shapely.geometry as sg
|
|
29
|
+
from typing import List, Tuple
|
|
30
|
+
from shapely.validation import explain_validity
|
|
31
|
+
from .utils import merge_rects, adsorb_rects_to_rects, remove_markdown_backticks
|
|
32
|
+
from .prompts import DEFAULT_PROMPT, DEFAULT_RECT_PROMPT, DEFAULT_SYS_PROMPT, REFINE_PROMPT, REFINE_SYS_PROMPT
|
|
33
|
+
from .models import init_vlm, init_llm
|
|
34
|
+
import concurrent.futures
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
# NOTE(review): basicConfig() runs when this module is imported, so importing the
# package configures the root logger of the host application (INFO level, custom
# format). Libraries usually leave logging configuration to the caller - confirm
# this is intended before changing it, since callers may rely on the output.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
38
|
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def parse_rects(page: fitz.Page):
    """
    Find the rectangular regions of a page that hold drawings and images.

    Vector drawings and embedded images are converted to rectangles and merged
    when they lie close together. Text blocks close to those rectangles are
    then absorbed into them, the result is merged once more, and regions that
    are not larger than 20 units in both width and height are dropped.

    @param page: page to analyse
    @return: list of ``(x0, y0, x1, y1)`` bounds, one tuple per region
    """

    def is_short_line(drawing):
        # A nearly flat drawing (height < 1) that is less than 30 units wide.
        x0, y0, x1, y1 = drawing['rect']
        return abs(y1 - y0) < 1 and abs(x1 - x0) < 30

    def is_large_content(block):
        # Text blocks averaging more than 5 characters per line count as "large".
        text = block[4]
        return len(text) / max(1, len(text.split('\n'))) > 5

    # 1. Vector drawings, ignoring short horizontal lines.
    rect_list = [
        sg.box(*drawing['rect'])
        for drawing in page.get_drawings()
        if not is_short_line(drawing)
    ]

    # 2. Embedded images.
    rect_list += [sg.box(*image['bbox']) for image in page.get_image_info()]

    # 3. Merge neighbouring rectangles and keep only valid geometries.
    merged_rects = merge_rects(rect_list, distance=10, horizontal_distance=100)
    merged_rects = [rect for rect in merged_rects if explain_validity(rect) == 'Valid Geometry']

    # 4. Text blocks: extract them once and split into large and small blocks.
    small_text_area_rects = []
    large_text_area_rects = []
    for block in page.get_text('blocks'):
        bucket = large_text_area_rects if is_large_content(block) else small_text_area_rects
        bucket.append(sg.box(*block[:4]))

    # 5. Absorb text into the regions: large blocks only with a tiny tolerance
    #    (the original note calls this "fully intersecting"), small blocks when
    #    they are merely close.
    _, merged_rects = adsorb_rects_to_rects(large_text_area_rects, merged_rects, distance=0.1)
    _, merged_rects = adsorb_rects_to_rects(small_text_area_rects, merged_rects, distance=5)

    # 6. Absorbing text may have brought regions together; merge again.
    merged_rects = merge_rects(merged_rects, distance=10)

    # 7. Drop regions that are too small.
    merged_rects = [
        rect for rect in merged_rects
        if rect.bounds[2] - rect.bounds[0] > 20 and rect.bounds[3] - rect.bounds[1] > 20
    ]

    # 8. Return plain bounds tuples.
    return [rect.bounds for rect in merged_rects]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def parse_pdf_to_images(pdf_path, output_dir = './'):
    """
    Render every page of a PDF, and every detected region on it, to PNG files.

    For each page the regions returned by ``parse_rects`` are saved as
    ``<page>_<index>.png``. Each region is then outlined in red and labelled
    with its file name on the in-memory page, and the annotated page is saved
    as ``<page>.png``. The PDF file itself is not modified.

    @param pdf_path: path of the PDF file
    @param output_dir: directory receiving the images (created if missing)
    @return image_infos: one ``(page_image_path, region_image_names)`` tuple per
        page; region entries are bare file names inside ``output_dir``
    """
    # Make sure the target directory exists before the first pix.save() call.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    image_infos = []

    # The context manager closes the document even if rendering a page fails.
    with fitz.open(pdf_path) as pdf_document:
        # 1. Walk through every page.
        for page_index, page in enumerate(pdf_document):
            logger.info(f'parse page: {page_index}')
            rect_images = []

            # 2./3. Detect the regions of the page and handle them one by one.
            for index, rect in enumerate(parse_rects(page)):
                fitz_rect = fitz.Rect(rect)

                # 4. Save the region as its own image (matrix scale 4).
                pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4))
                name = f'{page_index}_{index}.png'
                pix.save(os.path.join(output_dir, name))
                rect_images.append(name)

                # 5. Outline the region in red, one unit outside its bounds.
                big_fitz_rect = fitz.Rect(fitz_rect.x0 - 1, fitz_rect.y0 - 1, fitz_rect.x1 + 1, fitz_rect.y1 + 1)
                page.draw_rect(big_fitz_rect, color=(1, 0, 0), width=1)

                # 6. Write the image name in the top-left corner of the region,
                #    on a white background.
                text_x = fitz_rect.x0 + 2
                text_y = fitz_rect.y0 + 10
                text_rect = fitz.Rect(text_x, text_y - 9, text_x + 80, text_y + 2)
                page.draw_rect(text_rect, color=(1, 1, 1), fill=(1, 1, 1))
                page.insert_text((text_x, text_y), name, fontsize=10, color=(1, 0, 0))

            # 7. Save the whole page, now carrying all outlines and labels.
            page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(3, 3))
            page_image = os.path.join(output_dir, f'{page_index}.png')
            page_image_with_rects.save(page_image)

            # 8. Record (page image path, region image names).
            image_infos.append((page_image, rect_images))

    return image_infos
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def parse_pdf(
    pdf_path: str,
    output_dir: str = './',
    api_key: str = '',
    base_url: str = '',
    model: str = '',
    workers: int = 1,
    prompt = DEFAULT_PROMPT,
    rect_prompt = DEFAULT_RECT_PROMPT,
    sys_prompt = DEFAULT_SYS_PROMPT,
) -> Tuple[str, List[str]]:
    """
    Convert a PDF to markdown by sending each rendered page to a vision model.

    Output goes to ``<output_dir>/<pdf name without extension>/``: the markdown
    file (same base name, ``.md``) and the region images. The intermediate
    full-page images are deleted before returning.

    A page whose model call fails does not abort the run; its slot in the
    markdown holds an ``Error processing page N: ...`` line instead.

    @param pdf_path: path of the PDF file
    @param output_dir: parent directory for the per-document output directory
    @param api_key: API key (required)
    @param base_url: API base URL (required)
    @param model: model name (required)
    @param workers: number of worker threads used for the model calls
    @param prompt: user prompt sent with every page
    @param rect_prompt: extra prompt added when a page has marked regions; a
        ``%s`` in it is replaced by the comma-separated region image names,
        otherwise the names are appended after it
    @param sys_prompt: system prompt
    @return: (markdown content, region image file names)
    @raise ValueError: if api_key, base_url or model is empty
    """
    # Fail fast: check the credentials before any page is rendered to disk.
    if not api_key or not base_url or not model:
        raise ValueError("api_key, base_url, and model are required parameters")

    # splitext strips only the real extension, whatever its case, and leaves
    # any other ".pdf" inside the file name alone.
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = os.path.join(output_dir, stem)
    os.makedirs(output_dir, exist_ok=True)
    file_name = stem + '.md'

    image_infos = parse_pdf_to_images(pdf_path, output_dir=output_dir)
    parse_model = init_vlm(api_key, base_url, model)

    def _process_page(index: int, image_info: Tuple[str, List[str]]) -> Tuple[int, str]:
        """Send one page image to the model and return (page index, content)."""
        page_image, rect_images = image_info
        local_prompt = prompt
        if rect_images:
            names = ', '.join(rect_images)
            if '%s' in rect_prompt:
                # Fill the placeholder instead of sending a literal "%s".
                local_prompt += rect_prompt.replace('%s', names, 1)
            else:
                local_prompt += rect_prompt + names

        # Read and encode the image up front so the file is not held open
        # for the duration of the model call.
        with open(page_image, "rb") as image_file:
            encoded_page = base64.b64encode(image_file.read()).decode('utf-8')

        try:
            messages = [
                {"role": "system", "content": sys_prompt},
                {"role": "user", "content": [
                    {"type": "text", "text": local_prompt},
                    {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded_page}"}}
                ]}
            ]
            logger.info(f' extract page: {index+1}')
            response = parse_model.invoke(messages)
            return index, response.content
        except Exception as e:
            # Best effort: keep going with the other pages, but leave a trace
            # in the log as well as in the generated markdown.
            logger.exception('failed to extract page %s', index + 1)
            return index, f"Error processing page {index+1}: {str(e)}"

    # Pages finish in arbitrary order; store each result at its page index.
    contents = [None] * len(image_infos)
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        futures = [
            executor.submit(_process_page, index, image_info)
            for index, image_info in enumerate(image_infos)
        ]
        for future in concurrent.futures.as_completed(futures):
            index, content = future.result()
            contents[index] = remove_markdown_backticks(content)

    # Write the combined markdown file.
    output_path = os.path.join(output_dir, file_name)
    content = '\n\n'.join(contents)
    logger.info(f' save output file: {output_path}')
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(content)

    # Remove the intermediate page images; the region images are kept because
    # the markdown may reference them.
    all_rect_images = []
    for page_image, rect_images in image_infos:
        if os.path.exists(page_image):
            os.remove(page_image)
        all_rect_images.extend(rect_images)

    return content, all_rect_images
|
|
222
|
+
|
|
223
|
+
def refine_markdown(
    markdown_path: str,
    api_key: str = '',
    base_url: str = '',
    model: str = '',
    prompt = REFINE_PROMPT,
    sys_prompt = REFINE_SYS_PROMPT,
) -> str:
    """
    Ask a language model to restructure an existing markdown file.

    The refined text is written next to the input with ``_refined`` inserted
    before the file extension (``doc.md`` -> ``doc_refined.md``); the input
    file is left untouched.

    @param markdown_path: path of the markdown file to refine
    @param api_key: API key (required)
    @param base_url: API base URL (required)
    @param model: model name (required)
    @param prompt: user prompt template; ``{markdown}`` is replaced by the
        file content via ``str.format``
    @param sys_prompt: system prompt
    @return: the refined markdown content returned by the model
    @raise ValueError: if api_key, base_url or model is empty
    """
    if not api_key or not base_url or not model:
        raise ValueError("api_key, base_url, and model are required parameters")
    refine_model = init_llm(api_key, base_url, model)

    with open(markdown_path, 'r', encoding='utf-8') as f:
        markdown = f.read()

    user_prompt = prompt.format(markdown=markdown)
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
            {"type": "text", "text": user_prompt},
        ]}
    ]
    logger.info(f' start refine markdown: {markdown_path}')
    response = refine_model.invoke(messages)

    # Build the output name from the real extension only. A plain
    # str.replace('.md', ...) would also rewrite ".md" inside directory names
    # and would overwrite the input when the path has no ".md" at all.
    root, extension = os.path.splitext(markdown_path)
    output_file = f'{root}_refined{extension}'
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(response.content)
    logger.info(f' save refined output file: {output_file}')
    return response.content
|
recpdf/prompts.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# User prompt sent with every page image: asks the model to transcribe the page
# as markdown in the page's own language, to infer heading levels, to use
# $$ $$ / $ $ for formulas, and to skip explanations, long rules and page numbers.
DEFAULT_PROMPT = """使用markdown语法,将图片中识别到的文字转换为markdown格式输出。你必须做到:
1. 输出语言必须与图片中识别到的文字语言保持一致,例如,识别到英语的字段,输出的内容必须是英语;识别到中文的字段,输出的内容必须是中文;
2. 大小标题的判定:根据图片中文字的位置、文字的含义、字体的大小、字体的颜色深度、字体的粗细度等因素,综合判断文字是否为标题,并合理确定标题等级。确定为标题的文字用markdown语法标注标题等级。
3. 不要解释和输出无关的文字,直接输出图片中的内容。例如,严禁输出 "以下是我根据图片内容生成的markdown文本:"这样的例子,而是应该直接输出markdown。
4. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
再次强调,不要解释和输出无关的文字,直接输出图片中的内容。一定要去除页码,页码通常是位于页面的底部数字。
"""

# Extra user prompt for pages with marked regions: tells the model that regions
# are outlined in red and named, and that tables/images should be inserted with
# ![]() syntax. Contains a literal "%s" placeholder, apparently meant for the
# region names - confirm how the caller fills it.
DEFAULT_RECT_PROMPT = """图片中用红色框和名称(%s)标注出了一些区域。如果区域是表格或者图片,使用 ![]() 的形式插入到输出内容中,否则直接输出文字内容。
"""

# System prompt for page parsing: the model acts as a PDF parser emitting
# markdown and LaTeX.
DEFAULT_SYS_PROMPT = """你是一个PDF文档解析器,使用markdown和latex语法输出图片的内容。
"""

# System prompt for the refine step: the model acts as a markdown editor that
# improves the document structure.
REFINE_SYS_PROMPT = """你是一个markdown文档编辑专家,使用markdown和latex语法编辑文档内容,使文档内容结构更合理。
"""
|
|
17
|
+
|
|
18
|
+
# User prompt template for the refine step. It is rendered with
# str.format(markdown=...), so "{markdown}" is the only placeholder; the stray
# "%s" that used to precede it was never substituted and reached the model
# verbatim, so it has been removed.
REFINE_PROMPT = """你将收到一个markdown文档。请根据文档内容,使用markdown语法调整文档的标题等级和段落结构,使文档内容结构更合理。你必须做到:
1、各级标题的判定:根据文档内容的语义和逻辑,判定文章的标题、二级标题、三级标题等。
2. 标题等级的调整:根据文档内容的语义和逻辑,合理调整标题等级,使文章的层次书序结构合理,标题与段落内容相关。
3. 段落结构的调整:根据文档内容的语义和逻辑,合理调整段落结构,使段落结构合理。尤其要注意,有些段落的行可能没有填满整行就折行了,需要调整为整行。
4. 不要增加任何额外的内容,只调整标题等级层次和段落结构.
5. 不要删减文档中的任何内容,只调整标题等级和段落结构。尤其要注意,不要遗漏任何图片、表格、公式等保持不变的元素,也不要改变它们在文章中的位置。
6. 不要更改文档的内容,只调整标题等级层次和段落结构。
7. 不要做任何解释和输出无关的文字,直接输出调整后的markdown文档内容。

以下是markdown文档内容:

{markdown}
"""
|
recpdf/utils.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""
|
|
2
|
+
工具函数
|
|
3
|
+
"""
|
|
4
|
+
import shapely.geometry as sg
|
|
5
|
+
|
|
6
|
+
def _is_near(rect1, rect2, distance = 20):
|
|
7
|
+
"""
|
|
8
|
+
检查两个矩形是否靠近,如果它们之间的距离小于目标距离。
|
|
9
|
+
@param rect1: 矩形1
|
|
10
|
+
@param rect2: 矩形2
|
|
11
|
+
@param distance: 目标距离
|
|
12
|
+
@return: 是否靠近
|
|
13
|
+
"""
|
|
14
|
+
return rect1.buffer(0.1).distance(rect2.buffer(0.1)) < distance
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _is_horizontal_near(rect1, rect2, distance = 100):
|
|
18
|
+
"""
|
|
19
|
+
检查两个矩形是否水平靠近,如果其中一个矩形是水平线。
|
|
20
|
+
@param rect1: 矩形1
|
|
21
|
+
@param rect2: 矩形2
|
|
22
|
+
@param distance: 目标距离
|
|
23
|
+
@return: 是否水平靠近
|
|
24
|
+
"""
|
|
25
|
+
result = False
|
|
26
|
+
if abs(rect1.bounds[3] - rect1.bounds[1]) < 0.1 or abs(rect2.bounds[3] - rect2.bounds[1]) < 0.1:
|
|
27
|
+
if abs(rect1.bounds[0] - rect2.bounds[0]) < 0.1 and abs(rect1.bounds[2] - rect2.bounds[2]) < 0.1:
|
|
28
|
+
result = abs(rect1.bounds[3] - rect2.bounds[3]) < distance
|
|
29
|
+
return result
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _union_rects(rect1, rect2):
    """
    Build the bounding box that encloses two rectangles.

    @param rect1: first rectangle
    @param rect2: second rectangle
    @return: a box spanning the bounds of the union of both rectangles
    """
    combined = rect1.union(rect2)
    return sg.box(*combined.bounds)
|
|
40
|
+
|
|
41
|
+
# 图形矩形合并函数
|
|
42
|
+
def merge_rects(rect_list, distance = 20, horizontal_distance = None):
    """
    Merge rectangles that lie close to each other into their bounding boxes.

    Two rectangles are merged when ``_is_near`` accepts them for ``distance``
    or, if ``horizontal_distance`` is given, when ``_is_horizontal_near``
    accepts them. Passes are repeated until one completes without a merge.

    The input list is not modified; a new list is returned.

    @param rect_list: rectangles to merge
    @param distance: threshold passed to ``_is_near``
    @param horizontal_distance: threshold passed to ``_is_horizontal_near``;
        a falsy value (None, 0) disables that check
    @return: list of merged rectangles
    """
    # Work on a copy so the caller's list is left intact.
    pending = list(rect_list)
    merged_any = True
    while merged_any:
        merged_any = False
        settled = []
        while pending:
            current, candidates = pending[0], pending[1:]
            # Rebuild the remainder instead of removing from the list being
            # iterated, which would silently skip the element after each hit.
            pending = []
            for other in candidates:
                if _is_near(current, other, distance) or (
                        horizontal_distance and _is_horizontal_near(current, other, horizontal_distance)):
                    current = _union_rects(current, other)
                    merged_any = True
                else:
                    pending.append(other)
            settled.append(current)
        # A merge may have grown a box towards others; run another pass.
        pending = settled
    return pending
|
|
65
|
+
|
|
66
|
+
# 文本矩形吸附函数
|
|
67
|
+
def adsorb_rects_to_rects(source_rects, target_rects, distance=10):
    """
    Absorb source rectangles into the first target rectangle they are near to.

    Each source rectangle is compared with the targets in order. On the first
    target accepted by ``_is_near``, that target is replaced in place by the
    bounding box of both. Sources with no near target are collected instead.

    ``target_rects`` is modified in place and is also the list returned.

    @param source_rects: rectangles to absorb
    @param target_rects: rectangles that may grow
    @param distance: threshold passed to ``_is_near``
    @return: (sources that were not absorbed, target_rects)
    """
    leftovers = []
    for candidate in source_rects:
        for position, anchor in enumerate(target_rects):
            if not _is_near(candidate, anchor, distance):
                continue
            target_rects[position] = _union_rects(candidate, anchor)
            break
        else:
            # No target was close enough.
            leftovers.append(candidate)
    return leftovers, target_rects
|
|
87
|
+
|
|
88
|
+
def remove_markdown_backticks(content: str) -> str:
    """
    Strip a ```markdown fence from model output.

    Content without the text "```markdown" is returned unchanged. Otherwise
    every "```markdown" followed by a newline is removed, and then the last
    remaining "```" (if any) is removed as well.
    """
    if '```markdown' not in content:
        return content

    without_opening = content.replace('```markdown\n', '')
    # rpartition yields ('', '', text) when no fence is left, so joining the
    # outer parts returns the text as-is in that case.
    before, _, after = without_opening.rpartition('```')
    return before + after
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: recpdf
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Using large model to parse and translate PDF.
|
|
5
|
+
Author-Email: FreeCode001 <freecode0902@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Requires-Dist: python-dotenv>=1.2.1
|
|
9
|
+
Requires-Dist: shapely>=2.1.2
|
|
10
|
+
Requires-Dist: langchain>=1.2.8
|
|
11
|
+
Requires-Dist: pymupdf>=1.26.7
|
|
12
|
+
Requires-Dist: langchain-openai>=1.1.7
|
|
13
|
+
Description-Content-Type: text/markdown
|
|
14
|
+
|
|
15
|
+
# RecPDF
|
|
16
|
+
|
|
17
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
18
|
+
[Python 3.11+](https://www.python.org/downloads/release/python-3110/)
|
|
19
|
+
|
|
20
|
+
RecPDF是一个使用大模型解析和转换PDF文档的Python包,能够识别PDF中的文本、图片、表格、公式等元素,并将其转换为结构化的markdown格式。
|
|
21
|
+
|
|
22
|
+
## 功能特点
|
|
23
|
+
|
|
24
|
+
- 📄 解析PDF文档中的文本、图片、表格、公式等元素
|
|
25
|
+
- 🤖 支持使用各种大模型进行智能解析
|
|
26
|
+
- 🔄 多线程并行处理,提高解析速度
|
|
27
|
+
- 📝 输出结构化的markdown格式,保留原始文档的层次结构
|
|
28
|
+
- 🎨 智能识别标题层级,优化文档结构
|
|
29
|
+
- 🖼️ 自动处理图片和表格,保持文档的视觉完整性
|
|
30
|
+
|
|
31
|
+
## 安装
|
|
32
|
+
|
|
33
|
+
使用pip安装RecPDF:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install recpdf
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
## 依赖项
|
|
40
|
+
|
|
41
|
+
- python-dotenv>=1.2.1
|
|
42
|
+
- shapely>=2.1.2
|
|
43
|
+
- langchain>=1.2.8
|
|
44
|
+
- pymupdf>=1.26.7
|
|
45
|
+
- langchain-openai>=1.1.7
|
|
46
|
+
|
|
47
|
+
## 快速开始
|
|
48
|
+
|
|
49
|
+
### 基本使用
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from recpdf import parse_pdf
|
|
53
|
+
|
|
54
|
+
# 解析PDF文件
|
|
55
|
+
content, rect_images = parse_pdf(
|
|
56
|
+
pdf_path="path/to/your/document.pdf",
|
|
57
|
+
output_dir="./output",
|
|
58
|
+
api_key="your_api_key",
|
|
59
|
+
base_url="your_api_base_url",
|
|
60
|
+
model="your_model_name",
|
|
61
|
+
workers=2 # 多线程处理
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
print("解析完成,markdown内容已保存到指定目录")
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### 使用环境变量
|
|
68
|
+
|
|
69
|
+
您也可以把API相关参数保存在环境变量中,在代码里读取后再传给 `parse_pdf`,从而避免把密钥硬编码在代码里(这些参数仍然需要显式传入,缺少任何一个都会抛出 `ValueError`):
|
|
70
|
+
|
|
71
|
+
1. 创建一个 `.env`文件:
|
|
72
|
+
|
|
73
|
+
```
|
|
74
|
+
VLM_API_KEY=your_api_key
|
|
75
|
+
VLM_API_BASE=your_api_base_url
|
|
76
|
+
VLM_API_MODEL=your_model_name
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
2. 然后在代码中加载环境变量:
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import os
|
|
83
|
+
from dotenv import load_dotenv
|
|
84
|
+
from recpdf import parse_pdf
|
|
85
|
+
|
|
86
|
+
load_dotenv()
|
|
87
|
+
|
|
88
|
+
api_key = os.getenv('VLM_API_KEY')
|
|
89
|
+
base_url = os.getenv('VLM_API_BASE')
|
|
90
|
+
model = os.getenv('VLM_API_MODEL')
|
|
91
|
+
|
|
92
|
+
content, rect_images = parse_pdf(
|
|
93
|
+
pdf_path="path/to/your/document.pdf",
|
|
94
|
+
output_dir="./output",
|
|
95
|
+
api_key=api_key,
|
|
96
|
+
base_url=base_url,
|
|
97
|
+
model=model
|
|
98
|
+
)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### 高级功能
|
|
102
|
+
|
|
103
|
+
#### 调整Markdown结构
|
|
104
|
+
|
|
105
|
+
RecPDF还提供了一个 `refine_markdown`函数,可以进一步优化生成的markdown文档结构:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
from recpdf.parser import refine_markdown
|
|
109
|
+
|
|
110
|
+
refined_content = refine_markdown(
|
|
111
|
+
markdown_path="path/to/your/output.md",
|
|
112
|
+
api_key="your_api_key",
|
|
113
|
+
base_url="your_api_base_url",
|
|
114
|
+
model="your_model_name"
|
|
115
|
+
)
|
|
116
|
+
|
|
117
|
+
print("Markdown结构优化完成")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
## 工作原理
|
|
121
|
+
|
|
122
|
+
1. **PDF解析**:使用PyMuPDF库提取PDF页面中的文本、图片和图形元素
|
|
123
|
+
2. **区域识别**:通过几何分析识别和合并页面中的内容区域
|
|
124
|
+
3. **图像生成**:将识别到的区域转换为高清图像
|
|
125
|
+
4. **大模型解析**:调用配置的大模型解析图像内容,识别文本、表格、公式等
|
|
126
|
+
5. **Markdown生成**:将解析结果转换为结构化的markdown格式
|
|
127
|
+
6. **可选优化**:使用大模型进一步优化markdown文档结构
|
|
128
|
+
|
|
129
|
+
## 项目结构
|
|
130
|
+
|
|
131
|
+
```
|
|
132
|
+
recpdf/
|
|
133
|
+
├── __init__.py # 包入口,导出主要函数
|
|
134
|
+
├── parser.py # 核心解析功能实现
|
|
135
|
+
├── models.py # 模型初始化模块
|
|
136
|
+
├── prompts.py # 解析提示词定义
|
|
137
|
+
└── utils.py # 工具函数
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
## 示例
|
|
141
|
+
|
|
142
|
+
项目提供了一些示例PDF文件和输出结果,位于 `examples/`目录中:
|
|
143
|
+
|
|
144
|
+
- `examples/test1.pdf` - 简单文本PDF示例
|
|
145
|
+
- `examples/test2.pdf` - 包含图片的PDF示例
|
|
146
|
+
- `examples/test3.pdf` - 包含表格和公式的复杂PDF示例
|
|
147
|
+
- `examples/output/` - 解析结果输出目录
|
|
148
|
+
|
|
149
|
+
## 配置要求
|
|
150
|
+
|
|
151
|
+
- Python 3.11或更高版本
|
|
152
|
+
- 有效的大模型API密钥和访问地址
|
|
153
|
+
- 支持视觉理解的大模型(如GPT-4V、Claude 3等),并且需要通过OpenAI兼容的API接口访问(代码中固定使用 `model_provider="openai"`)
|
|
154
|
+
|
|
155
|
+
## 许可证
|
|
156
|
+
|
|
157
|
+
本项目采用MIT许可证,详见[LICENSE](LICENSE)文件。
|
|
158
|
+
|
|
159
|
+
## 贡献
|
|
160
|
+
|
|
161
|
+
欢迎提交问题和拉取请求来改进这个项目!
|
|
162
|
+
|
|
163
|
+
## 联系方式
|
|
164
|
+
|
|
165
|
+
- 作者:FreeCode
|
|
166
|
+
- 邮箱:freecode0902@gmail.com
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
recpdf-0.1.0.dist-info/METADATA,sha256=smV-mYQ9LwFjqFq8fsfxufzJ63pwFOwmu8IdepVH3dU,4403
|
|
2
|
+
recpdf-0.1.0.dist-info/WHEEL,sha256=Wb0ASbVj8JvWHpOiIpPi7ucfIgJeCi__PzivviEAQFc,90
|
|
3
|
+
recpdf-0.1.0.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
|
|
4
|
+
recpdf-0.1.0.dist-info/licenses/LICENSE,sha256=FH-K2UOWCVM6xokmofQ9aAl4Y8zStxLC-KVuZPXZmzI,1067
|
|
5
|
+
recpdf/__init__.py,sha256=2xMhnjfCt2lbIt-7MqQ5xTrWebGOgtk4R8HjJiSfIeo,46
|
|
6
|
+
recpdf/models.py,sha256=yo3qg9Iv4iTf-1TmZFUMpKctxcGjsr5hlz8x9owEZLA,800
|
|
7
|
+
recpdf/parser.py,sha256=kQLpHNso_o5BaBa12h32zOFp9lQxjGFubgb3zL5Treo,10668
|
|
8
|
+
recpdf/prompts.py,sha256=1HAbiO_kxkb8syrGB1WIBHXixEmjmiBlVEFO8w3qCKI,2839
|
|
9
|
+
recpdf/utils.py,sha256=KnJ_0X9ho0zMeQtzjOpwTCKBRoKcc-5scjpJus_7UFc,3355
|
|
10
|
+
recpdf-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
The MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Chen Li
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
|
13
|
+
all copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
|
21
|
+
THE SOFTWARE.
|