chem-pdf2ppt 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +235 -0
- package/README_EN.md +239 -0
- package/SKILL.md +469 -0
- package/SKILL_EN.md +473 -0
- package/assets/academic_template.html +197 -0
- package/cli.js +57 -0
- package/examples/example_usage.py +407 -0
- package/index.js +109 -0
- package/package.json +50 -0
- package/references/chemistry_templates.md +228 -0
- package/references/chemistry_templates_en.md +228 -0
- package/references/visual_style.md +172 -0
- package/references/visual_style_en.md +172 -0
- package/requirements.txt +20 -0
- package/scripts/analyze_paper.py +334 -0
- package/scripts/convert_to_images.py +67 -0
- package/scripts/create_ppt.py +712 -0
- package/scripts/extract_charts.py +425 -0
- package/scripts/generate_html.py +288 -0
|
@@ -0,0 +1,425 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
图表提取脚本 — 多策略提取 PDF 中的矢量图、嵌入式图片,含回退机制
|
|
4
|
+
Chart Extraction — multi-strategy: vector graphics, embedded images, page renders as fallback
|
|
5
|
+
"""
|
|
6
|
+
import fitz
|
|
7
|
+
import os
|
|
8
|
+
import sys
|
|
9
|
+
import json
|
|
10
|
+
import io
|
|
11
|
+
from PIL import Image
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _safe_print(msg):
|
|
15
|
+
"""Windows-safe print that avoids GBK encoding crashes."""
|
|
16
|
+
try:
|
|
17
|
+
print(msg)
|
|
18
|
+
except UnicodeEncodeError:
|
|
19
|
+
print(msg.encode('ascii', errors='replace').decode('ascii'))
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ============================================================
|
|
23
|
+
# 核心提取函数 / Core Extraction Functions
|
|
24
|
+
# ============================================================
|
|
25
|
+
|
|
26
|
+
def extract_embedded_images(pdf_path, output_dir, min_size=100):
    """Extract bitmap images embedded in a PDF and write them to disk.

    Every image referenced by a page is pulled out with
    ``doc.extract_image`` and written unchanged, using the extension
    reported by PyMuPDF. Images whose width or height is not strictly
    greater than ``min_size`` pixels are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the image files (created if
            missing).
        min_size: Minimum width and height, in pixels, an image must exceed.

    Returns:
        tuple: ``(count, extracted)`` where ``count`` is the number of files
        written and ``extracted`` is a list of dicts with the keys
        ``page`` (1-based), ``file``, ``size`` and ``method``.
    """
    os.makedirs(output_dir, exist_ok=True)
    count = 0
    extracted = []

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]
            images = page.get_images(full=True)

            for img_idx, img in enumerate(images):
                xref = img[0]
                # One unreadable image must not abort the whole extraction.
                try:
                    base = doc.extract_image(xref)
                except Exception as e:
                    _safe_print(
                        f" [warn] Page {page_num+1}: image xref {xref} "
                        f"could not be extracted: {e}")
                    continue
                if not base:
                    # Nothing usable was returned for this xref.
                    continue

                width = base["width"]
                height = base["height"]
                if width <= min_size or height <= min_size:
                    continue

                ext = base["ext"]
                fname = f"p{page_num+1}_img{img_idx+1}.{ext}"
                output_path = os.path.join(output_dir, fname)
                with open(output_path, "wb") as f:
                    f.write(base["image"])
                count += 1
                extracted.append({
                    "page": page_num + 1,
                    "file": fname,
                    "size": f"{width}x{height}",
                    "method": "embedded"
                })
                _safe_print(f" [embedded] {fname} ({width}x{height})")
    finally:
        # Release the document even if reading or writing failed midway.
        doc.close()

    return count, extracted
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _get_pymupdf_version():
    """Return the major version number of PyMuPDF, or 0 if it is unknown.

    The value is read from the first element of ``fitz.version`` when that
    attribute is a tuple. Any other shape, or any error while parsing,
    yields 0.
    """
    try:
        version_info = fitz.version
        if not isinstance(version_info, tuple):
            return 0
        head = version_info[0]
        if '.' in str(head):
            # Dotted version string: keep only the leading component.
            return int(head.split('.')[0])
        return int(head)
    except Exception:
        return 0
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _cluster_drawings_compat(page, x_tolerance=3, y_tolerance=3):
    """Group the vector drawings on a page into candidate figure rectangles.

    Works across PyMuPDF versions:

    * If the page object offers ``cluster_drawings`` it is used directly.
    * Otherwise (or if that call fails) the rectangles returned by
      ``page.get_drawings()`` are clustered manually: each rectangle is
      padded by the tolerances, overlapping rectangles are merged until no
      two clusters intersect any more, and the padding is removed again.

    Args:
        page: Page object providing ``cluster_drawings`` and/or
            ``get_drawings`` plus a ``rect`` attribute.
        x_tolerance: Horizontal distance within which drawings are
            considered part of the same cluster.
        y_tolerance: Vertical distance within which drawings are considered
            part of the same cluster.

    Returns:
        list: Cluster rectangles; empty if the page has no usable drawings
        or the drawings could not be read.
    """
    # Preferred path: the built-in clustering of newer PyMuPDF releases.
    if hasattr(page, 'cluster_drawings'):
        try:
            return page.cluster_drawings(
                x_tolerance=x_tolerance, y_tolerance=y_tolerance)
        except Exception:
            # Deliberate best effort: fall through to manual clustering.
            pass

    # Fallback: cluster the rectangles of get_drawings() by hand.
    try:
        drawings = page.get_drawings()
    except Exception:
        return []

    if not drawings:
        return []

    rects = []
    for d in drawings:
        r = d.get('rect')
        if r is None:
            continue
        w = r.x1 - r.x0
        h = r.y1 - r.y0
        if w < 0.5 or h < 0.5:  # drop very thin decorative strokes
            continue
        # Pad by the tolerance so that nearby drawings overlap and merge.
        rects.append(fitz.Rect(
            r.x0 - x_tolerance, r.y0 - y_tolerance,
            r.x1 + x_tolerance, r.y1 + y_tolerance
        ))

    if not rects:
        return []

    rects.sort(key=lambda r: (r.y0, r.x0))

    # Merge overlapping rectangles until a full pass changes nothing.
    # A single pass is not enough: a cluster that grew by absorbing a
    # rectangle may now overlap a cluster it did not touch before.
    clusters = rects
    changed = True
    while changed:
        changed = False
        merged = []
        for r in clusters:
            for i, c in enumerate(merged):
                if r.intersects(c):
                    merged[i] = c | r
                    changed = True
                    break
            else:
                merged.append(r)
        clusters = merged

    # Remove the tolerance padding again and clip to the page area.
    result = []
    for c in clusters:
        result.append(fitz.Rect(
            c.x0 + x_tolerance, c.y0 + y_tolerance,
            c.x1 - x_tolerance, c.y1 - y_tolerance
        ) & page.rect)

    return result
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def extract_vector_figures(pdf_path, output_dir, dpi=200, min_size=100,
                           x_tolerance=3, y_tolerance=3):
    """Render clusters of vector drawings to PNG files.

    For every page the drawings are clustered with
    ``_cluster_drawings_compat``. Clusters narrower or lower than
    ``min_size`` are skipped; the rest are enlarged by a 10-unit margin,
    clipped to the page and rendered at ``dpi``.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the PNG files (created if
            missing).
        dpi: Rendering resolution; the zoom factor is ``dpi / 72``.
        min_size: Minimum cluster width and height, compared against the
            cluster rectangle in page coordinates.
        x_tolerance: Horizontal clustering tolerance.
        y_tolerance: Vertical clustering tolerance.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` and ``method``.
        ``size`` is taken from the clip rectangle (page coordinates), not
        from the rendered pixmap.
    """
    os.makedirs(output_dir, exist_ok=True)
    count = 0
    extracted = []

    # The zoom matrix is the same for every figure, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            try:
                drawing_rects = _cluster_drawings_compat(
                    page, x_tolerance=x_tolerance, y_tolerance=y_tolerance)
            except Exception as e:
                _safe_print(f" [warn] Page {page_num+1}: drawing clustering failed: {e}")
                continue

            for idx, rect in enumerate(drawing_rects):
                if rect.width < min_size or rect.height < min_size:
                    continue

                # Add a small margin around the cluster, then keep it on the page.
                rect = rect + (-10, -10, 10, 10)
                rect = rect & page.rect

                pix = page.get_pixmap(matrix=mat, clip=rect)

                fname = f"p{page_num+1}_vec{idx+1}.png"
                output_path = os.path.join(output_dir, fname)
                pix.save(output_path)
                count += 1
                extracted.append({
                    "page": page_num + 1,
                    "file": fname,
                    "size": f"{int(rect.width)}x{int(rect.height)}",
                    "method": "vector"
                })
                _safe_print(f" [vector] {fname} ({int(rect.width)}x{int(rect.height)})")
    finally:
        # Release the document even if rendering or saving failed midway.
        doc.close()

    return count, extracted
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
def extract_vector_multi_tolerance(pdf_path, output_dir, dpi=200, min_size=100,
                                   tolerances=None):
    """Extract vector figures while trying several clustering tolerances.

    Each tolerance pair is applied to every page in turn. A cluster is
    rendered only once: clusters that map to the same page, integer origin
    and integer size as an earlier one are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the PNG files (created if
            missing).
        dpi: Rendering resolution; the zoom factor is ``dpi / 72``.
        min_size: Minimum cluster width and height, compared against the
            cluster rectangle in page coordinates.
        tolerances: Optional iterable of ``(x_tolerance, y_tolerance)``
            pairs, tried in order. Defaults to
            ``(3, 3), (6, 6), (10, 10), (15, 15), (20, 20)``.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` and ``method``.
    """
    if tolerances is None:
        tolerances = [(3, 3), (6, 6), (10, 10), (15, 15), (20, 20)]

    # Create the target directory here too, so the function works when it
    # is called on its own.
    os.makedirs(output_dir, exist_ok=True)

    all_extracted = []
    seen_hashes = set()

    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    # Open the document once instead of once per tolerance.
    doc = fitz.open(pdf_path)
    try:
        for x_tol, y_tol in tolerances:
            for page_num in range(len(doc)):
                page = doc[page_num]
                try:
                    rects = _cluster_drawings_compat(
                        page, x_tolerance=x_tol, y_tolerance=y_tol)
                except Exception as e:
                    _safe_print(
                        f" [warn] Page {page_num+1}: drawing clustering failed "
                        f"(tol={x_tol},{y_tol}): {e}")
                    continue

                for rect in rects:
                    if rect.width < min_size or rect.height < min_size:
                        continue

                    rect = (rect + (-10, -10, 10, 10)) & page.rect
                    rect_hash = f"{page_num}_{int(rect.x0)}_{int(rect.y0)}_{int(rect.width)}_{int(rect.height)}"

                    if rect_hash in seen_hashes:
                        continue
                    seen_hashes.add(rect_hash)

                    pix = page.get_pixmap(matrix=mat, clip=rect)
                    # The running total keeps file names unique across
                    # tolerances within this call.
                    fname = f"p{page_num+1}_vec{len(all_extracted)+1}.png"
                    output_path = os.path.join(output_dir, fname)
                    pix.save(output_path)
                    all_extracted.append({
                        "page": page_num + 1,
                        "file": fname,
                        "size": f"{int(rect.width)}x{int(rect.height)}",
                        "method": f"vector(tol={x_tol},{y_tol})"
                    })
    finally:
        # Release the document even if rendering or saving failed midway.
        doc.close()

    _safe_print(f" [vector multi-tol] {len(all_extracted)} figures total")
    return len(all_extracted), all_extracted
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def extract_page_renders(pdf_path, output_dir, pages=None, dpi=200):
    """Render whole pages to PNG files (fallback when figure extraction fails).

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives the PNG files (created if
            missing).
        pages: Iterable of **0-based** page indices to render, or ``None``
            to render every page. Indices outside the document are skipped
            with a warning instead of aborting the run.
        dpi: Rendering resolution; the zoom factor is ``dpi / 72``.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` (pixel size of
        the rendered image) and ``method``.
    """
    os.makedirs(output_dir, exist_ok=True)
    count = 0
    extracted = []

    # The zoom matrix is the same for every page, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(pdf_path)
    try:
        n_pages = len(doc)
        if pages is None:
            pages = range(n_pages)

        for page_num in pages:
            # Guard against indices the document cannot serve (for example
            # 1-based page numbers passed by mistake).
            if not -n_pages <= page_num < n_pages:
                _safe_print(
                    f" [warn] Page index {page_num} is outside the document "
                    f"({n_pages} pages); skipped")
                continue

            page = doc[page_num]
            pix = page.get_pixmap(matrix=mat)

            fname = f"page_{page_num+1}_render.png"
            output_path = os.path.join(output_dir, fname)
            pix.save(output_path)
            count += 1
            extracted.append({
                "page": page_num + 1,
                "file": fname,
                "size": f"{pix.width}x{pix.height}",
                "method": "page_render"
            })
    finally:
        # Release the document even if rendering or saving failed midway.
        doc.close()

    return count, extracted
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def detect_figure_pages(pdf_path):
    """Find the pages on which numbered figures are first mentioned.

    The text of every page is searched for ``Figure N``, ``Fig. N``,
    ``Fig N`` (case-insensitive) and ``图N``. The Latin keywords must start
    at a word boundary, so words such as "config 3" are not mistaken for a
    figure reference.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict: Maps the figure number (as a string) to the 1-based number of
        the first page whose text mentions it. Note that the first mention
        may be a reference in running text rather than the figure itself.
    """
    # Local import: only this function needs the module.
    import re

    # Compiled once, outside the page loop.
    fig_pattern = re.compile(r'(?:\b(?:Figure|Fig\.?)|图)\s*(\d+)', re.I)

    figure_pages = {}
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            for fig_num in fig_pattern.findall(text):
                # Keep the first page on which each figure number appears.
                if fig_num not in figure_pages:
                    figure_pages[fig_num] = page_num + 1
    finally:
        # Release the document even if text extraction failed midway.
        doc.close()

    return figure_pages
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# ============================================================
|
|
285
|
+
# 智能提取 + 回退 / Smart Extract with Fallback
|
|
286
|
+
# ============================================================
|
|
287
|
+
|
|
288
|
+
def smart_extract_figures(pdf_path, output_dir, dpi=200, min_size=100,
                          fallback_pages=True, report_path=None):
    """Extract all figures of a paper, falling back through several strategies.

    Strategy order:
        1. Vector clustering with the default tolerances (3, 3).
        2. If fewer than 3 figures were found, vector clustering with
           progressively looser tolerances.
        3. Embedded bitmap images.
        4. If fewer than 3 figures were found in total and
           ``fallback_pages`` is true, whole-page renders of the pages on
           which figures are mentioned.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory that receives all output files (created if
            missing).
        dpi: Rendering resolution passed to the rendering strategies.
        min_size: Minimum figure size passed to the extraction strategies.
        fallback_pages: Whether strategy 4 may render whole pages.
        report_path: If given, the report is also written there as JSON.

    Returns:
        dict: Report with the keys ``source``, ``dpi``, ``min_size``,
        ``strategies``, ``all_extracted``, ``warnings`` and ``suggestions``.
    """
    os.makedirs(output_dir, exist_ok=True)

    report = {
        "source": os.path.basename(pdf_path),
        "dpi": dpi,
        "min_size": min_size,
        "strategies": {},
        "all_extracted": [],
        "warnings": [],
        "suggestions": [],
    }

    _safe_print(f"Processing: {os.path.basename(pdf_path)}")
    _safe_print("-" * 50)

    # Strategy 1: Default vector extraction
    _safe_print("\n[1/4] Vector graphics (default tolerances)...")
    n_vec_default, vec_default = extract_vector_figures(
        pdf_path, output_dir, dpi, min_size, x_tolerance=3, y_tolerance=3)
    report["strategies"]["vector_default"] = {"count": n_vec_default, "items": vec_default}
    report["all_extracted"].extend(vec_default)

    # Strategy 2: Multi-tolerance if default found too few
    if n_vec_default < 3:
        _safe_print(f"\n[2/4] Only {n_vec_default} vector figures found. Trying relaxed tolerances...")
        n_vec_multi, vec_multi = extract_vector_multi_tolerance(
            pdf_path, output_dir, dpi, min_size)
        report["strategies"]["vector_multi_tol"] = {"count": n_vec_multi, "items": vec_multi}
        # Both vector strategies write "p<page>_vec<n>.png" into the same
        # directory, so this pass may have overwritten files of strategy 1.
        # Drop the stale entries so every file is listed exactly once, with
        # the metadata of what is actually on disk.
        overwritten = {item["file"] for item in vec_multi}
        report["all_extracted"] = [
            item for item in report["all_extracted"]
            if item["file"] not in overwritten
        ]
        report["all_extracted"].extend(vec_multi)
    else:
        _safe_print("\n[2/4] Skipped (enough vector figures found)")
        report["strategies"]["vector_multi_tol"] = {"count": 0, "items": [], "skipped": True}

    # Strategy 3: Embedded images
    _safe_print("\n[3/4] Embedded images...")
    n_emb, emb_items = extract_embedded_images(pdf_path, output_dir, min_size)
    report["strategies"]["embedded"] = {"count": n_emb, "items": emb_items}
    report["all_extracted"].extend(emb_items)

    # Strategy 4: Page renders as fallback
    total_figures = len(report["all_extracted"])
    if total_figures < 3 and fallback_pages:
        _safe_print(f"\n[4/4] Only {total_figures} figures — rendering figure pages...")
        fig_pages = detect_figure_pages(pdf_path)
        pages_to_render = sorted(set(fig_pages.values()))
        # detect_figure_pages reports 1-based page numbers, whereas
        # extract_page_renders indexes pages from 0.
        n_pages, page_items = extract_page_renders(
            pdf_path, output_dir,
            pages=[page_no - 1 for page_no in pages_to_render], dpi=dpi)
        report["strategies"]["page_render_fallback"] = {
            "count": n_pages,
            "pages": pages_to_render,
            "figure_page_map": fig_pages,
            "items": page_items
        }
        report["all_extracted"].extend(page_items)
        report["warnings"].append(
            f"Vector extraction found few figures. Rendered {n_pages} pages as images. "
            f"Figures detected on pages: {pages_to_render}"
        )
        report["suggestions"].append(
            "Consider using page_*_render.png as figure placeholders, "
            "or manually crop from these full-page renders."
        )
    elif total_figures < 3:
        _safe_print(f"\n[4/4] Only {total_figures} figures found!")
        report["warnings"].append(
            f"Only {total_figures} figures extracted. "
            "The PDF may use vector rendering that resists automatic extraction."
        )
        report["suggestions"].append(
            "Try: (1) increase --dpi to 400, (2) use pdf2image for page-level conversion, "
            "(3) manually crop figures from page renders."
        )
    else:
        _safe_print(f"\n[4/4] Skipped ({total_figures} figures extracted)")

    # Summary — count again so that fallback page renders are included.
    total_extracted = len(report["all_extracted"])
    _safe_print("\n" + "-" * 50)
    _safe_print(f"Extraction complete!")
    _safe_print(f" Total extracted: {total_extracted}")
    _safe_print(f" Output directory: {output_dir}")

    if report["warnings"]:
        _safe_print(f"\n Warnings:")
        for w in report["warnings"]:
            _safe_print(f" - {w}")
    if report["suggestions"]:
        _safe_print(f"\n Suggestions:")
        for s in report["suggestions"]:
            _safe_print(f" - {s}")

    # Save report
    if report_path:
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        _safe_print(f"\nReport saved: {report_path}")

    return report
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
def main():
    """Command-line entry point.

    Usage: ``python extract_charts.py <pdf_file> [output_dir] [dpi] [--report]``

    Arguments starting with ``--`` are treated as flags wherever they
    appear, so ``paper.pdf --report`` no longer turns ``--report`` into the
    output directory. ``dpi`` falls back to 200 unless it is a positive
    decimal integer. With ``--report`` an ``extraction_report.json`` file is
    written into the output directory.
    """
    args = sys.argv[1:]
    do_report = "--report" in args
    # Separate flags from positionals so their relative order is irrelevant.
    positional = [arg for arg in args if not arg.startswith("--")]

    if not positional:
        print("Usage: python extract_charts.py <pdf_file> [output_dir] [dpi] [--report]")
        print()
        print("Examples:")
        print(" python extract_charts.py paper.pdf")
        print(" python extract_charts.py paper.pdf charts 300")
        print(" python extract_charts.py paper.pdf figures 400 --report")
        return

    pdf_path = positional[0]
    output_dir = positional[1] if len(positional) > 1 else "extracted_charts"

    dpi = 200
    # isdecimal() only accepts strings int() can parse; a dpi of 0 would
    # produce a zero zoom factor, so it is rejected as well.
    if len(positional) > 2 and positional[2].isdecimal() and int(positional[2]) > 0:
        dpi = int(positional[2])

    if not os.path.exists(pdf_path):
        print(f"Error: File not found: {pdf_path}")
        return

    report_path = os.path.join(output_dir, "extraction_report.json") if do_report else None
    smart_extract_figures(pdf_path, output_dir, dpi=dpi, report_path=report_path)
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|