chem-pdf2ppt 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,425 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ 图表提取脚本 — 多策略提取 PDF 中的矢量图、嵌入式图片,含回退机制
4
+ Chart Extraction — multi-strategy: vector graphics, embedded images, page renders as fallback
5
+ """
6
+ import fitz
7
+ import os
8
+ import sys
9
+ import json
10
+ import io
11
+ from PIL import Image
12
+
13
+
14
def _safe_print(msg):
    """Print ``msg`` without crashing on consoles with a limited encoding.

    A plain ``print`` is tried first. If the output stream cannot encode the
    text (for example a GBK console on Windows), the message is re-encoded
    with the stream's own encoding using ``errors='replace'``, so only the
    characters the stream really cannot represent become ``?``. When the
    stream does not report a usable encoding, ASCII is used instead.

    Args:
        msg: The message to print; converted with ``str()`` for the fallback.
    """
    try:
        print(msg)
    except UnicodeEncodeError:
        text = str(msg)
        stream_encoding = getattr(sys.stdout, "encoding", None) or "ascii"
        try:
            printable = text.encode(
                stream_encoding, errors="replace").decode(stream_encoding)
        except LookupError:
            # The stream advertised a codec Python does not know.
            printable = text.encode("ascii", errors="replace").decode("ascii")
        print(printable)
20
+
21
+
22
+ # ============================================================
23
+ # 核心提取函数 / Core Extraction Functions
24
+ # ============================================================
25
+
26
def extract_embedded_images(pdf_path, output_dir, min_size=100):
    """Extract bitmap images embedded in a PDF and write them to disk.

    Each image referenced by a page is written in its stored format, using
    the extension reported by ``doc.extract_image``. Only images whose pixel
    width AND height are both strictly greater than ``min_size`` are kept.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory for the image files (created if missing).
        min_size: Exclusive lower bound, in pixels, for width and height.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` and ``method``.
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted = []

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            for img_idx, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                try:
                    base = doc.extract_image(xref)
                except Exception as e:
                    # One unreadable image should not abort the whole run.
                    _safe_print(
                        f" [warn] Page {page_num+1}: image {img_idx+1} "
                        f"could not be extracted: {e}")
                    continue
                if not base:
                    # Nothing usable came back for this xref; skip it rather
                    # than fail on a missing key below.
                    continue

                width = base["width"]
                height = base["height"]
                if not (width > min_size and height > min_size):
                    continue

                fname = f"p{page_num+1}_img{img_idx+1}.{base['ext']}"
                with open(os.path.join(output_dir, fname), "wb") as f:
                    f.write(base["image"])
                extracted.append({
                    "page": page_num + 1,
                    "file": fname,
                    "size": f"{width}x{height}",
                    "method": "embedded"
                })
                _safe_print(f" [embedded] {fname} ({width}x{height})")
    finally:
        # Release the document even when extraction or writing fails.
        doc.close()

    return len(extracted), extracted
60
+
61
+
62
def _get_pymupdf_version():
    """Return the major version number of PyMuPDF, or 0 when unknown.

    ``fitz.version`` is read; when it is a tuple or list its first element
    is used, otherwise the value itself. The part before the first ``.`` is
    parsed as an integer, so both ``"1.23.8"`` and ``1`` yield ``1``.

    Returns:
        int: The major version, or 0 if it is missing or cannot be parsed.
    """
    ver = getattr(fitz, "version", None)
    if isinstance(ver, (tuple, list)):
        ver = ver[0] if ver else None
    if ver is None:
        return 0
    try:
        # Always go through str() so non-string components (e.g. a float)
        # are handled the same way as "1.23.8".
        return int(str(ver).strip().split(".")[0])
    except ValueError:
        return 0
71
+
72
+
73
def _cluster_drawings_compat(page, x_tolerance=3, y_tolerance=3):
    """Cluster the vector drawings of a page into bounding rectangles.

    When the page object provides ``cluster_drawings()`` (the original notes
    cite PyMuPDF >= 1.23) it is used directly. Otherwise, or if that call
    fails, the rectangles from ``page.get_drawings()`` are clustered
    manually: each rectangle is padded by the tolerances, overlapping padded
    rectangles are merged until no two clusters intersect, and the padding
    is removed again before clipping to the page.

    Args:
        page: The PyMuPDF page to analyse.
        x_tolerance: Horizontal distance under which drawings are joined.
        y_tolerance: Vertical distance under which drawings are joined.

    Returns:
        list: Cluster rectangles. The manual path returns them sorted
        top-to-bottom, then left-to-right; an empty list means no usable
        drawings were found.
    """
    # Preferred path: built-in clustering.
    if hasattr(page, 'cluster_drawings'):
        try:
            return page.cluster_drawings(
                x_tolerance=x_tolerance, y_tolerance=y_tolerance)
        except Exception:
            # Deliberate best effort: fall through to manual clustering.
            pass

    # Fallback: cluster the rects reported by get_drawings() ourselves.
    try:
        drawings = page.get_drawings()
    except Exception:
        return []

    if not drawings:
        return []

    padded = []
    for d in drawings:
        r = d.get('rect')
        if r is None:
            continue
        # Drop drawings thinner than 0.5 in either direction (the original
        # treats these as tiny decorative lines).
        if r.x1 - r.x0 < 0.5 or r.y1 - r.y0 < 0.5:
            continue
        # Pad so rects closer than the tolerance overlap and get merged.
        padded.append(fitz.Rect(
            r.x0 - x_tolerance, r.y0 - y_tolerance,
            r.x1 + x_tolerance, r.y1 + y_tolerance
        ))

    if not padded:
        return []

    padded.sort(key=lambda r: (r.y0, r.x0))

    clusters = []
    for r in padded:
        merged = r
        # Absorbing one cluster can make the union reach another one, so
        # repeat until a full pass absorbs nothing. This keeps the final
        # clusters pairwise disjoint (a single pass does not).
        absorbed = True
        while absorbed:
            absorbed = False
            untouched = []
            for c in clusters:
                if merged.intersects(c):
                    merged = merged | c
                    absorbed = True
                else:
                    untouched.append(c)
            clusters = untouched
        clusters.append(merged)

    clusters.sort(key=lambda c: (c.y0, c.x0))

    # Remove the tolerance padding again and clip to the page.
    return [
        fitz.Rect(
            c.x0 + x_tolerance, c.y0 + y_tolerance,
            c.x1 - x_tolerance, c.y1 - y_tolerance
        ) & page.rect
        for c in clusters
    ]
144
+
145
+
146
def extract_vector_figures(pdf_path, output_dir, dpi=200, min_size=100,
                           x_tolerance=3, y_tolerance=3):
    """Render clustered vector drawings of every page to PNG files.

    For each page the drawings are clustered with
    ``_cluster_drawings_compat``. Clusters narrower or lower than
    ``min_size`` (in page units) are skipped; the others are enlarged by
    10 units on every side, clipped to the page and rendered at ``dpi``.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory for the PNG files (created if missing).
        dpi: Render resolution; the zoom factor is ``dpi / 72``.
        min_size: Minimum cluster width and height in page units.
        x_tolerance: Horizontal clustering tolerance.
        y_tolerance: Vertical clustering tolerance.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` and ``method``.
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted = []

    # The render matrix depends only on dpi, so build it once.
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc[page_num]

            try:
                drawing_rects = _cluster_drawings_compat(
                    page, x_tolerance=x_tolerance, y_tolerance=y_tolerance)
            except Exception as e:
                _safe_print(f" [warn] Page {page_num+1}: drawing clustering failed: {e}")
                continue

            for idx, rect in enumerate(drawing_rects):
                if rect.width < min_size or rect.height < min_size:
                    continue

                # Add a small margin, but never leave the page area.
                rect = (rect + (-10, -10, 10, 10)) & page.rect
                if rect.width <= 0 or rect.height <= 0:
                    # Nothing of the cluster lies on the page.
                    continue

                pix = page.get_pixmap(matrix=mat, clip=rect)

                fname = f"p{page_num+1}_vec{idx+1}.png"
                pix.save(os.path.join(output_dir, fname))
                extracted.append({
                    "page": page_num + 1,
                    "file": fname,
                    "size": f"{int(rect.width)}x{int(rect.height)}",
                    "method": "vector"
                })
                _safe_print(f" [vector] {fname} ({int(rect.width)}x{int(rect.height)})")
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()

    return len(extracted), extracted
189
+
190
+
191
def extract_vector_multi_tolerance(pdf_path, output_dir, dpi=200, min_size=100):
    """Extract vector figures by clustering with several tolerances in turn.

    The tolerances (3,3), (6,6), (10,10), (15,15) and (20,20) are tried in
    that order on every page. A region is rendered only once: regions whose
    page, integer origin and integer size were already seen are skipped.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory for the PNG files (created if missing).
        dpi: Render resolution; the zoom factor is ``dpi / 72``.
        min_size: Minimum cluster width and height in page units.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` and ``method``.
    """
    tolerances = [(3, 3), (6, 6), (10, 10), (15, 15), (20, 20)]
    # Create the target directory like the other extractors do, so this
    # function also works when called on its own.
    os.makedirs(output_dir, exist_ok=True)

    all_extracted = []
    seen_regions = set()

    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    # One open document serves all tolerance passes.
    doc = fitz.open(pdf_path)
    try:
        for x_tol, y_tol in tolerances:
            for page_num in range(len(doc)):
                page = doc[page_num]
                try:
                    rects = _cluster_drawings_compat(
                        page, x_tolerance=x_tol, y_tolerance=y_tol)
                except Exception:
                    continue

                for rect in rects:
                    if rect.width < min_size or rect.height < min_size:
                        continue

                    rect = (rect + (-10, -10, 10, 10)) & page.rect
                    if rect.width <= 0 or rect.height <= 0:
                        # Nothing of the cluster lies on the page.
                        continue

                    region = (page_num, int(rect.x0), int(rect.y0),
                              int(rect.width), int(rect.height))
                    if region in seen_regions:
                        continue
                    seen_regions.add(region)

                    pix = page.get_pixmap(matrix=mat, clip=rect)
                    fname = f"p{page_num+1}_vec{len(all_extracted)+1}.png"
                    pix.save(os.path.join(output_dir, fname))
                    all_extracted.append({
                        "page": page_num + 1,
                        "file": fname,
                        "size": f"{int(rect.width)}x{int(rect.height)}",
                        "method": f"vector(tol={x_tol},{y_tol})"
                    })
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()

    _safe_print(f" [vector multi-tol] {len(all_extracted)} figures total")
    return len(all_extracted), all_extracted
233
+
234
+
235
def extract_page_renders(pdf_path, output_dir, pages=None, dpi=200):
    """Render whole pages to PNG files (fallback when figure extraction fails).

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory for the PNG files (created if missing).
        pages: Iterable of ZERO-BASED page indexes to render; ``None``
            renders every page. Indexes outside the document are skipped
            with a warning.
        dpi: Render resolution; the zoom factor is ``dpi / 72``.

    Returns:
        tuple: ``(count, extracted)`` where ``extracted`` is a list of dicts
        with the keys ``page`` (1-based), ``file``, ``size`` (pixels) and
        ``method``.
    """
    os.makedirs(output_dir, exist_ok=True)
    extracted = []

    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)

    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        if pages is None:
            pages = range(page_count)

        for page_num in pages:
            if not 0 <= page_num < page_count:
                # A bad index should not abort the remaining renders.
                _safe_print(
                    f" [warn] Page index {page_num} is outside the document "
                    f"({page_count} pages); skipped")
                continue

            pix = doc[page_num].get_pixmap(matrix=mat)

            fname = f"page_{page_num+1}_render.png"
            pix.save(os.path.join(output_dir, fname))
            extracted.append({
                "page": page_num + 1,
                "file": fname,
                "size": f"{pix.width}x{pix.height}",
                "method": "page_render"
            })
    finally:
        # Release the document even when rendering or saving fails.
        doc.close()

    return len(extracted), extracted
263
+
264
+
265
def detect_figure_pages(pdf_path):
    """Find the pages on which figure numbers are first mentioned.

    The text of every page is searched for ``Figure N``, ``Fig. N`` (also
    the plural forms) and ``图N``, case-insensitively. The Latin keywords
    must start at a word boundary so that words merely ending in "fig"
    (e.g. "Config 3") are not taken for figure references.

    Note that the first mention may be a reference in running text rather
    than the caption, so the page is a heuristic.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        dict: Maps each figure number (as a string, e.g. ``"3"``) to the
        1-based number of the first page that mentions it.
    """
    import re

    # Compiled once instead of being looked up for every page.
    figure_ref = re.compile(
        r'(?:\b(?:Figures?|Figs?\.?)|图)\s*(\d+)', re.I)

    figure_pages = {}
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            for fig_num in figure_ref.findall(text):
                # Keep only the first page on which a number appears.
                figure_pages.setdefault(fig_num, page_num + 1)
    finally:
        # Release the document even when text extraction fails.
        doc.close()

    return figure_pages
282
+
283
+
284
+ # ============================================================
285
+ # 智能提取 + 回退 / Smart Extract with Fallback
286
+ # ============================================================
287
+
288
def smart_extract_figures(pdf_path, output_dir, dpi=200, min_size=100,
                          fallback_pages=True, report_path=None):
    """
    Extract the figures of a paper PDF using several strategies with fallback.

    Strategy order:
      1. Vector clustering with the default tolerances (3, 3).
      2. If fewer than 3 vector figures were found, retry with the relaxed
         tolerances of ``extract_vector_multi_tolerance``. Those results
         start at (3, 3) again and therefore replace the results of step 1
         instead of being counted in addition to them.
      3. Extract embedded bitmap images.
      4. If fewer than 3 figures were found in total and ``fallback_pages``
         is true, render every page that mentions a figure as a whole.

    Args:
        pdf_path: Path of the PDF file to read.
        output_dir: Directory for all output files (created if missing).
        dpi: Render resolution passed to the render-based strategies.
        min_size: Minimum figure size passed to the extraction strategies.
        fallback_pages: Whether step 4 may render whole pages.
        report_path: If given, the report is also written there as JSON.

    Returns:
        dict: Report with the keys ``source``, ``dpi``, ``min_size``,
        ``strategies``, ``all_extracted``, ``warnings`` and ``suggestions``.
    """
    os.makedirs(output_dir, exist_ok=True)

    report = {
        "source": os.path.basename(pdf_path),
        "dpi": dpi,
        "min_size": min_size,
        "strategies": {},
        "all_extracted": [],
        "warnings": [],
        "suggestions": [],
    }

    _safe_print(f"Processing: {os.path.basename(pdf_path)}")
    _safe_print("-" * 50)

    # Strategy 1: Default vector extraction
    _safe_print("\n[1/4] Vector graphics (default tolerances)...")
    n_vec_default, vec_default = extract_vector_figures(
        pdf_path, output_dir, dpi, min_size, x_tolerance=3, y_tolerance=3)
    report["strategies"]["vector_default"] = {"count": n_vec_default, "items": vec_default}
    vector_items = list(vec_default)

    # Strategy 2: Multi-tolerance if default found too few
    if n_vec_default < 3:
        _safe_print(f"\n[2/4] Only {n_vec_default} vector figures found. Trying relaxed tolerances...")
        n_vec_multi, vec_multi = extract_vector_multi_tolerance(
            pdf_path, output_dir, dpi, min_size)
        report["strategies"]["vector_multi_tol"] = {"count": n_vec_multi, "items": vec_multi}

        if n_vec_multi >= n_vec_default:
            # The multi-tolerance run repeats the (3, 3) pass, so it already
            # contains every region of strategy 1 and may have overwritten
            # files of the same name. Counting both would list figures twice
            # and could wrongly suppress the page-render fallback.
            rewritten = {item["file"] for item in vec_multi}
            for item in vec_default:
                if item["file"] in rewritten:
                    continue
                try:
                    # Duplicate of a region that now lives under a new name.
                    os.remove(os.path.join(output_dir, item["file"]))
                except OSError:
                    pass
            report["strategies"]["vector_default"]["superseded"] = True
            vector_items = list(vec_multi)
        else:
            # Not expected; keep everything rather than lose figures.
            vector_items.extend(vec_multi)
    else:
        _safe_print("\n[2/4] Skipped (enough vector figures found)")
        report["strategies"]["vector_multi_tol"] = {"count": 0, "items": [], "skipped": True}

    report["all_extracted"].extend(vector_items)

    # Strategy 3: Embedded images
    _safe_print("\n[3/4] Embedded images...")
    n_emb, emb_items = extract_embedded_images(pdf_path, output_dir, min_size)
    report["strategies"]["embedded"] = {"count": n_emb, "items": emb_items}
    report["all_extracted"].extend(emb_items)

    # Strategy 4: Page renders as fallback
    total_figures = len(report["all_extracted"])
    if total_figures < 3 and fallback_pages:
        _safe_print(f"\n[4/4] Only {total_figures} figures — rendering figure pages...")
        fig_pages = detect_figure_pages(pdf_path)
        pages_to_render = sorted(set(fig_pages.values()))
        # detect_figure_pages reports 1-based page numbers, while
        # extract_page_renders indexes pages from 0.
        n_pages, page_items = extract_page_renders(
            pdf_path, output_dir,
            pages=[page_no - 1 for page_no in pages_to_render], dpi=dpi)
        report["strategies"]["page_render_fallback"] = {
            "count": n_pages,
            "pages": pages_to_render,
            "figure_page_map": fig_pages,
            "items": page_items
        }
        report["all_extracted"].extend(page_items)
        report["warnings"].append(
            f"Vector extraction found few figures. Rendered {n_pages} pages as images. "
            f"Figures detected on pages: {pages_to_render}"
        )
        report["suggestions"].append(
            "Consider using page_*_render.png as figure placeholders, "
            "or manually crop from these full-page renders."
        )
    elif total_figures < 3:
        _safe_print(f"\n[4/4] Only {total_figures} figures found!")
        report["warnings"].append(
            f"Only {total_figures} figures extracted. "
            "The PDF may use vector rendering that resists automatic extraction."
        )
        report["suggestions"].append(
            "Try: (1) increase --dpi to 400, (2) use pdf2image for page-level conversion, "
            "(3) manually crop figures from page renders."
        )
    else:
        _safe_print(f"\n[4/4] Skipped ({total_figures} figures extracted)")

    # Summary (counted again so that page renders are included)
    total_extracted = len(report["all_extracted"])
    _safe_print("\n" + "-" * 50)
    _safe_print("Extraction complete!")
    _safe_print(f" Total extracted: {total_extracted}")
    _safe_print(f" Output directory: {output_dir}")

    if report["warnings"]:
        _safe_print("\n Warnings:")
        for w in report["warnings"]:
            _safe_print(f" - {w}")
    if report["suggestions"]:
        _safe_print("\n Suggestions:")
        for s in report["suggestions"]:
            _safe_print(f" - {s}")

    # Save report
    if report_path:
        report_dir = os.path.dirname(report_path)
        if report_dir:
            os.makedirs(report_dir, exist_ok=True)
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2, ensure_ascii=False)
        _safe_print(f"\nReport saved: {report_path}")

    return report
399
+
400
+
401
def main():
    """Command-line entry point.

    Usage: ``extract_charts.py <pdf_file> [output_dir] [dpi] [--report]``.
    ``--report`` may appear at any position and is never mistaken for the
    output directory or the dpi value. A dpi that is not a positive integer
    falls back to 200.

    Returns:
        int: 0 on success, 1 when the arguments are missing or the PDF file
        does not exist.
    """
    args = sys.argv[1:]
    do_report = "--report" in args
    # Separate the flag from the positional arguments so that e.g.
    # "paper.pdf --report" does not create a directory named "--report".
    positionals = [arg for arg in args if arg != "--report"]

    if not positionals:
        print("Usage: python extract_charts.py <pdf_file> [output_dir] [dpi] [--report]")
        print()
        print("Examples:")
        print(" python extract_charts.py paper.pdf")
        print(" python extract_charts.py paper.pdf charts 300")
        print(" python extract_charts.py paper.pdf figures 400 --report")
        return 1

    pdf_path = positionals[0]
    output_dir = positionals[1] if len(positionals) > 1 else "extracted_charts"

    dpi = 200
    if len(positionals) > 2:
        try:
            requested_dpi = int(positionals[2])
        except ValueError:
            requested_dpi = 0
        if requested_dpi > 0:
            # Zero or negative values cannot be rendered; keep the default.
            dpi = requested_dpi

    if not os.path.exists(pdf_path):
        print(f"Error: File not found: {pdf_path}")
        return 1

    report_path = os.path.join(output_dir, "extraction_report.json") if do_report else None
    smart_extract_figures(pdf_path, output_dir, dpi=dpi, report_path=report_path)
    return 0


if __name__ == "__main__":
    # Pass the result on as the process exit status so callers can detect failure.
    sys.exit(main())