chem-pdf2ppt 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,172 @@
1
+ # Chemistry Academic PPT Visual Standards
2
+
3
+ This document defines the visual design standards for chemistry academic PPTs, targeting **professional, publication-quality** presentations.
4
+
5
+ ## Color Schemes
6
+
7
+ ### Preset Themes
8
+
9
+ #### 1. Academic Classic (academic) — Default, General Chemistry
10
+
11
+ | Role | Hex | Usage |
12
+ |------|-----|-------|
13
+ | Primary | `#003366` | Titles, section slide backgrounds |
14
+ | Background | `#FFFFFF` | Content slide backgrounds |
15
+ | Body text | `#333333` | Main text |
16
+ | Accent | `#B41E1E` | Decorative lines, key data highlights |
17
+ | Light bg | `#F0F4F8` | Summary slide backgrounds, table zebra stripes |
18
+ | Muted | `#8C8C8C` | Page numbers, figure captions, source labels |
19
+
20
+ #### 2. Molecular Tech (molecular) — Computational Chemistry / Materials
21
+
22
+ | Role | Hex | Usage |
23
+ |------|-----|-------|
24
+ | Primary | `#1A5276` | Titles, section slide backgrounds |
25
+ | Background | `#F8F9FA` | Content slide backgrounds |
26
+ | Body text | `#2C3E50` | Main text |
27
+ | Accent | `#E74C3C` | Decorative lines, key data highlights |
28
+ | Light bg | `#EBF0F5` | Summary slide backgrounds, table zebra stripes |
29
+
30
+ #### 3. Green Chemistry (green) — Catalysis / Energy / Environment
31
+
32
+ | Role | Hex | Usage |
33
+ |------|-----|-------|
34
+ | Primary | `#1E5631` | Titles, section slide backgrounds |
35
+ | Background | `#F7F9F4` | Content slide backgrounds |
36
+ | Body text | `#333333` | Main text |
37
+ | Accent | `#D4A017` | Decorative lines, key data highlights |
38
+ | Light bg | `#EEF3E9` | Summary slide backgrounds, table zebra stripes |
39
+
40
+ #### 4. Nature Style (nature) — CNS (Cell / Nature / Science) Journal Presentations
41
+
42
+ | Role | Hex | Usage |
43
+ |------|-----|-------|
44
+ | Primary | `#222222` | Titles, section slide backgrounds |
45
+ | Background | `#FFFFFF` | Content slide backgrounds |
46
+ | Body text | `#444444` | Main text |
47
+ | Accent | `#0066CC` | Decorative lines, key data highlights |
48
+ | Light bg | `#F8F8F8` | Summary slide backgrounds, table zebra stripes |
49
+
50
+ ### Color Principles
51
+
52
+ - One theme per deck — do not mix themes
53
+ - Accent color only for decorative lines, key numbers, emphasis marks — ≤ 5% of slide area
54
+ - Dark section slides must use white or near-white text for sufficient contrast
55
+ - No fluorescent colors, rainbow gradients, or decorative shadows
56
+
57
+ ## Typography
58
+
59
+ ### Recommended Fonts
60
+
61
+ | Usage | Chinese | English / Numbers | Size |
62
+ |------|---------|-------------------|------|
63
+ | Cover title | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold / Helvetica Bold | 36–42pt |
64
+ | Section divider | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold | 36–40pt |
65
+ | Slide title | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold | 28–34pt |
66
+ | Body bullets | Microsoft YaHei Regular / Source Han Sans Regular | Arial Regular | 16–20pt |
67
+ | Figure captions/sources | Microsoft YaHei Regular / Source Han Sans Regular | Arial Regular | 9–11pt |
68
+ | Table content | Microsoft YaHei Regular / Source Han Sans Regular | Arial Regular | 12–14pt |
69
+ | Data highlights | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold | 24–28pt |
70
+
71
+ ### Typography Principles
72
+
73
+ - Title at least 10pt larger than body — clear hierarchy
74
+ - Subscripts in chemical formulas (H₂O, SO₄²⁻) — accept plain text if styling unavailable
75
+ - Variables in italics (*E*a, *k*cat) — not mandatory
76
+ - No more than 3 font sizes per slide
77
+
78
+ ## Layout Standards
79
+
80
+ ### Canvas
81
+
82
+ - 16:9 widescreen (13.333" × 7.5")
83
+ - Horizontal padding ≥ 0.5" (0.7" recommended)
84
+ - Vertical padding ≥ 0.3"
85
+
86
+ ### Title Position
87
+
88
+ - Content slide titles fixed at top-left (0.7", 0.3"), left-aligned
89
+ - Thin decorative line (1.5pt, accent color) placed 0.1–0.15" below the title
90
+ - Section divider titles may be centered or left-aligned (recommended: left-aligned + vertical accent bar)
91
+
92
+ ### Figure Layouts (4 modes)
93
+
94
+ 1. **figure_right** (default): best for landscape figures with detailed captions
95
+ - Left text: 0.7"–6.5" (5.8" wide)
96
+ - Right figure: 7.2"–12.7" (5.5" wide)
97
+
98
+ 2. **figure_top**: best for wide figures with short descriptions
99
+ - Figure area: 0.7"–12.6" (3.0–3.5" tall)
100
+ - Text below in remaining space
101
+
102
+ 3. **figure_left**: best for portrait figures or figure-emphasis slides
103
+ - Left figure: 5.5" wide
104
+ - Right text: 6.2" wide
105
+
106
+ 4. **figure_full**: best for complex mechanism diagrams, multi-panel figures
107
+ - Figure: 0.5"–12.8" (full width)
108
+ - Brief caption at bottom
109
+
110
+ **Selection principle**: let the figure dictate the layout. Wide figures → `figure_top`, tall figures → `figure_left/right`, complex figures → `figure_full`.
111
+
112
+ ## Chemistry-Specific Visual Elements
113
+
114
+ ### Reaction Schemes
115
+
116
+ - Use → or chemistry arrow to connect reactants and products
117
+ - Reaction conditions above or below arrow (small font)
118
+ - Example: `A + B ──→ C (yield: 85%)`
119
+ - Use monospace or Arial font
120
+
121
+ ### Data Highlights
122
+
123
+ - Key numbers (FE%, TOF, yield, selectivity) may be enlarged and boldfaced
124
+ - Use accent color for breakthrough data
125
+ - Do not highlight more than 3 numbers simultaneously
126
+
127
+ ### Energy / Free Energy Diagrams
128
+
129
+ - Preserve original figure; annotate key barrier values on the slide
130
+ - If redrawing: X-axis = reaction coordinate, Y-axis = energy (eV or kcal/mol)
131
+ - Label the rate-determining step barrier
132
+
133
+ ### Characterization Data Interpretation Template
134
+
135
+ ```
136
+ XRD: → Phase confirmed as [...], crystallite size [...] nm (Scherrer)
137
+ TEM: → Morphology: [...], particle size distribution [...] nm
138
+ HRTEM: → Lattice fringe [...] nm, corresponding to [...] plane
139
+ XPS: → [...] element in [...] valence state, binding energy [...] eV
140
+ BET: → Surface area [...] m²/g, pore size [...] nm
141
+ ```
142
+
143
+ ### Performance Data Table Template
144
+
145
+ ```
146
+ | Catalyst | FE(C₂₊)% | j (mA/cm²) | Stability/h | Electrolyte | Ref |
147
+ |------------|----------|------------|-------------|-------------|----------|
148
+ | Ru₁/Cu | 82% | 300 | 100 | 1M KOH | This work |
149
+ | Cu NPs | 45% | 150 | 20 | 1M KOH | [1] |
150
+ | Benchmark | 60% | 200 | 50 | 1M KHCO₃ | [2] |
151
+ ```
152
+
153
+ ## Readability Checklist
154
+
155
+ - [ ] Body text ≥ 16pt
156
+ - [ ] Figure text legible at presentation scale
157
+ - [ ] Sufficient color contrast (dark text on light background, or vice versa)
158
+ - [ ] No text overflow or overlapping
159
+ - [ ] Figures not cropped or distorted
160
+ - [ ] Page numbers consecutive
161
+ - [ ] One core message per slide
162
+
163
+ ## Don'ts
164
+
165
+ - No decorative images, clip art, or emoji
166
+ - No gradient backgrounds
167
+ - No dark backgrounds on content slides
168
+ - No more than 6 bullet points per slide
169
+ - No copy-pasting paper paragraphs verbatim
170
+ - No figures too small to read
171
+ - No cramming multiple dense figures onto one slide
172
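+ - No thick accent bar below titles on regular content slides (accent bars are for section slides only); the thin 1.5pt title line described under "Title Position" is the only title decoration on content slides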
@@ -0,0 +1,20 @@
1
+ # PDF2PPT — Chemistry Academic PPT Generator
2
+ # 化学学术论文 → PowerPoint 转换器
3
+
4
+ # PDF reading and text extraction
5
+ pymupdf>=1.23.0
6
+ pdfplumber>=0.10.0
7
+
8
+ # PPTX creation
9
+ python-pptx>=0.6.23
10
+
11
+ # Image processing
12
+ Pillow>=10.0.0
13
+
14
+ # PDF page to image conversion (optional, requires Poppler)
15
+ pdf2image>=1.17.0
16
+
17
+ # Note: poppler-utils is a system dependency for pdf2image
18
+ # macOS: brew install poppler
19
+ # Linux: sudo apt-get install poppler-utils
20
+ # Windows: https://github.com/oschwartz10612/poppler-windows/releases/
@@ -0,0 +1,334 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ PDF 论文分析脚本 — 化学论文类型识别、章节结构、图表位置、关键信息提取
4
+ Chemistry Paper Analyzer with JSON output and robust encoding handling
5
+ """
6
+ import fitz
7
+ import sys
8
+ import os
9
+ import re
10
+ import json
11
+ from collections import defaultdict
12
+
13
+
14
def _safe_print(msg):
    """Print ``msg`` without crashing on consoles that cannot encode it.

    The message is printed normally first. If the output stream raises
    ``UnicodeEncodeError`` (e.g. a legacy Windows code page), the message is
    printed again with every non-ASCII character replaced by ``?``.

    Args:
        msg: The message to print. Non-string objects (such as exceptions)
            are converted with ``str()`` so the fallback path cannot fail
            with ``AttributeError``.
    """
    text = msg if isinstance(msg, str) else str(msg)
    try:
        print(text)
    except UnicodeEncodeError:
        # Degrade to plain ASCII rather than abort the whole analysis run.
        print(text.encode('ascii', errors='replace').decode('ascii'))
20
+
21
+
22
# ============================================================
# Chemistry Keyword Database
# ============================================================

# Maps a chemistry subfield label to the keywords that signal it.
# Several entries are deliberately stems ("cataly", "photochem") so that
# a substring search also catches their inflected forms.
CHEM_KEYWORDS = {
    "催化": [
        "cataly",
        "TOF",
        "turnover",
        "selectivity",
        "conversion",
        "catalyst",
        "active site",
        "Sabatier",
        "d-band",
        "overpotential",
    ],
    "材料": [
        "MOF",
        "COF",
        "perovskite",
        "zeolite",
        "framework",
        "polymer",
        "nanosheet",
        "nanoparticle",
        "quantum dot",
    ],
    "有机合成": [
        "synthesis",
        "yield",
        "substrate scope",
        "coupling",
        "cross-coupling",
        "organocatal",
        "asymmetric",
        "functional group",
    ],
    "计算化学": [
        "DFT",
        "density functional",
        "VASP",
        "Gaussian",
        "CP2K",
        "Quantum ESPRESSO",
        "molecular dynamics",
        "AIMD",
        "ab initio",
        "MD simulation",
        "free energy",
        "transition state",
        "k-point",
        "pseudopotential",
        "PAW",
        "basis set",
        "B3LYP",
        "PBE",
        "RPBE",
        "SCAN",
        "HSE",
        "GGA",
        "PBE0",
        "Hartree-Fock",
        "coupled cluster",
        "CCSD",
        "MP2",
        "Monte Carlo",
        "metadynamics",
        "enhanced sampling",
        "reaction coordinate",
        "diabatic",
        "adiabatic",
        "reorganization energy",
        "Marcus theory",
    ],
    "电化学": [
        "electrochem",
        "ORR",
        "OER",
        "HER",
        "CO2RR",
        "NRR",
        "Li-ion",
        "battery",
        "supercapacitor",
        "electrolyte",
        "faradaic efficiency",
        "Tafel",
        "RHE",
        "SHE",
    ],
    "光谱/表征": [
        "XRD",
        "XPS",
        "TEM",
        "SEM",
        "STEM",
        "HAADF",
        "NMR",
        "IR",
        "Raman",
        "EXAFS",
        "XANES",
        "BET",
        "EPR",
        "UV-vis",
        "AFM",
        "FTIR",
        "spectroscopy",
    ],
    "环境/大气": [
        "atmospheric",
        "aerosol",
        "SOA",
        "PM2.5",
        "oxidation",
        "OH radical",
        "ozone",
        "photochem",
        "tropospheric",
    ],
    "能源": [
        "solar cell",
        "perovskite solar",
        "water splitting",
        "hydrogen evolution",
        "photocatal",
        "fuel cell",
        "battery",
    ],
    "辐射化学": [
        "radiolysis",
        "pulse radiolysis",
        "hydrated electron",
        "solvated electron",
        "transient absorption",
    ],
}

# Characterization keywords (meant to count as an experimental signal only
# when they show up in the methods / experimental part of the paper).
CHARACTERIZATION_TERMS = [
    "XRD",
    "XPS",
    "TEM",
    "SEM",
    "STEM",
    "HAADF",
    "NMR",
    "IR",
    "Raman",
    "EXAFS",
    "XANES",
    "BET",
    "EPR",
    "UV-vis",
    "AFM",
    "FTIR",
]

# Computational keywords (strong signal).
COMPUTATIONAL_STRONG = [
    "DFT",
    "density functional theory",
    "VASP",
    "Gaussian",
    "CP2K",
    "Quantum ESPRESSO",
    "ab initio molecular dynamics",
    "AIMD",
    "PBE0",
    "B3LYP",
    "PBE",
    "Hartree-Fock",
    "CCSD",
    "MP2",
    "transition state",
    "reaction coordinate",
    "diabatic",
    "pseudopotential",
    "basis set",
    "k-point",
    "cutoff",
    "molecular dynamics simulation",
    "trajector",
]
73
+
74
+
75
# Optional numbering in front of a heading, e.g. "1.", "2.3" or "IV.".
_HEADING_NUMBER = r'^\s*(?:(?:\d+(?:\.\d+)*\.?|[IVX]+\.)\s+)?'

# (compiled pattern, normalized label) pairs, tried in order; first hit wins.
# Patterns ending in "$" must match the whole line; the others are prefix
# matches so that "Methods", "Experimental Section", "Results and
# Discussion" etc. are all recognised.
_SECTION_PATTERNS = [
    (re.compile(_HEADING_NUMBER + tail, re.I), label)
    for tail, label in (
        (r'(?:Abstract|摘要)\s*$', "Abstract"),
        (r'(?:Introduction|引言|绪论)\s*$', "Introduction"),
        (r'(?:Method|Experimental|Computational|计算方法?|实验方法?|Theory)', "Methods"),
        (r'(?:Result|Discussion|结果|讨论)', "Results"),
        (r'(?:Conclusions?|结论|Summary|总结)\s*$', "Conclusions"),
        # "SI" needs a word boundary, otherwise every line starting with
        # "Si..." (Single, Simulation, Silicon, ...) is taken for a heading.
        (r'(?:Supporting Information|SI\b|附录|补充|References?)', "SI/References"),
    )
]

# Method / software / technique names collected into "chemical_methods".
# Acronym groups are case-sensitive and word-bounded: a case-insensitive
# "IR" or "TEM" would otherwise hit "first", "system", and similar words.
_METHOD_PATTERNS = [
    re.compile(r'\b(?:CP2K|VASP|Gaussian\s*\d+|Q-Chem\s*\d+|GROMACS|LAMMPS|Quantum\s+ESPRESSO)\b', re.I),
    re.compile(r'\b(?:PBE0|B3LYP|PBE|RPBE|SCAN|HSE06|GGA|LDA|meta-GGA)\b'),
    re.compile(r'\b(?:impregnation|sol-gel|hydrothermal|solvothermal|co-precipitation|calcination|pyrolysis)\b', re.I),
    re.compile(r'\b(?:XRD|XPS|TEM|SEM|STEM|HAADF-STEM|NMR|IR|Raman|EXAFS|XANES|BET|EPR)\b'),
]

# A caption line starts with one of these words followed by "." or whitespace.
_CAPTION_RE = re.compile(r'(Fig|Figure|Table|Scheme)\s*[\.\s]', re.I)


def _keyword_in_text(keyword, text, text_lower=None):
    """Return True if ``keyword`` occurs in ``text``.

    All-uppercase keywords (acronyms such as "IR", "TEM", "HER") are matched
    case-sensitively as whole tokens, with an optional plural "s", because a
    lowercase substring search finds them inside ordinary English words.
    Every other keyword keeps the case-insensitive substring search so that
    stems like "cataly" still match their inflected forms.

    Args:
        keyword: The keyword or stem to look for.
        text: The text in its original casing.
        text_lower: Optional pre-computed ``text.lower()``.
    """
    if keyword.isupper():
        token = r'(?<![A-Za-z0-9])' + re.escape(keyword) + r's?(?![A-Za-z0-9])'
        return re.search(token, text) is not None
    if text_lower is None:
        text_lower = text.lower()
    return keyword.lower() in text_lower


def _match_section_label(line):
    """Return the normalized section label for a heading line, or None."""
    for pattern, label in _SECTION_PATTERNS:
        if pattern.match(line):
            return label
    return None


def _extract_methods(text):
    """Return the distinct method names found in ``text``, in discovery order.

    Internal whitespace is collapsed (a name may be split across lines in the
    extracted text) and duplicates are dropped case-insensitively.
    """
    found = []
    seen = set()
    for pattern in _METHOD_PATTERNS:
        for hit in pattern.finditer(text):
            method = " ".join(hit.group(0).split())
            key = method.casefold()
            if method and key not in seen:
                seen.add(key)
                found.append(method)
    return found


def _cluster_drawing_rects(page, x_tol=3, y_tol=3):
    """Return bounding rectangles of the vector-drawing clusters on ``page``.

    Uses ``page.cluster_drawings`` when the page object provides it and the
    call succeeds; otherwise falls back to a simple manual merge of the
    rectangles reported by ``page.get_drawings()``.
    """
    if hasattr(page, 'cluster_drawings'):
        try:
            return page.cluster_drawings(x_tolerance=x_tol, y_tolerance=y_tol)
        except Exception:
            pass  # best effort: fall through to the manual clustering below
    try:
        drawings = page.get_drawings()
    except Exception:
        return []

    # Grow every rectangle by the tolerance so near neighbours intersect.
    padded = []
    for drawing in drawings or []:
        r = drawing.get('rect')
        if r and r.x1 - r.x0 > 0.5 and r.y1 - r.y0 > 0.5:
            padded.append(fitz.Rect(r.x0 - x_tol, r.y0 - y_tol, r.x1 + x_tol, r.y1 + y_tol))
    if not padded:
        return []

    padded.sort(key=lambda r: (r.y0, r.x0))
    clusters = [padded[0]]
    # NOTE: single pass; two clusters that only start to overlap after a
    # later merge are not re-merged.
    for r in padded[1:]:
        for i, cluster in enumerate(clusters):
            if r.intersects(cluster):
                clusters[i] = cluster | r
                break
        else:
            clusters.append(r)
    # Undo the padding and clip to the page.
    return [
        fitz.Rect(c.x0 + x_tol, c.y0 + y_tol, c.x1 - x_tol, c.y1 - y_tol) & page.rect
        for c in clusters
    ]


def _find_caption(page, rect):
    """Look for a caption line in the strip directly below ``rect``.

    Returns:
        tuple: ``("figure" | "table", caption)`` for the first caption-like
        line (caption truncated to 150 characters), or ``(None, "")``.
    """
    below = fitz.Rect(
        rect.x0 - 10, rect.y1,
        rect.x1 + 10, min(rect.y1 + 120, page.rect.y1)
    )
    nearby_text = page.get_text(clip=below)
    for line in (nearby_text or "").split('\n'):
        hit = _CAPTION_RE.match(line.lstrip())
        if hit:
            kind = "table" if hit.group(1).lower() == "table" else "figure"
            return kind, line.strip()[:150]
    return None, ""


def analyze_pdf(pdf_path, verbose=True, output_json=None):
    """Analyze the structure and content of a chemistry paper PDF.

    Extracts the page text, guesses the title (first non-empty line of page
    1), detects section headings, figures/tables (drawing clusters with a
    caption below them), the paper type (computational / experimental /
    hybrid), chemistry subfields and method names.

    Args:
        pdf_path: Path of the PDF file.
        verbose: When true, print a short summary of the result.
        output_json: Optional path; when given, the result is also written
            there as UTF-8 JSON.

    Returns:
        dict: The full analysis result. ``figures_detected`` and
        ``tables_detected`` map 1-based page numbers to lists of entries;
        ``errors`` lists non-fatal problems encountered on the way.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    errors = []
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        result = {
            "source": os.path.basename(pdf_path),
            "title": "",
            "paper_type": "experimental",
            "paper_type_confidence": "low",
            "subfields": [],
            "sections": [],
            "figures_detected": defaultdict(list),
            "tables_detected": defaultdict(list),
            "chemical_methods": [],
            "total_pages": page_count,
            "has_supporting_info": False,
            "errors": errors,
        }

        # ---- Text extraction (once per page, reused below) ----
        page_texts = []
        for index in range(page_count):
            try:
                page_texts.append(doc[index].get_text())
            except Exception as e:
                page_texts.append("")
                errors.append(f"Page {index+1} text extraction failed: {e}")

        # ---- Figure / table detection ----
        for index in range(page_count):
            try:
                page = doc[index]
                for rect in _cluster_drawing_rects(page):
                    # Ignore small decorations (rules, logos, bullets).
                    if not (rect.width > 100 and rect.height > 80):
                        continue
                    kind, caption = _find_caption(page, rect)
                    if kind is None:
                        continue
                    bucket = "tables_detected" if kind == "table" else "figures_detected"
                    result[bucket][index + 1].append({
                        "rect": [rect.x0, rect.y0, rect.x1, rect.y1],
                        "caption": caption,
                        "size": f"{rect.width:.0f}x{rect.height:.0f}"
                    })
            except Exception as e:
                # Keep going, but do not hide the failure from the caller.
                errors.append(f"Page {index+1} figure/table detection failed: {e}")
    finally:
        # Release the file handle even if something above raised.
        doc.close()

    full_text = "".join(page_texts)
    full_text_lower = full_text.lower()

    # ---- Title ----
    first_page_text = page_texts[0] if page_texts else ""
    lines = [l.strip() for l in first_page_text.split('\n') if l.strip()]
    if lines:
        result["title"] = lines[0]

    # ---- Section detection ----
    for index, text in enumerate(page_texts):
        for line in text.split('\n'):
            heading = line.strip()
            # Headings are short; long lines are body text.
            if not heading or len(heading) > 80:
                continue
            label = _match_section_label(heading)
            if label:
                result["sections"].append({
                    "page": index + 1,
                    "heading": heading,
                    "label": label
                })

    # ---- Paper type ----
    # The first third of the text usually excludes the reference list, so
    # signals found there are weighted higher.
    first_third = full_text[:len(full_text) // 3]
    first_third_lower = first_third.lower()

    computational_score = 0
    for kw in COMPUTATIONAL_STRONG:
        if _keyword_in_text(kw, full_text, full_text_lower):
            if _keyword_in_text(kw, first_third, first_third_lower):
                computational_score += 3
            else:
                computational_score += 1

    char_in_text = sum(
        1 for t in CHARACTERIZATION_TERMS
        if _keyword_in_text(t, full_text, full_text_lower)
    )
    char_in_first_third = sum(
        1 for t in CHARACTERIZATION_TERMS
        if _keyword_in_text(t, first_third, first_third_lower)
    )

    has_strong_char = char_in_first_third >= 2
    has_strong_comp = computational_score >= 5

    if has_strong_comp and not has_strong_char:
        if char_in_text >= 2:
            # Characterization terms only appear late (possibly references).
            result["paper_type"] = "hybrid"
            result["paper_type_confidence"] = "low"
            errors.append("Computational signal strong but characterization terms found in text (may be from references). Classified as hybrid with low confidence.")
        else:
            result["paper_type"] = "computational"
            result["paper_type_confidence"] = "high"
    elif has_strong_char and has_strong_comp:
        result["paper_type"] = "hybrid"
        result["paper_type_confidence"] = "medium"
    elif has_strong_char:
        result["paper_type"] = "experimental"
        result["paper_type_confidence"] = "high"
    else:
        result["paper_type"] = "experimental"
        result["paper_type_confidence"] = "low"
        errors.append("Insufficient signals for confident paper type classification.")

    # ---- Chemistry subfields (at least two keyword hits each) ----
    for subfield, keywords in CHEM_KEYWORDS.items():
        match_count = sum(
            1 for kw in keywords
            if _keyword_in_text(kw, full_text, full_text_lower)
        )
        if match_count >= 2:
            result["subfields"].append(subfield)

    # ---- Characteristic methods ----
    result["chemical_methods"] = _extract_methods(full_text)

    # ---- Supporting information ----
    if 'supporting information' in full_text_lower or 'supplementary' in full_text_lower:
        result["has_supporting_info"] = True

    # ---- Output ----
    if verbose:
        _safe_print(f"PDF Analysis: {result['source']}")
        _safe_print(f"  Pages: {result['total_pages']}")
        _safe_print(f"  Paper Type: {result['paper_type']} (confidence: {result['paper_type_confidence']})")
        _safe_print(f"  Subfields: {', '.join(result['subfields']) or '未识别'}")
        _safe_print(f"  Sections: {len(result['sections'])} detected")
        _safe_print(f"  Figures: {sum(len(v) for v in result['figures_detected'].values())} detected")
        _safe_print(f"  Methods: {', '.join(result['chemical_methods'][:8])}")
        if errors:
            _safe_print(f"  [!] {len(errors)} issue(s):")
            for e in errors:
                _safe_print(f"    - {e}")

    if output_json:
        # Write plain dicts instead of defaultdicts to the JSON report.
        serializable = dict(result)
        serializable["figures_detected"] = dict(result["figures_detected"])
        serializable["tables_detected"] = dict(result["tables_detected"])
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(serializable, f, indent=2, ensure_ascii=False)
        _safe_print(f"  JSON report: {output_json}")

    return result
315
+
316
+
317
def main():
    """Command-line entry point.

    Usage: ``python analyze_paper.py <pdf_file> [--json output.json]``

    The first argument that is not part of a ``--json <path>`` pair is taken
    as the PDF path. A ``--json`` flag without a following path is reported
    as an error instead of being silently ignored. A missing PDF is reported
    as a one-line error and the process exits with status 1.
    """
    pdf_path = None
    output_json = None

    arg_iter = iter(sys.argv[1:])
    for arg in arg_iter:
        if arg == "--json":
            output_json = next(arg_iter, None)
            if output_json is None:
                print("Error: --json requires an output file path")
                return
        elif pdf_path is None:
            pdf_path = arg

    if pdf_path is None:
        print("Usage: python analyze_paper.py <pdf_file> [--json output.json]")
        print("Example: python analyze_paper.py paper.pdf --json report.json")
        return

    try:
        analyze_pdf(pdf_path, verbose=True, output_json=output_json)
    except FileNotFoundError as exc:
        print(f"Error: {exc}")
        sys.exit(1)
331
+
332
+
333
+ if __name__ == "__main__":
334
+ main()
@@ -0,0 +1,67 @@
1
+ """
2
+ PDF转图片脚本 - 将PDF每一页转换为高清图片
3
+ PDF to Images Converter - Converts PDF pages to high-resolution images
4
+ """
5
+ from pdf2image import convert_from_path
6
+ import os
7
+ import sys
8
+
9
# Common spellings that are not registered Pillow format names themselves.
_PIL_FORMAT_ALIASES = {"JPG": "JPEG", "TIF": "TIFF"}


def _resolve_image_format(fmt):
    """Return ``(save_format, file_extension)`` for a user-supplied format.

    The extension keeps the user's spelling in lowercase (``jpg`` stays
    ``jpg``); the save format is the upper-cased name with known aliases
    mapped to the name the image library expects.
    """
    name = fmt.upper()
    return _PIL_FORMAT_ALIASES.get(name, name), fmt.lower()


def pdf_to_images(pdf_path, output_dir="pdf_images", dpi=200, fmt="PNG"):
    """Convert every page of a PDF into an image file.

    Pages are written to ``output_dir`` as ``page_<n>.<ext>`` with 1-based
    page numbers. Progress and errors are printed to stdout.

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Output directory; created if it does not exist.
        dpi: Rendering resolution (higher is sharper but larger).
        fmt: Output format, e.g. ``PNG``, ``JPEG`` or ``jpg``.

    Returns:
        int: Number of pages saved; 0 if the PDF does not exist or the
        conversion failed.
    """
    if not os.path.exists(pdf_path):
        print(f"Error: File not found: {pdf_path}")
        return 0

    # "jpg" is accepted on the command line but must be saved as "JPEG".
    save_format, extension = _resolve_image_format(fmt)

    os.makedirs(output_dir, exist_ok=True)

    print(f"Converting PDF: {pdf_path}")
    print(f"Output directory: {output_dir}")
    print(f"DPI: {dpi}, Format: {fmt}")
    print("-" * 40)

    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        print(f"Successfully converted {len(images)} pages")

        for page_num, image in enumerate(images, start=1):
            file_name = f"page_{page_num}.{extension}"
            image.save(os.path.join(output_dir, file_name), save_format)
            print(f"  Saved: {file_name}")

        print("-" * 40)
        print(f"Conversion complete! {len(images)} pages saved to: {output_dir}")
        return len(images)

    except Exception as e:
        # Best-effort CLI helper: report the failure and signal it with 0.
        print(f"Error: {e}")
        return 0
49
+
50
def main():
    """Command-line entry point.

    Usage: ``python convert_to_images.py <pdf_file> [output_dir] [dpi] [format]``

    Defaults: output directory ``pdf_images``, 200 dpi, ``PNG``. An invalid
    dpi value is reported as an error instead of raising ``ValueError``.
    """
    if len(sys.argv) < 2:
        print("Usage: python convert_to_images.py <pdf_file> [output_dir] [dpi] [format]")
        print("Examples:")
        print("  python convert_to_images.py paper.pdf")
        print("  python convert_to_images.py paper.pdf images 300")
        print("  python convert_to_images.py paper.pdf pages 150 jpg")
        return

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "pdf_images"
    fmt = sys.argv[4].upper() if len(sys.argv) > 4 else "PNG"

    dpi = 200
    if len(sys.argv) > 3:
        try:
            dpi = int(sys.argv[3])
        except ValueError:
            print(f"Error: dpi must be an integer, got {sys.argv[3]!r}")
            return
        if dpi <= 0:
            print(f"Error: dpi must be positive, got {dpi}")
            return

    pdf_to_images(pdf_path, output_dir, dpi, fmt)
65
+
66
+ if __name__ == "__main__":
67
+ main()