chem-pdf2ppt 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +235 -0
- package/README_EN.md +239 -0
- package/SKILL.md +469 -0
- package/SKILL_EN.md +473 -0
- package/assets/academic_template.html +197 -0
- package/cli.js +57 -0
- package/examples/example_usage.py +407 -0
- package/index.js +109 -0
- package/package.json +50 -0
- package/references/chemistry_templates.md +228 -0
- package/references/chemistry_templates_en.md +228 -0
- package/references/visual_style.md +172 -0
- package/references/visual_style_en.md +172 -0
- package/requirements.txt +20 -0
- package/scripts/analyze_paper.py +334 -0
- package/scripts/convert_to_images.py +67 -0
- package/scripts/create_ppt.py +712 -0
- package/scripts/extract_charts.py +425 -0
- package/scripts/generate_html.py +288 -0
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
# Chemistry Academic PPT Visual Standards
|
|
2
|
+
|
|
3
|
+
This document defines the visual design standards for chemistry academic PPTs, targeting **professional, publication-quality** presentations.
|
|
4
|
+
|
|
5
|
+
## Color Schemes
|
|
6
|
+
|
|
7
|
+
### Preset Themes
|
|
8
|
+
|
|
9
|
+
#### 1. Academic Classic (academic) — Default, General Chemistry
|
|
10
|
+
|
|
11
|
+
| Role | Hex | Usage |
|
|
12
|
+
|------|-----|-------|
|
|
13
|
+
| Primary | `#003366` | Titles, section slide backgrounds |
|
|
14
|
+
| Background | `#FFFFFF` | Content slide backgrounds |
|
|
15
|
+
| Body text | `#333333` | Main text |
|
|
16
|
+
| Accent | `#B41E1E` | Decorative lines, key data highlights |
|
|
17
|
+
| Light bg | `#F0F4F8` | Summary slide backgrounds, table zebra stripes |
|
|
18
|
+
| Muted | `#8C8C8C` | Page numbers, figure captions, source labels |
|
|
19
|
+
|
|
20
|
+
#### 2. Molecular Tech (molecular) — Computational Chemistry / Materials
|
|
21
|
+
|
|
22
|
+
| Role | Hex | Usage |
|
|
23
|
+
|------|-----|-------|
|
|
24
|
+
| Primary | `#1A5276` | Titles, section slide backgrounds |
|
|
25
|
+
| Background | `#F8F9FA` | Content slide backgrounds |
|
|
26
|
+
| Body text | `#2C3E50` | Main text |
|
|
27
|
+
| Accent | `#E74C3C` | Decorative lines, key data highlights |
|
|
28
|
+
| Light bg | `#EBF0F5` | Summary slide backgrounds, table zebra stripes |
|
|
29
|
+
|
|
30
|
+
#### 3. Green Chemistry (green) — Catalysis / Energy / Environment
|
|
31
|
+
|
|
32
|
+
| Role | Hex | Usage |
|
|
33
|
+
|------|-----|-------|
|
|
34
|
+
| Primary | `#1E5631` | Titles, section slide backgrounds |
|
|
35
|
+
| Background | `#F7F9F4` | Content slide backgrounds |
|
|
36
|
+
| Body text | `#333333` | Main text |
|
|
37
|
+
| Accent | `#D4A017` | Decorative lines, key data highlights |
|
|
38
|
+
| Light bg | `#EEF3E9` | Summary slide backgrounds, table zebra stripes |
|
|
39
|
+
|
|
40
|
+
#### 4. Nature Style (nature) — CNS (Cell / Nature / Science) Journal Presentations
|
|
41
|
+
|
|
42
|
+
| Role | Hex | Usage |
|
|
43
|
+
|------|-----|-------|
|
|
44
|
+
| Primary | `#222222` | Titles, section slide backgrounds |
|
|
45
|
+
| Background | `#FFFFFF` | Content slide backgrounds |
|
|
46
|
+
| Body text | `#444444` | Main text |
|
|
47
|
+
| Accent | `#0066CC` | Decorative lines, key data highlights |
|
|
48
|
+
| Light bg | `#F8F8F8` | Summary slide backgrounds, table zebra stripes |
|
|
49
|
+
|
|
50
|
+
### Color Principles
|
|
51
|
+
|
|
52
|
+
- One theme per deck — do not mix themes
|
|
53
|
+
- Accent color only for decorative lines, key numbers, emphasis marks — ≤ 5% of slide area
|
|
54
|
+
- Dark section slides must use white or near-white text for sufficient contrast
|
|
55
|
+
- No fluorescent colors, rainbow gradients, or decorative shadows
|
|
56
|
+
|
|
57
|
+
## Typography
|
|
58
|
+
|
|
59
|
+
### Recommended Fonts
|
|
60
|
+
|
|
61
|
+
| Usage | Chinese | English / Numbers | Size |
|
|
62
|
+
|------|---------|-------------------|------|
|
|
63
|
+
| Cover title | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold / Helvetica Bold | 36–42pt |
|
|
64
|
+
| Section divider | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold | 36–40pt |
|
|
65
|
+
| Slide title | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold | 28–34pt |
|
|
66
|
+
| Body bullets | Microsoft YaHei Regular / Source Han Sans Regular | Arial Regular | 16–20pt |
|
|
67
|
+
| Figure captions/sources | Microsoft YaHei Regular / Source Han Sans Regular | Arial Regular | 9–11pt |
|
|
68
|
+
| Table content | Microsoft YaHei Regular / Source Han Sans Regular | Arial Regular | 12–14pt |
|
|
69
|
+
| Data highlights | Microsoft YaHei Bold / Source Han Sans Bold | Arial Bold | 24–28pt |
|
|
70
|
+
|
|
71
|
+
### Typography Principles
|
|
72
|
+
|
|
73
|
+
- Title at least 10pt larger than body — clear hierarchy
|
|
74
|
+
- Use true subscripts and superscripts in chemical formulas (H₂O, SO₄²⁻) — plain text is acceptable only if styling is unavailable
|
|
75
|
+
- Variables in italics (*E*a, *k*cat) — not mandatory
|
|
76
|
+
- No more than 3 font sizes per slide
|
|
77
|
+
|
|
78
|
+
## Layout Standards
|
|
79
|
+
|
|
80
|
+
### Canvas
|
|
81
|
+
|
|
82
|
+
- 16:9 widescreen (13.333" × 7.5")
|
|
83
|
+
- Horizontal padding ≥ 0.5" (0.7" recommended)
|
|
84
|
+
- Vertical padding ≥ 0.3"
|
|
85
|
+
|
|
86
|
+
### Title Position
|
|
87
|
+
|
|
88
|
+
- Content slide titles fixed at top-left (0.7", 0.3"), left-aligned
|
|
89
|
+
- Thin decorative line below title (1.5pt, accent color) at 0.1–0.15"
|
|
90
|
+
- Section divider titles may be centered or left-aligned (recommended: left-aligned + vertical accent bar)
|
|
91
|
+
|
|
92
|
+
### Figure Layouts (4 modes)
|
|
93
|
+
|
|
94
|
+
1. **figure_right** (default): best for landscape figures with detailed captions
|
|
95
|
+
- Left text: 0.7"–6.5" (5.8" wide)
|
|
96
|
+
- Right figure: 7.2"–12.7" (5.5" wide)
|
|
97
|
+
|
|
98
|
+
2. **figure_top**: best for wide figures with short descriptions
|
|
99
|
+
- Figure area: 0.7"–12.6" (3.0–3.5" tall)
|
|
100
|
+
- Text below in remaining space
|
|
101
|
+
|
|
102
|
+
3. **figure_left**: best for portrait figures or figure-emphasis slides
|
|
103
|
+
- Left figure: 5.5" wide
|
|
104
|
+
- Right text: 6.2" wide
|
|
105
|
+
|
|
106
|
+
4. **figure_full**: best for complex mechanism diagrams, multi-panel figures
|
|
107
|
+
- Figure: 0.5"–12.8" (full width)
|
|
108
|
+
- Brief caption at bottom
|
|
109
|
+
|
|
110
|
+
**Selection principle**: let the figure dictate the layout. Wide figures → `figure_top`, tall figures → `figure_left/right`, complex figures → `figure_full`.
|
|
111
|
+
|
|
112
|
+
## Chemistry-Specific Visual Elements
|
|
113
|
+
|
|
114
|
+
### Reaction Schemes
|
|
115
|
+
|
|
116
|
+
- Use → or a dedicated reaction arrow to connect reactants and products
|
|
117
|
+
- Reaction conditions above or below arrow (small font)
|
|
118
|
+
- Example: `A + B ──→ C (yield: 85%)`
|
|
119
|
+
- Use monospace or Arial font
|
|
120
|
+
|
|
121
|
+
### Data Highlights
|
|
122
|
+
|
|
123
|
+
- Key numbers (FE%, TOF, yield, selectivity) may be enlarged and boldfaced
|
|
124
|
+
- Use accent color for breakthrough data
|
|
125
|
+
- Do not highlight more than 3 numbers simultaneously
|
|
126
|
+
|
|
127
|
+
### Energy / Free Energy Diagrams
|
|
128
|
+
|
|
129
|
+
- Preserve original figure; annotate key barrier values on the slide
|
|
130
|
+
- If redrawing: X-axis = reaction coordinate, Y-axis = energy (eV or kcal/mol)
|
|
131
|
+
- Label the rate-determining step barrier
|
|
132
|
+
|
|
133
|
+
### Characterization Data Interpretation Template
|
|
134
|
+
|
|
135
|
+
```
|
|
136
|
+
XRD: → Phase confirmed as [...], crystallite size [...] nm (Scherrer)
|
|
137
|
+
TEM: → Morphology: [...], particle size distribution [...] nm
|
|
138
|
+
HRTEM: → Lattice fringe [...] nm, corresponding to [...] plane
|
|
139
|
+
XPS: → [...] element in [...] valence state, binding energy [...] eV
|
|
140
|
+
BET: → Surface area [...] m²/g, pore size [...] nm
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
### Performance Data Table Template
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
| Catalyst | FE(C₂₊)% | j (mA/cm²) | Stability/h | Electrolyte | Ref |
|
|
147
|
+
|------------|----------|------------|-------------|-------------|----------|
|
|
148
|
+
| Ru₁/Cu | 82% | 300 | 100 | 1M KOH | This work |
|
|
149
|
+
| Cu NPs | 45% | 150 | 20 | 1M KOH | [1] |
|
|
150
|
+
| Benchmark | 60% | 200 | 50 | 1M KHCO₃ | [2] |
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Readability Checklist
|
|
154
|
+
|
|
155
|
+
- [ ] Body text ≥ 16pt
|
|
156
|
+
- [ ] Figure text legible at presentation scale
|
|
157
|
+
- [ ] Sufficient color contrast (dark text on light background, or vice versa)
|
|
158
|
+
- [ ] No text overflow or overlapping
|
|
159
|
+
- [ ] Figures not cropped or distorted
|
|
160
|
+
- [ ] Page numbers consecutive
|
|
161
|
+
- [ ] One core message per slide
|
|
162
|
+
|
|
163
|
+
## Don'ts
|
|
164
|
+
|
|
165
|
+
- No decorative images, clip art, or emoji
|
|
166
|
+
- No gradient backgrounds
|
|
167
|
+
- No dark backgrounds on content slides
|
|
168
|
+
- No more than 6 bullet points per slide
|
|
169
|
+
- No copy-pasting paper paragraphs verbatim
|
|
170
|
+
- No figures too small to read
|
|
171
|
+
- No cramming multiple dense figures onto one slide
|
|
172
|
+
- No accent bar below titles on regular content slides (section slides only)
|
package/requirements.txt
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# PDF2PPT — Chemistry Academic PPT Generator
|
|
2
|
+
# 化学学术论文 → PowerPoint 转换器
|
|
3
|
+
|
|
4
|
+
# PDF reading and text extraction
|
|
5
|
+
pymupdf>=1.23.0
|
|
6
|
+
pdfplumber>=0.10.0
|
|
7
|
+
|
|
8
|
+
# PPTX creation
|
|
9
|
+
python-pptx>=0.6.23
|
|
10
|
+
|
|
11
|
+
# Image processing
|
|
12
|
+
Pillow>=10.0.0
|
|
13
|
+
|
|
14
|
+
# PDF page to image conversion (optional, requires Poppler)
|
|
15
|
+
pdf2image>=1.17.0
|
|
16
|
+
|
|
17
|
+
# Note: poppler-utils is a system dependency for pdf2image
|
|
18
|
+
# macOS: brew install poppler
|
|
19
|
+
# Linux: sudo apt-get install poppler-utils
|
|
20
|
+
# Windows: https://github.com/oschwartz10612/poppler-windows/releases/
|
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
"""
|
|
3
|
+
PDF 论文分析脚本 — 化学论文类型识别、章节结构、图表位置、关键信息提取
|
|
4
|
+
Chemistry Paper Analyzer with JSON output and robust encoding handling
|
|
5
|
+
"""
|
|
6
|
+
import fitz
|
|
7
|
+
import sys
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import json
|
|
11
|
+
from collections import defaultdict
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _safe_print(msg):
    """Print ``msg`` without crashing on consoles that cannot encode it.

    The message is first printed as-is. If the output stream raises
    ``UnicodeEncodeError`` (typical for legacy Windows code pages), the text
    is re-encoded with the stream's own encoding using ``errors='replace'``,
    so only the characters that really cannot be shown become ``?``. When the
    stream does not expose a usable encoding, plain ASCII is used instead.

    Args:
        msg: Object to print; it is converted with ``str()`` first so that
            non-string values also go through the safe fallback.
    """
    text = str(msg)
    try:
        print(text)
    except UnicodeEncodeError:
        encoding = getattr(sys.stdout, "encoding", None) or "ascii"
        try:
            fallback = text.encode(encoding, errors="replace").decode(encoding)
        except (LookupError, UnicodeError):
            # Unknown or unusable stream encoding: degrade to pure ASCII.
            fallback = text.encode("ascii", errors="replace").decode("ascii")
        print(fallback)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# ============================================================
|
|
23
|
+
# 化学关键词库 / Chemistry Keyword Database
|
|
24
|
+
# ============================================================
|
|
25
|
+
|
|
26
|
+
CHEM_KEYWORDS = {
    "催化": ["cataly", "TOF", "turnover", "selectivity", "conversion",
           "catalyst", "active site", "Sabatier", "d-band", "overpotential"],
    "材料": ["MOF", "COF", "perovskite", "zeolite", "framework",
           "polymer", "nanosheet", "nanoparticle", "quantum dot"],
    "有机合成": ["synthesis", "yield", "substrate scope", "coupling",
             "cross-coupling", "organocatal", "asymmetric", "functional group"],
    "计算化学": [
        "DFT", "density functional", "VASP", "Gaussian", "CP2K",
        "Quantum ESPRESSO", "molecular dynamics", "AIMD", "ab initio",
        "MD simulation", "free energy", "transition state",
        "k-point", "pseudopotential", "PAW", "basis set",
        "B3LYP", "PBE", "RPBE", "SCAN", "HSE", "GGA", "PBE0",
        "Hartree-Fock", "coupled cluster", "CCSD", "MP2",
        "Monte Carlo", "metadynamics", "enhanced sampling",
        "reaction coordinate", "diabatic", "adiabatic",
        "reorganization energy", "Marcus theory"
    ],
    "电化学": ["electrochem", "ORR", "OER", "HER", "CO2RR", "NRR",
            "Li-ion", "battery", "supercapacitor", "electrolyte",
            "faradaic efficiency", "Tafel", "RHE", "SHE"],
    "光谱/表征": ["XRD", "XPS", "TEM", "SEM", "STEM", "HAADF",
              "NMR", "IR", "Raman", "EXAFS", "XANES", "BET",
              "EPR", "UV-vis", "AFM", "FTIR", "spectroscopy"],
    "环境/大气": ["atmospheric", "aerosol", "SOA", "PM2.5", "oxidation",
              "OH radical", "ozone", "photochem", "tropospheric"],
    "能源": ["solar cell", "perovskite solar", "water splitting",
           "hydrogen evolution", "photocatal", "fuel cell", "battery"],
    "辐射化学": ["radiolysis", "pulse radiolysis", "hydrated electron",
             "solvated electron", "transient absorption"],
}

# Characterization keywords (meant to count as an experimental signal only
# when they appear in the methods/experimental part of the paper).
CHARACTERIZATION_TERMS = [
    "XRD", "XPS", "TEM", "SEM", "STEM", "HAADF", "NMR", "IR",
    "Raman", "EXAFS", "XANES", "BET", "EPR", "UV-vis", "AFM", "FTIR"
]

# Computational keywords (strong signal).
COMPUTATIONAL_STRONG = [
    "DFT", "density functional theory", "VASP", "Gaussian", "CP2K",
    "Quantum ESPRESSO", "ab initio molecular dynamics", "AIMD",
    "PBE0", "B3LYP", "PBE", "Hartree-Fock", "CCSD", "MP2",
    "transition state", "reaction coordinate", "diabatic",
    "pseudopotential", "basis set", "k-point", "cutoff",
    "molecular dynamics simulation", "trajector"
]

# Section-heading patterns, tried in order; the first match labels the line.
# "SI" is matched case-sensitively as a whole word so that ordinary lines
# starting with "Since", "Similar", "Size" ... are not reported as headings.
_SECTION_PATTERNS = [
    (re.compile(r'^\s*(?:Abstract|摘要)\s*$', re.I), "Abstract"),
    (re.compile(r'^\s*(?:Introduction|引言|绪论)\s*$', re.I), "Introduction"),
    (re.compile(r'^\s*(?:Method|Experimental|Computational|计算方法?|实验方法?|Theory)\s*', re.I), "Methods"),
    (re.compile(r'^\s*(?:Result|Discussion|结果|讨论)\s*', re.I), "Results"),
    (re.compile(r'^\s*(?:Conclusions?|结论|Summary|总结)\s*$', re.I), "Conclusions"),
    (re.compile(r'^\s*(?:Supporting Information|(?-i:SI)\b|附录|补充|References?)\s*', re.I), "SI/References"),
]

# A caption line starts with Fig/Figure/Table/Scheme followed by "." or space.
_CAPTION_RE = re.compile(r'(Fig|Figure|Table|Scheme)\s*[\.\s]', re.I)

# (pattern, flags) pairs used to list the methods named in the paper.
# Acronym families are case-sensitive whole tokens so that "scan rate",
# "their" or "system" are not reported as SCAN / IR / TEM.
_METHOD_PATTERNS = [
    (r'(?<![A-Za-z0-9])(CP2K|VASP|Gaussian\s*\d+|Q-Chem\s*\d+|GROMACS|LAMMPS|Quantum\s+ESPRESSO)(?![A-Za-z0-9])', re.I),
    (r'(?<![A-Za-z0-9])(PBE0|B3LYP|PBE|RPBE|SCAN|HSE06|GGA|LDA|meta-GGA)(?![A-Za-z0-9])', 0),
    (r'(impregnation|sol-gel|hydrothermal|solvothermal|co-precipitation|calcination|pyrolysis)', re.I),
    (r'(?<![A-Za-z0-9])(XRD|XPS|TEM|SEM|STEM|HAADF-STEM|NMR|IR|Raman|EXAFS|XANES|BET|EPR)(?![A-Za-z0-9])', 0),
]


def _term_in_text(term, text, text_lower=None):
    """Return True if keyword ``term`` occurs in ``text``.

    All-uppercase keywords (acronyms such as "TEM", "IR", "PBE") must appear
    as a case-sensitive token: not preceded by a letter/digit and, after an
    optional plural "s", not followed by a letter. Every other keyword is a
    word or word stem ("cataly", "Raman") and keeps the case-insensitive
    substring match so that e.g. "electrocatalyst" still counts.

    Args:
        term: Keyword to look for.
        text: Text in its original case.
        text_lower: Optional pre-lowered copy of ``text`` to avoid lowering
            a large document once per keyword.
    """
    if term.upper() == term and any(ch.isalpha() for ch in term):
        pattern = r'(?<![A-Za-z0-9])' + re.escape(term) + r's?(?![A-Za-z])'
        return re.search(pattern, text) is not None
    if text_lower is None:
        text_lower = text.lower()
    return term.lower() in text_lower


def _count_terms(terms, text):
    """Count how many keywords of ``terms`` occur at least once in ``text``."""
    text_lower = text.lower()
    return sum(1 for term in terms if _term_in_text(term, text, text_lower))


def _detect_sections(page_texts):
    """Find section headings in the per-page texts.

    Only non-empty lines of at most 80 characters are considered.

    Returns:
        list of dicts with keys "page" (1-based), "heading" and "label".
    """
    sections = []
    for page_number, text in enumerate(page_texts, start=1):
        for line in text.split('\n'):
            heading = line.strip()
            if not heading or len(heading) > 80:
                continue
            for pattern, label in _SECTION_PATTERNS:
                if pattern.match(heading):
                    sections.append({
                        "page": page_number,
                        "heading": heading,
                        "label": label
                    })
                    break
    return sections


def _cluster_drawings_compat(page, x_tol=3, y_tol=3):
    """Cluster the vector drawings of ``page`` into bounding rectangles.

    Uses ``page.cluster_drawings`` when the installed PyMuPDF provides it and
    falls back to a manual merge of the ``page.get_drawings()`` rectangles
    otherwise. Returns an empty list when nothing can be obtained.
    """
    if hasattr(page, 'cluster_drawings'):
        try:
            return page.cluster_drawings(x_tolerance=x_tol, y_tolerance=y_tol)
        except Exception:
            # Fall through to the manual clustering below.
            pass
    try:
        drawings = page.get_drawings()
    except Exception:
        return []
    if not drawings:
        return []

    # Grow every usable rectangle by the tolerance so near neighbours overlap.
    rects = []
    for drawing in drawings:
        r = drawing.get('rect')
        if r and r.x1 - r.x0 > 0.5 and r.y1 - r.y0 > 0.5:
            rects.append(fitz.Rect(r.x0 - x_tol, r.y0 - y_tol, r.x1 + x_tol, r.y1 + y_tol))
    if not rects:
        return []

    rects.sort(key=lambda r: (r.y0, r.x0))
    clusters = [rects[0]]
    # Single greedy pass: a rectangle joins the first cluster it intersects.
    for r in rects[1:]:
        for i, cluster in enumerate(clusters):
            if r.intersects(cluster):
                clusters[i] = cluster | r
                break
        else:
            clusters.append(r)
    # Undo the tolerance padding and clip to the page.
    return [
        fitz.Rect(c.x0 + x_tol, c.y0 + y_tol, c.x1 - x_tol, c.y1 - y_tol) & page.rect
        for c in clusters
    ]


def _find_caption_below(page, rect):
    """Look for a caption line in the strip (up to 120 units) below ``rect``.

    Returns:
        tuple ``(kind, caption)``: ``kind`` is "table" or "figure" and
        ``caption`` the stripped line cut to 150 characters, or
        ``(None, "")`` when no caption line is found.
    """
    search_area = fitz.Rect(
        rect.x0 - 10, rect.y1,
        rect.x1 + 10, min(rect.y1 + 120, page.rect.y1)
    )
    nearby_text = page.get_text(clip=search_area)
    for line in (nearby_text or "").split('\n'):
        candidate = line.strip()
        match = _CAPTION_RE.match(candidate)
        if match:
            kind = "table" if match.group(1).lower() == "table" else "figure"
            return kind, candidate[:150]
    return None, ""


def _detect_figures_and_tables(doc, errors):
    """Collect captioned drawing clusters of every page.

    Clusters wider than 100 and taller than 80 units that have a caption
    below them are recorded. Failures on a page are appended to ``errors``
    instead of being silently dropped.

    Returns:
        tuple ``(figures, tables)`` of ``defaultdict(list)`` keyed by the
        1-based page number.
    """
    figures = defaultdict(list)
    tables = defaultdict(list)
    for page_num in range(len(doc)):
        try:
            page = doc[page_num]
            for rect in _cluster_drawings_compat(page):
                if rect.width <= 100 or rect.height <= 80:
                    continue
                kind, caption = _find_caption_below(page, rect)
                if not caption:
                    continue
                entry = {
                    "rect": [rect.x0, rect.y0, rect.x1, rect.y1],
                    "caption": caption,
                    "size": f"{rect.width:.0f}x{rect.height:.0f}"
                }
                target = tables if kind == "table" else figures
                target[page_num + 1].append(entry)
        except Exception as e:
            errors.append(f"Page {page_num+1} figure/table detection failed: {e}")
    return figures, tables


def _classify_paper_type(full_text, errors):
    """Classify the paper as computational / experimental / hybrid.

    Signals are searched in the first third of the text (usually free of the
    reference list) and in the whole text. A strong computational keyword
    scores 3 when found in the first third and 1 when found only later.

    Returns:
        tuple ``(paper_type, confidence)``. Explanatory notes for the
        low-confidence outcomes are appended to ``errors``.
    """
    first_third = full_text[:len(full_text) // 3]
    full_lower = full_text.lower()
    first_lower = first_third.lower()

    computational_score = 0
    for kw in COMPUTATIONAL_STRONG:
        if _term_in_text(kw, first_third, first_lower):
            computational_score += 3
        elif _term_in_text(kw, full_text, full_lower):
            computational_score += 1

    char_in_text = _count_terms(CHARACTERIZATION_TERMS, full_text)
    char_in_first_third = _count_terms(CHARACTERIZATION_TERMS, first_third)

    has_strong_char = char_in_first_third >= 2
    has_strong_comp = computational_score >= 5

    if has_strong_comp and has_strong_char:
        return "hybrid", "medium"
    if has_strong_comp and char_in_text >= 2:
        # Must be tested before the plain "computational" case, otherwise
        # this branch can never be reached.
        errors.append("Computational signal strong but characterization terms found in text (may be from references). Classified as hybrid with low confidence.")
        return "hybrid", "low"
    if has_strong_comp:
        return "computational", "high"
    if has_strong_char:
        return "experimental", "high"
    errors.append("Insufficient signals for confident paper type classification.")
    return "experimental", "low"


def _extract_methods(full_text):
    """List the software, functionals, synthesis routes and characterization
    techniques named in ``full_text``, in order of pattern then appearance.

    Entries are whitespace-normalised and de-duplicated case-insensitively;
    the first spelling encountered is kept.
    """
    methods = []
    seen = set()
    for pattern, flags in _METHOD_PATTERNS:
        for match in re.finditer(pattern, full_text, flags):
            method = " ".join(match.group(1).split())
            key = method.lower()
            if method and key not in seen:
                seen.add(key)
                methods.append(method)
    return methods


def analyze_pdf(pdf_path, verbose=True, output_json=None):
    """Analyze the structure and content of a chemistry paper PDF.

    Extracts the text of every page, then derives: a title guess (first
    non-empty line of page 1), section headings, captioned figures/tables,
    the paper type (computational / experimental / hybrid), chemistry
    subfields and the methods mentioned.

    Args:
        pdf_path: Path of the PDF file.
        verbose: When true, print a summary through ``_safe_print``.
        output_json: Optional path; when given, the result is also written
            there as UTF-8 JSON.

    Returns:
        dict with keys "source", "title", "paper_type",
        "paper_type_confidence", "subfields", "sections",
        "figures_detected", "tables_detected" (both ``defaultdict(list)``
        keyed by 1-based page number), "chemical_methods", "total_pages",
        "has_supporting_info" and "errors".

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF not found: {pdf_path}")

    errors = []
    doc = fitz.open(pdf_path)
    try:
        total_pages = len(doc)

        # Extract each page's text once; failed pages contribute "".
        page_texts = []
        for page_num in range(total_pages):
            try:
                page_texts.append(doc[page_num].get_text())
            except Exception as e:
                page_texts.append("")
                errors.append(f"Page {page_num+1} text extraction failed: {e}")

        figures, tables = _detect_figures_and_tables(doc, errors)
    finally:
        # Always release the document, even if detection raised.
        doc.close()

    full_text = "".join(page_texts)
    full_text_lower = full_text.lower()
    first_page_text = page_texts[0] if page_texts else ""

    result = {
        "source": os.path.basename(pdf_path),
        "title": "",
        "paper_type": "experimental",
        "paper_type_confidence": "low",
        "subfields": [],
        "sections": [],
        "figures_detected": figures,
        "tables_detected": tables,
        "chemical_methods": [],
        "total_pages": total_pages,
        "has_supporting_info": False,
        "errors": errors,
    }

    # ---- Title: first non-empty line of the first page ----
    first_lines = [l.strip() for l in first_page_text.split('\n') if l.strip()]
    if first_lines:
        result["title"] = first_lines[0]

    # ---- Sections ----
    result["sections"] = _detect_sections(page_texts)

    # ---- Paper type ----
    paper_type, confidence = _classify_paper_type(full_text, errors)
    result["paper_type"] = paper_type
    result["paper_type_confidence"] = confidence

    # ---- Chemistry subfields: at least two distinct keywords required ----
    for subfield, keywords in CHEM_KEYWORDS.items():
        if _count_terms(keywords, full_text) >= 2:
            result["subfields"].append(subfield)

    # ---- Methods mentioned ----
    result["chemical_methods"] = _extract_methods(full_text)

    # ---- Supporting information ----
    if 'supporting information' in full_text_lower or 'supplementary' in full_text_lower:
        result["has_supporting_info"] = True

    # ---- Output ----
    if verbose:
        _safe_print(f"PDF Analysis: {result['source']}")
        _safe_print(f" Pages: {result['total_pages']}")
        _safe_print(f" Paper Type: {result['paper_type']} (confidence: {result['paper_type_confidence']})")
        _safe_print(f" Subfields: {', '.join(result['subfields']) or '未识别'}")
        _safe_print(f" Sections: {len(result['sections'])} detected")
        _safe_print(f" Figures: {sum(len(v) for v in result['figures_detected'].values())} detected")
        _safe_print(f" Methods: {', '.join(result['chemical_methods'][:8])}")
        if errors:
            _safe_print(f" [!] {len(errors)} issue(s):")
            for e in errors:
                _safe_print(f" - {e}")

    if output_json:
        # Convert the defaultdicts to plain dicts for JSON serialization.
        result_out = dict(result)
        result_out["figures_detected"] = dict(result["figures_detected"])
        result_out["tables_detected"] = dict(result["tables_detected"])
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(result_out, f, indent=2, ensure_ascii=False)
        _safe_print(f" JSON report: {output_json}")

    return result
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
def main():
    """Command-line entry point: ``analyze_paper.py <pdf_file> [--json output.json]``.

    The first argument that is not consumed by ``--json`` is the PDF path.
    ``--json`` may appear before or after it; if repeated, the last value
    wins. Prints the usage text and returns when no PDF path is given.

    Exits with status 2 when ``--json`` has no value and with status 1 when
    the PDF file does not exist.
    """
    usage_lines = (
        "Usage: python analyze_paper.py <pdf_file> [--json output.json]",
        "Example: python analyze_paper.py paper.pdf --json report.json",
    )

    args = sys.argv[1:]
    pdf_path = None
    output_json = None

    i = 0
    while i < len(args):
        arg = args[i]
        if arg == "--json":
            if i + 1 >= len(args):
                # Previously ignored silently, so the report was never written.
                print("Error: --json requires an output file path")
                for line in usage_lines:
                    print(line)
                sys.exit(2)
            output_json = args[i + 1]
            i += 2
            continue
        if pdf_path is None:
            pdf_path = arg
        i += 1

    if pdf_path is None:
        for line in usage_lines:
            print(line)
        return

    try:
        analyze_pdf(pdf_path, verbose=True, output_json=output_json)
    except FileNotFoundError as exc:
        # Report a missing input as a short message instead of a traceback.
        _safe_print(f"Error: {exc}")
        sys.exit(1)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
PDF转图片脚本 - 将PDF每一页转换为高清图片
|
|
3
|
+
PDF to Images Converter - Converts PDF pages to high-resolution images
|
|
4
|
+
"""
|
|
5
|
+
from pdf2image import convert_from_path
|
|
6
|
+
import os
|
|
7
|
+
import sys
|
|
8
|
+
|
|
9
|
+
def _pil_format_name(fmt):
    """Return the format name to pass to ``image.save`` for ``fmt``.

    The name is upper-cased, and "JPG" is mapped to "JPEG" because the image
    library registers the JPEG writer under that name only.
    """
    name = fmt.upper()
    return "JPEG" if name == "JPG" else name


def pdf_to_images(pdf_path, output_dir="pdf_images", dpi=200, fmt="PNG"):
    """Convert every page of a PDF into an image file.

    Pages are written to ``output_dir`` as ``page_<n>.<fmt lower-cased>``.
    Conversion relies on ``pdf2image.convert_from_path`` (which needs the
    Poppler system dependency, per the package's requirements.txt notes).

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Directory for the images; created when missing.
        dpi: Rendering resolution (higher is sharper but larger).
        fmt: Output format, e.g. "PNG", "JPEG" or "JPG" (any case).

    Returns:
        int: Number of pages saved; 0 when the PDF is missing or the
        conversion fails (the error is printed, not raised).
    """
    if not os.path.exists(pdf_path):
        print(f"Error: File not found: {pdf_path}")
        # Return 0 (not None) so every path yields a page count.
        return 0

    # Create the output directory; exist_ok avoids the check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Converting PDF: {pdf_path}")
    print(f"Output directory: {output_dir}")
    print(f"DPI: {dpi}, Format: {fmt}")
    print("-" * 40)

    save_format = _pil_format_name(fmt)
    extension = fmt.lower()

    try:
        images = convert_from_path(pdf_path, dpi=dpi)
        print(f"Successfully converted {len(images)} pages")

        for page_num, image in enumerate(images, start=1):
            file_name = f"page_{page_num}.{extension}"
            image.save(os.path.join(output_dir, file_name), save_format)
            print(f" Saved: {file_name}")

        print("-" * 40)
        print(f"Conversion complete! {len(images)} pages saved to: {output_dir}")
        return len(images)

    except Exception as e:
        # Best-effort tool: report the failure and signal it with 0 pages.
        print(f"Error: {e}")
        return 0
|
|
49
|
+
|
|
50
|
+
def main():
    """Command-line entry point for the PDF-to-images converter.

    Usage: ``convert_to_images.py <pdf_file> [output_dir] [dpi] [format]``.
    Defaults are ``pdf_images``, 200 dpi and PNG. Prints the usage text and
    returns when no PDF path is given; exits with status 1 when ``dpi`` is
    not a positive integer.
    """
    if len(sys.argv) < 2:
        # The usage line lists the optional format argument that the parser
        # below already accepts (and that the last example demonstrates).
        print("Usage: python convert_to_images.py <pdf_file> [output_dir] [dpi] [format]")
        print("Examples:")
        print(" python convert_to_images.py paper.pdf")
        print(" python convert_to_images.py paper.pdf images 300")
        print(" python convert_to_images.py paper.pdf pages 150 jpg")
        return

    pdf_path = sys.argv[1]
    output_dir = sys.argv[2] if len(sys.argv) > 2 else "pdf_images"
    fmt = sys.argv[4].upper() if len(sys.argv) > 4 else "PNG"

    dpi = 200
    if len(sys.argv) > 3:
        try:
            dpi = int(sys.argv[3])
        except ValueError:
            dpi = 0
        if dpi <= 0:
            # Report bad input as a message instead of a ValueError traceback.
            print(f"Error: dpi must be a positive integer, got: {sys.argv[3]}")
            sys.exit(1)

    pdf_to_images(pdf_path, output_dir, dpi, fmt)


if __name__ == "__main__":
    main()
|