pdfget 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pdfget-0.1.0/.github/workflows/ci.yml +59 -0
- pdfget-0.1.0/.github/workflows/publish.yml +141 -0
- pdfget-0.1.0/.gitignore +163 -0
- pdfget-0.1.0/.pre-commit-config.yaml +26 -0
- pdfget-0.1.0/CHANGELOG.md +122 -0
- pdfget-0.1.0/PKG-INFO +200 -0
- pdfget-0.1.0/README.md +182 -0
- pdfget-0.1.0/data/cache/PMC12501010_10.1038s41467-025-64046-1.pdf +22747 -25
- pdfget-0.1.0/data/cache/PMC12542208_10.1186s12870-025-07447-0.pdf +19027 -61
- pdfget-0.1.0/data/cache/PMC12595684_10.1186s12864-025-12188-3.pdf +9087 -35
- pdfget-0.1.0/data/cache/PMC12603064_10.1038s41467-025-64846-5.pdf +12843 -10
- pdfget-0.1.0/data/cache/PMC12613758_10.1186s12870-025-07717-x.pdf +4570 -15
- pdfget-0.1.0/data/cache/PMC12625569_10.1186s12870-025-07737-7.pdf +3822 -16
- pdfget-0.1.0/data/cache/PMC12638397_10.1007s00423-025-03883-6.pdf +29451 -97
- pdfget-0.1.0/data/cache/PMC12642243_10.1186s12870-025-07791-1.pdf +3601 -13
- pdfget-0.1.0/data/cache/PMC12645687_10.1186s12866-025-04485-4.pdf +21230 -64
- pdfget-0.1.0/data/cache/PMC12652641_10.3390ijms262210839.pdf +0 -0
- pdfget-0.1.0/data/cache/PMC12654256_10.3390microorganisms13112456.pdf +0 -0
- pdfget-0.1.0/data/cache/PMC12658294_10.1017qpb.2025.10024.pdf +0 -0
- pdfget-0.1.0/data/cache/PMC12661703_10.1186s12870-025-07807-w.pdf +3777 -15
- pdfget-0.1.0/data/cache/PMC7809815_10.1186s12870-020-02802-9.pdf +2781 -6
- pdfget-0.1.0/data/example_dois.csv +4 -0
- pdfget-0.1.0/data/example_dois.txt +3 -0
- pdfget-0.1.0/data/pdfs/download_results.json +63 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097051.json +0 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097059.json +0 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097078.json +0 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097089.json +66 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097095.json +39 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097099.json +39 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097230.json +48 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097236.json +21 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097241.json +27 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097294.json +45 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097505.json +257 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097514.json +257 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097746.json +95 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097756.json +161 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097906.json +357 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097955.json +167 -0
- pdfget-0.1.0/data/pdfs/search_results_1765097995.json +183 -0
- pdfget-0.1.0/data/pdfs/search_results_1765098011.json +247 -0
- pdfget-0.1.0/data/pdfs/search_results_1765098173.json +206 -0
- pdfget-0.1.0/data/pdfs/search_results_1765098189.json +143 -0
- pdfget-0.1.0/data/pdfs/search_results_1765112541.json +2090 -0
- pdfget-0.1.0/data/pdfs/search_results_1765112944.json +1037 -0
- pdfget-0.1.0/data/pdfs/search_results_1765114438.json +17 -0
- pdfget-0.1.0/pyproject.toml +90 -0
- pdfget-0.1.0/pytest.ini +21 -0
- pdfget-0.1.0/src/pdfget/__init__.py +12 -0
- pdfget-0.1.0/src/pdfget/__main__.py +6 -0
- pdfget-0.1.0/src/pdfget/config.py +30 -0
- pdfget-0.1.0/src/pdfget/downloader.py +308 -0
- pdfget-0.1.0/src/pdfget/fetcher.py +415 -0
- pdfget-0.1.0/src/pdfget/main.py +282 -0
- pdfget-0.1.0/tests/__init__.py +3 -0
- pdfget-0.1.0/tests/conftest.py +140 -0
- pdfget-0.1.0/tests/test_basic_functionality.py +100 -0
- pdfget-0.1.0/tests/test_config.py +86 -0
- pdfget-0.1.0/tests/test_fetcher_basic.py +175 -0
- pdfget-0.1.0/uv.lock +1142 -0
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main, develop ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main, develop ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- name: Checkout code
|
|
18
|
+
uses: actions/checkout@v4
|
|
19
|
+
|
|
20
|
+
- name: Set up Python ${{ matrix.python-version }}
|
|
21
|
+
uses: actions/setup-python@v5  # same major version as the publish workflow
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
|
|
25
|
+
- name: Install uv
|
|
26
|
+
uses: astral-sh/setup-uv@v7.1.4  # same release as the publish workflow
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: |
|
|
30
|
+
uv sync --dev
|
|
31
|
+
|
|
32
|
+
- name: Run code quality checks
|
|
33
|
+
run: |
|
|
34
|
+
uv run ruff check .
|
|
35
|
+
uv run ruff format --check .
|
|
36
|
+
|
|
37
|
+
- name: Run type checking
|
|
38
|
+
run: |
|
|
39
|
+
uv run mypy src/
|
|
40
|
+
|
|
41
|
+
- name: Run tests
|
|
42
|
+
run: |
|
|
43
|
+
uv run pytest tests/ -v --cov=pdfget --cov-report=xml
|
|
44
|
+
|
|
45
|
+
- name: Upload coverage to Codecov
|
|
46
|
+
uses: codecov/codecov-action@v4
|
|
47
|
+
with:
|
|
48
|
+
file: ./coverage.xml
|
|
49
|
+
flags: unittests
|
|
50
|
+
name: codecov-umbrella
|
|
51
|
+
fail_ci_if_error: false
|
|
52
|
+
|
|
53
|
+
- name: Build package
|
|
54
|
+
run: |
|
|
55
|
+
uv build
|
|
56
|
+
|
|
57
|
+
- name: Check package
|
|
58
|
+
run: |
|
|
59
|
+
uv run twine check dist/*
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
name: Release & Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*' # 触发条件:推送以 v 开头的标签
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: write # 允许创建release
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
release-and-publish:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout code
|
|
17
|
+
uses: actions/checkout@v6.0.1
|
|
18
|
+
|
|
19
|
+
- name: Set up Python 3.12
|
|
20
|
+
uses: actions/setup-python@v5
|
|
21
|
+
with:
|
|
22
|
+
python-version: "3.12"
|
|
23
|
+
|
|
24
|
+
- name: Install uv
|
|
25
|
+
uses: astral-sh/setup-uv@v7.1.4
|
|
26
|
+
|
|
27
|
+
- name: Install dependencies
|
|
28
|
+
run: |
|
|
29
|
+
uv sync --dev
|
|
30
|
+
|
|
31
|
+
- name: Run tests
|
|
32
|
+
run: |
|
|
33
|
+
uv run pytest tests/ -v --cov=src --cov-report=xml
|
|
34
|
+
|
|
35
|
+
- name: Run code quality checks
|
|
36
|
+
run: |
|
|
37
|
+
uv run ruff check .
|
|
38
|
+
uv run ruff format --check .
|
|
39
|
+
uv run mypy src/
|
|
40
|
+
|
|
41
|
+
- name: Build package
|
|
42
|
+
run: |
|
|
43
|
+
uv build
|
|
44
|
+
|
|
45
|
+
- name: Check package
|
|
46
|
+
run: |
|
|
47
|
+
uv run twine check dist/*
|
|
48
|
+
|
|
49
|
+
- name: Generate release notes
|
|
50
|
+
id: release_notes
|
|
51
|
+
run: |
|
|
52
|
+
# 从CHANGELOG.md提取最新版本信息
|
|
53
|
+
python >> "$GITHUB_OUTPUT" << 'EOF'
|
|
54
|
+
import re
|
|
55
|
+
|
|
56
|
+
with open('CHANGELOG.md', 'r', encoding='utf-8') as f:
|
|
57
|
+
content = f.read()
|
|
58
|
+
|
|
59
|
+
# Find the newest "## [x.y.z] - YYYY-MM-DD" heading in the changelog
|
|
60
|
+
version_match = re.search(r'^## \[([0-9.]+)\] - ([0-9-]+)', content, re.MULTILINE)
|
|
61
|
+
if version_match:
|
|
62
|
+
version = version_match.group(1)
|
|
63
|
+
date = version_match.group(2)
|
|
64
|
+
|
|
65
|
+
# Slice out that version's section, up to the next "## [" heading if there is one
|
|
66
|
+
version_start = version_match.start()
|
|
67
|
+
next_version_match = re.search(r'\n## \[', content[version_start + 1:])
|
|
68
|
+
if next_version_match:
|
|
69
|
+
version_content = content[version_start:version_start + 1 + next_version_match.start()]  # +1: the search began at version_start + 1
|
|
70
|
+
else:
|
|
71
|
+
version_content = content[version_start:]
|
|
72
|
+
|
|
73
|
+
# Collect the "- **Name**: description" feature bullets
|
|
74
|
+
features = []
|
|
75
|
+
lines = version_content.split('\n')
|
|
76
|
+
for line in lines:
|
|
77
|
+
if '新增功能' in line:
|
|
78
|
+
continue
|
|
79
|
+
if line.startswith('- **') and (':' in line or ':' in line):
|
|
80
|
+
features.append(line.strip())
|
|
81
|
+
|
|
82
|
+
# Emit step outputs: stdout is appended to $GITHUB_OUTPUT by the redirect on the python command
|
|
83
|
+
print(f"version={version}")
|
|
84
|
+
print(f"date={date}")
|
|
85
|
+
print("features<<PDFGET_FEATURES\n" + "\n".join(features[:8]) + "\nPDFGET_FEATURES")  # multi-line output, capped at 8 bullets
|
|
86
|
+
else:
|
|
87
|
+
print("version=unknown")
|
|
88
|
+
print("features=No release notes found")
|
|
89
|
+
EOF
|
|
90
|
+
|
|
91
|
+
- name: Create GitHub Release
|
|
92
|
+
uses: softprops/action-gh-release@v2.5.0
|
|
93
|
+
with:
|
|
94
|
+
tag_name: ${{ github.ref_name }}
|
|
95
|
+
name: Release ${{ steps.release_notes.outputs.version }}
|
|
96
|
+
body: |
|
|
97
|
+
🎉 PDFGet ${{ steps.release_notes.outputs.version }} 发布!
|
|
98
|
+
|
|
99
|
+
## 📦 安装方法
|
|
100
|
+
```bash
|
|
101
|
+
pip install pdfget==${{ steps.release_notes.outputs.version }}
|
|
102
|
+
# 或使用uv
|
|
103
|
+
uv add pdfget==${{ steps.release_notes.outputs.version }}
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
## 🚀 主要特性
|
|
107
|
+
${{ steps.release_notes.outputs.features }}
|
|
108
|
+
|
|
109
|
+
## 📖 使用示例
|
|
110
|
+
```bash
|
|
111
|
+
# 搜索并下载文献
|
|
112
|
+
pdfget -s "machine learning" -l 20 -d -t 5
|
|
113
|
+
|
|
114
|
+
# 高级检索
|
|
115
|
+
pdfget -s "cancer AND immunotherapy NOT review" -l 30
|
|
116
|
+
|
|
117
|
+
# 批量下载
|
|
118
|
+
pdfget -i dois.csv -d -t 3
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
## 📋 完整更新日志
|
|
122
|
+
请查看 [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md)
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
🤖 自动发布于 ${{ steps.release_notes.outputs.date }}
|
|
127
|
+
draft: false
|
|
128
|
+
prerelease: false
|
|
129
|
+
|
|
130
|
+
- name: Publish to PyPI
|
|
131
|
+
env:
|
|
132
|
+
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
|
133
|
+
run: |
|
|
134
|
+
uv publish --token "$PYPI_API_TOKEN"
|
|
135
|
+
|
|
136
|
+
- name: Release Confirmation
|
|
137
|
+
if: success()
|
|
138
|
+
run: |
|
|
139
|
+
echo "✅ Release ${{ steps.release_notes.outputs.version }} published successfully!"
|
|
140
|
+
echo "📦 PyPI: https://pypi.org/project/pdfget/${{ steps.release_notes.outputs.version }}/"
|
|
141
|
+
echo "🔗 GitHub: https://github.com/${{ github.repository }}/releases/tag/${{ github.ref_name }}"
|
pdfget-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
pip-wheel-metadata/
|
|
24
|
+
share/python-wheels/
|
|
25
|
+
*.egg-info/
|
|
26
|
+
.installed.cfg
|
|
27
|
+
*.egg
|
|
28
|
+
MANIFEST
|
|
29
|
+
|
|
30
|
+
# PyInstaller
|
|
31
|
+
# Usually these files are written by a python script from a template
|
|
32
|
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
33
|
+
*.manifest
|
|
34
|
+
*.spec
|
|
35
|
+
|
|
36
|
+
# Installer logs
|
|
37
|
+
pip-log.txt
|
|
38
|
+
pip-delete-this-directory.txt
|
|
39
|
+
|
|
40
|
+
# Unit test / coverage reports
|
|
41
|
+
htmlcov/
|
|
42
|
+
.tox/
|
|
43
|
+
.nox/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.cache
|
|
47
|
+
nosetests.xml
|
|
48
|
+
coverage.xml
|
|
49
|
+
*.cover
|
|
50
|
+
*.py,cover
|
|
51
|
+
.hypothesis/
|
|
52
|
+
.pytest_cache/
|
|
53
|
+
|
|
54
|
+
# Translations
|
|
55
|
+
*.mo
|
|
56
|
+
*.pot
|
|
57
|
+
|
|
58
|
+
# Django stuff:
|
|
59
|
+
*.log
|
|
60
|
+
local_settings.py
|
|
61
|
+
db.sqlite3
|
|
62
|
+
db.sqlite3-journal
|
|
63
|
+
|
|
64
|
+
# Flask stuff:
|
|
65
|
+
instance/
|
|
66
|
+
.webassets-cache
|
|
67
|
+
|
|
68
|
+
# Scrapy stuff:
|
|
69
|
+
.scrapy
|
|
70
|
+
|
|
71
|
+
# Sphinx documentation
|
|
72
|
+
docs/_build/
|
|
73
|
+
|
|
74
|
+
# PyBuilder
|
|
75
|
+
target/
|
|
76
|
+
|
|
77
|
+
# Jupyter Notebook
|
|
78
|
+
.ipynb_checkpoints
|
|
79
|
+
|
|
80
|
+
# IPython
|
|
81
|
+
profile_default/
|
|
82
|
+
ipython_config.py
|
|
83
|
+
|
|
84
|
+
# pyenv
|
|
85
|
+
.python-version
|
|
86
|
+
|
|
87
|
+
# pipenv
|
|
88
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
89
|
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
90
|
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
91
|
+
# install all needed dependencies.
|
|
92
|
+
#Pipfile.lock
|
|
93
|
+
|
|
94
|
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
95
|
+
__pypackages__/
|
|
96
|
+
|
|
97
|
+
# Celery stuff
|
|
98
|
+
celerybeat-schedule
|
|
99
|
+
celerybeat.pid
|
|
100
|
+
|
|
101
|
+
# SageMath parsed files
|
|
102
|
+
*.sage.py
|
|
103
|
+
|
|
104
|
+
# Environments
|
|
105
|
+
.env
|
|
106
|
+
.venv
|
|
107
|
+
env/
|
|
108
|
+
venv/
|
|
109
|
+
ENV/
|
|
110
|
+
env.bak/
|
|
111
|
+
venv.bak/
|
|
112
|
+
|
|
113
|
+
# Spyder project settings
|
|
114
|
+
.spyderproject
|
|
115
|
+
.spyproject
|
|
116
|
+
|
|
117
|
+
# Rope project settings
|
|
118
|
+
.ropeproject
|
|
119
|
+
|
|
120
|
+
# mkdocs documentation
|
|
121
|
+
/site
|
|
122
|
+
|
|
123
|
+
# mypy
|
|
124
|
+
.mypy_cache/
|
|
125
|
+
.dmypy.json
|
|
126
|
+
dmypy.json
|
|
127
|
+
|
|
128
|
+
# Pyre type checker
|
|
129
|
+
.pyre/
|
|
130
|
+
|
|
131
|
+
# uv
|
|
132
|
+
.uv_cache/
|
|
133
|
+
|
|
134
|
+
# IDE
|
|
135
|
+
.vscode/
|
|
136
|
+
.idea/
|
|
137
|
+
*.swp
|
|
138
|
+
*.swo
|
|
139
|
+
|
|
140
|
+
# Data files (runtime artifacts: cached PDFs, search/download result JSON, logs)
|
|
141
|
+
data/pdfs/*
|
|
142
|
+
data/cache/*
|
|
143
|
+
data/logs/*.log
|
|
144
|
+
|
|
145
|
+
# OS
|
|
146
|
+
.DS_Store
|
|
147
|
+
Thumbs.db
|
|
148
|
+
|
|
149
|
+
# Environment variables
|
|
150
|
+
.env
|
|
151
|
+
.env.local
|
|
152
|
+
|
|
153
|
+
# Project specific
|
|
154
|
+
temp/
|
|
155
|
+
tmp/
|
|
156
|
+
|
|
157
|
+
# Test temporary directories
|
|
158
|
+
MagicMock/
|
|
159
|
+
PaperFetcher().cache_dir/
|
|
160
|
+
unittest.mock/
|
|
161
|
+
temp_*/
|
|
162
|
+
test_*/
|
|
163
|
+
*.tmp
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
# Ruff version.
|
|
4
|
+
rev: v0.5.0  # matches the ruff>=0.5.0 dev requirement so the hook and CI agree
|
|
5
|
+
hooks:
|
|
6
|
+
# Run the linter.
|
|
7
|
+
- id: ruff
|
|
8
|
+
args: [--fix]
|
|
9
|
+
# Run the formatter.
|
|
10
|
+
- id: ruff-format
|
|
11
|
+
|
|
12
|
+
- repo: local
|
|
13
|
+
hooks:
|
|
14
|
+
- id: mypy-local
|
|
15
|
+
name: mypy
|
|
16
|
+
entry: uv run mypy src/
|
|
17
|
+
language: system
|
|
18
|
+
pass_filenames: false
|
|
19
|
+
|
|
20
|
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
|
21
|
+
rev: v5.0.0
|
|
22
|
+
hooks:
|
|
23
|
+
- id: trailing-whitespace
|
|
24
|
+
- id: end-of-file-fixer
|
|
25
|
+
- id: check-yaml
|
|
26
|
+
- id: check-added-large-files
|
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# 更新日志
|
|
2
|
+
|
|
3
|
+
本文档记录了PDFGet项目的所有重要更改。
|
|
4
|
+
|
|
5
|
+
格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/),
|
|
6
|
+
并且本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。
|
|
7
|
+
|
|
8
|
+
## [0.1.0] - 2025-12-07
|
|
9
|
+
|
|
10
|
+
### 🎉 首次发布
|
|
11
|
+
|
|
12
|
+
#### ✨ 新增功能
|
|
13
|
+
- **高级文献搜索**:支持布尔运算符(AND、OR、NOT)、字段检索(title:、author:、journal:)、短语检索
|
|
14
|
+
- **智能缓存系统**:24小时自动过期,避免重复下载
|
|
15
|
+
- **并发下载**:多线程并行下载,3-5倍速度提升
|
|
16
|
+
- **批量处理**:支持CSV/TXT文件批量下载
|
|
17
|
+
- **丰富元数据**:包含作者、单位、期刊、摘要、引用等10+个字段
|
|
18
|
+
- **简洁命令行**:单字母参数(-s, -l, -t, -d, -v),易于使用
|
|
19
|
+
|
|
20
|
+
#### 🔧 核心特性
|
|
21
|
+
- **Europe PMC API集成**:权威学术数据源
|
|
22
|
+
- **线程安全设计**:并发环境下的数据一致性
|
|
23
|
+
- **智能重试机制**:网络错误自动重试
|
|
24
|
+
- **优雅降级**:PDF不可用时自动获取HTML全文
|
|
25
|
+
|
|
26
|
+
#### 📊 性能表现
|
|
27
|
+
| 文献数量 | 单线程耗时 | 并发耗时 | 性能提升 |
|
|
28
|
+
|---------|-----------|----------|----------|
|
|
29
|
+
| 5篇 | ~25秒 | ~8秒 | 3x |
|
|
30
|
+
| 20篇 | ~100秒 | ~25秒 | 4x |
|
|
31
|
+
| 50篇 | ~250秒 | ~60秒 | 4x |
|
|
32
|
+
|
|
33
|
+
#### 🛠️ 技术实现
|
|
34
|
+
- **Python 3.12+**:现代Python特性和类型注解
|
|
35
|
+
- **ThreadPoolExecutor**:高效的线程池管理
|
|
36
|
+
- **智能缓存**:24小时自动过期
|
|
37
|
+
- **模块化设计**:清晰的代码结构
|
|
38
|
+
- **自动化代码质量**:pre-commit hooks自动检查和修复
|
|
39
|
+
|
|
40
|
+
#### 📦 包结构
|
|
41
|
+
```
|
|
42
|
+
pdfget/
|
|
43
|
+
├── src/pdfget/
|
|
44
|
+
│ ├── __init__.py # 包初始化
|
|
45
|
+
│ ├── __main__.py # 命令行入口
|
|
46
|
+
│ ├── main.py # 主程序逻辑
|
|
47
|
+
│ ├── fetcher.py # 核心文献获取器
|
|
48
|
+
│ ├── downloader.py # 并发下载器
|
|
49
|
+
│ └── config.py # 配置文件
|
|
50
|
+
├── tests/ # 测试文件
|
|
51
|
+
├── data/ # 数据目录
|
|
52
|
+
│ ├── pdfs/ # 下载的PDF
|
|
53
|
+
│ └── cache/ # 缓存文件
|
|
54
|
+
├── tests/ # pytest测试
|
|
55
|
+
├── README.md # 项目文档
|
|
56
|
+
├── CHANGELOG.md # 更新日志
|
|
57
|
+
├── pyproject.toml # 项目配置
|
|
58
|
+
└── pytest.ini # 测试配置
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
#### 🧪 测试覆盖
|
|
62
|
+
- **28个测试用例**:100%通过率
|
|
63
|
+
- **核心功能覆盖**:60%+代码覆盖率
|
|
64
|
+
- **Mock测试**:避免实际网络请求
|
|
65
|
+
- **并发测试**:验证多线程安全性
|
|
66
|
+
|
|
67
|
+
#### 📖 使用示例
|
|
68
|
+
```bash
|
|
69
|
+
# 搜索文献
|
|
70
|
+
uv run pdfget -s "machine learning" -l 20
|
|
71
|
+
|
|
72
|
+
# 高级检索
|
|
73
|
+
uv run pdfget -s "cancer AND immunotherapy NOT review" -l 30
|
|
74
|
+
|
|
75
|
+
# 并发下载
|
|
76
|
+
uv run pdfget -s "deep learning" -l 50 -d -t 5
|
|
77
|
+
|
|
78
|
+
# 单篇下载
|
|
79
|
+
uv run pdfget --doi 10.1016/j.cell.2020.01.021
|
|
80
|
+
|
|
81
|
+
# 批量下载
|
|
82
|
+
uv run pdfget -i dois.csv -d -t 3
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
#### 🔍 高级检索语法
|
|
86
|
+
```bash
|
|
87
|
+
# 布尔运算符
|
|
88
|
+
uv run pdfget -s "cancer AND immunotherapy" -l 30
|
|
89
|
+
|
|
90
|
+
# 字段检索
|
|
91
|
+
uv run pdfget -s 'title:"deep learning" AND author:hinton'
|
|
92
|
+
|
|
93
|
+
# 短语检索
|
|
94
|
+
uv run pdfget -s '"quantum computing"' -l 10
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
#### 🏗️ 开发工具集成
|
|
98
|
+
- **ruff**:代码规范检查
|
|
99
|
+
- **pytest**:单元测试框架
|
|
100
|
+
- **pytest-cov**:测试覆盖率
|
|
101
|
+
- **uv**:现代Python包管理
|
|
102
|
+
|
|
103
|
+
#### 📄 许可证
|
|
104
|
+
- MIT License - 允许自由使用和修改
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## 版本说明
|
|
109
|
+
|
|
110
|
+
- **主版本**:不兼容的API修改
|
|
111
|
+
- **次版本**:向下兼容的功能性新增
|
|
112
|
+
- **修订版本**:向下兼容的问题修正
|
|
113
|
+
|
|
114
|
+
## 贡献指南
|
|
115
|
+
|
|
116
|
+
欢迎提交Issue和Pull Request来改进这个工具!
|
|
117
|
+
|
|
118
|
+
## 联系方式
|
|
119
|
+
|
|
120
|
+
如有问题或建议,请通过以下方式联系:
|
|
121
|
+
- 提交GitHub Issue
|
|
122
|
+
- 发送邮件至 gqy (qingyu_ge@foxmail.com)
|
pdfget-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pdfget
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: 智能文献搜索与批量下载工具,支持高级检索和并发下载
|
|
5
|
+
Author-email: gqy <qingyu_ge@foxmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.12
|
|
8
|
+
Requires-Dist: pandas>=2.0.0
|
|
9
|
+
Requires-Dist: requests>=2.31.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: black>=24.0.0; extra == 'dev'
|
|
12
|
+
Requires-Dist: isort>=5.13.0; extra == 'dev'
|
|
13
|
+
Requires-Dist: mypy>=1.9.0; extra == 'dev'
|
|
14
|
+
Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
|
|
15
|
+
Requires-Dist: pytest>=8.0.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: ruff>=0.5.0; extra == 'dev'
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# PDFGet - 高效文献下载工具
|
|
20
|
+
|
|
21
|
+
智能文献搜索与批量下载工具,支持高级检索和并发下载。
|
|
22
|
+
|
|
23
|
+
## 1. 项目概述
|
|
24
|
+
|
|
25
|
+
PDFGet是一个专为科研工作者设计的智能文献搜索与批量下载工具,集成了Europe PMC等权威学术数据库,提供高效的文献获取和管理功能。
|
|
26
|
+
|
|
27
|
+
### 1.1 主要特性
|
|
28
|
+
|
|
29
|
+
- 🔍 **高级搜索**:支持布尔运算符、字段检索、短语检索
|
|
30
|
+
- 🚀 **并发下载**:多线程并行下载,3-5倍速度提升
|
|
31
|
+
- 📊 **丰富元数据**:包含作者、单位、期刊、摘要、引用等完整信息
|
|
32
|
+
- 💾 **智能缓存**:24小时缓存,避免重复下载
|
|
33
|
+
- 📄 **批量处理**:支持CSV/TXT文件批量下载
|
|
34
|
+
|
|
35
|
+
## 2. 安装与配置
|
|
36
|
+
|
|
37
|
+
### 2.1 系统要求
|
|
38
|
+
|
|
39
|
+
详细的系统要求和依赖信息请查看 [pyproject.toml](pyproject.toml) 文件。
|
|
40
|
+
|
|
41
|
+
### 2.2 安装方法
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# 使用pip安装
|
|
45
|
+
pip install pdfget
|
|
46
|
+
|
|
47
|
+
# 使用uv安装
|
|
48
|
+
uv add pdfget
|
|
49
|
+
|
|
50
|
+
# 或从源码安装
|
|
51
|
+
git clone https://github.com/gqy20/pdfget.git
|
|
52
|
+
cd pdfget
|
|
53
|
+
pip install -e .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### 2.3 快速开始
|
|
57
|
+
|
|
58
|
+
安装完成后,您可以直接使用 `pdfget` 命令:
|
|
59
|
+
|
|
60
|
+
```bash
|
|
61
|
+
# 搜索文献
|
|
62
|
+
pdfget -s "machine learning" -l 20
|
|
63
|
+
|
|
64
|
+
# 搜索并下载
|
|
65
|
+
pdfget -s "cancer immunotherapy" -d
|
|
66
|
+
|
|
67
|
+
# 并发下载(5线程)
|
|
68
|
+
pdfget -s "deep learning" -l 50 -d -t 5
|
|
69
|
+
|
|
70
|
+
# 单篇文献下载
|
|
71
|
+
pdfget --doi 10.1016/j.cell.2020.01.021
|
|
72
|
+
|
|
73
|
+
# 批量下载
|
|
74
|
+
pdfget -i dois.csv -d -t 3
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
如果您使用 uv 作为包管理器,也可以:
|
|
78
|
+
```bash
|
|
79
|
+
# 使用uv运行
|
|
80
|
+
uv run pdfget -s "machine learning" -l 20
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## 3. 高级检索语法
|
|
84
|
+
|
|
85
|
+
### 3.1 布尔运算符
|
|
86
|
+
```bash
|
|
87
|
+
# AND: 同时包含多个关键词
|
|
88
|
+
pdfget -s "cancer AND immunotherapy" -l 30
|
|
89
|
+
|
|
90
|
+
# OR: 包含任意关键词
|
|
91
|
+
pdfget -s "machine OR deep learning" -l 20
|
|
92
|
+
|
|
93
|
+
# NOT: 排除特定词汇
|
|
94
|
+
pdfget -s "cancer AND immunotherapy NOT review" -l 30
|
|
95
|
+
|
|
96
|
+
# 复杂组合
|
|
97
|
+
pdfget -s "(cancer OR tumor) AND immunotherapy NOT mice" -l 25
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### 3.2 字段检索
|
|
101
|
+
```bash
|
|
102
|
+
# 标题检索
|
|
103
|
+
pdfget -s 'title:"deep learning"' -l 15
|
|
104
|
+
|
|
105
|
+
# 作者检索
|
|
106
|
+
pdfget -s 'author:hinton AND title:"neural networks"' -l 10
|
|
107
|
+
|
|
108
|
+
# 期刊检索
|
|
109
|
+
pdfget -s 'journal:nature AND cancer' -l 20
|
|
110
|
+
|
|
111
|
+
# 年份检索
|
|
112
|
+
pdfget -s 'cancer AND year:2023' -l 15
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### 3.3 短语和精确匹配
|
|
116
|
+
```bash
|
|
117
|
+
# 短语检索(用双引号)
|
|
118
|
+
pdfget -s '"quantum computing"' -l 10
|
|
119
|
+
|
|
120
|
+
# 混合使用
|
|
121
|
+
pdfget -s '"gene expression" AND (cancer OR tumor) NOT review' -l 20
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### 3.4 实用检索技巧
|
|
125
|
+
- 使用括号分组复杂的布尔逻辑
|
|
126
|
+
- 短语用双引号确保精确匹配
|
|
127
|
+
- 可以组合多个字段进行精确检索
|
|
128
|
+
- 使用 NOT 过滤掉不相关的结果(如综述、评论等)
|
|
129
|
+
|
|
130
|
+
## 4. 性能优势
|
|
131
|
+
|
|
132
|
+
### 4.1 并发下载效率对比
|
|
133
|
+
|
|
134
|
+
| 文献数量 | 单线程耗时 | 并发耗时 | 性能提升 |
|
|
135
|
+
|---------|-----------|----------|----------|
|
|
136
|
+
| 5篇 | ~25秒 | ~8秒 | 3x |
|
|
137
|
+
| 20篇 | ~100秒 | ~25秒 | 4x |
|
|
138
|
+
| 50篇 | ~250秒 | ~60秒 | 4x |
|
|
139
|
+
|
|
140
|
+
## 5. 命令行参数详解
|
|
141
|
+
|
|
142
|
+
### 5.1 核心参数
|
|
143
|
+
- `-s QUERY` : 搜索文献
|
|
144
|
+
- `--doi DOI` : 下载单个文献
|
|
145
|
+
- `-i FILE` : 批量输入文件
|
|
146
|
+
- `-d` : 下载PDF
|
|
147
|
+
|
|
148
|
+
### 5.2 优化参数
|
|
149
|
+
- `-l NUM` : 搜索结果数量(默认50)
|
|
150
|
+
- `-t NUM` : 并发线程数(默认3)
|
|
151
|
+
- `-v` : 详细输出
|
|
152
|
+
|
|
153
|
+
## 6. 输出格式与文件结构
|
|
154
|
+
|
|
155
|
+
### 6.1 搜索结果格式
|
|
156
|
+
```json
|
|
157
|
+
{
|
|
158
|
+
"query": "关键词",
|
|
159
|
+
"total": 10,
|
|
160
|
+
"results": [
|
|
161
|
+
{
|
|
162
|
+
"title": "文献标题",
|
|
163
|
+
"authors": ["作者1", "作者2"],
|
|
164
|
+
"journal": "期刊名称",
|
|
165
|
+
"year": "2025",
|
|
166
|
+
"doi": "10.1016/xxx",
|
|
167
|
+
"affiliation": "作者单位",
|
|
168
|
+
"citedBy": 0,
|
|
169
|
+
"keywords": ["关键词1", "关键词2"]
|
|
170
|
+
}
|
|
171
|
+
]
|
|
172
|
+
}
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
### 6.2 文件目录结构
|
|
176
|
+
```
|
|
177
|
+
data/
|
|
178
|
+
├── pdfs/ # 下载的PDF文件
|
|
179
|
+
├── cache/ # 缓存文件
|
|
180
|
+
└── download_results.json # 下载结果记录
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
## 7. 许可证
|
|
184
|
+
|
|
185
|
+
本项目采用 MIT License,允许自由使用和修改。
|
|
186
|
+
|
|
187
|
+
## 📚 更新日志
|
|
188
|
+
|
|
189
|
+
<details>
|
|
190
|
+
<summary><strong>📋 查看版本更新历史</strong></summary>
|
|
191
|
+
|
|
192
|
+
- 🔗 **完整更新日志**: [CHANGELOG.md](CHANGELOG.md)
|
|
193
|
+
- ✨ **最新版本 (v0.1.0)**: 高级文献搜索 + 并发下载 + 智能缓存
|
|
194
|
+
|
|
195
|
+
</details>
|
|
196
|
+
|
|
197
|
+
## 🔗 相关链接
|
|
198
|
+
|
|
199
|
+
- **项目源码**: [GitHub Repository](https://github.com/gqy20/pdfget)
|
|
200
|
+
- **问题反馈**: [GitHub Issues](https://github.com/gqy20/pdfget/issues)
|