pdfget 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pdfget-0.1.0/.github/workflows/ci.yml +59 -0
  2. pdfget-0.1.0/.github/workflows/publish.yml +141 -0
  3. pdfget-0.1.0/.gitignore +163 -0
  4. pdfget-0.1.0/.pre-commit-config.yaml +26 -0
  5. pdfget-0.1.0/CHANGELOG.md +122 -0
  6. pdfget-0.1.0/PKG-INFO +200 -0
  7. pdfget-0.1.0/README.md +182 -0
  8. pdfget-0.1.0/data/cache/PMC12501010_10.1038s41467-025-64046-1.pdf +22747 -25
  9. pdfget-0.1.0/data/cache/PMC12542208_10.1186s12870-025-07447-0.pdf +19027 -61
  10. pdfget-0.1.0/data/cache/PMC12595684_10.1186s12864-025-12188-3.pdf +9087 -35
  11. pdfget-0.1.0/data/cache/PMC12603064_10.1038s41467-025-64846-5.pdf +12843 -10
  12. pdfget-0.1.0/data/cache/PMC12613758_10.1186s12870-025-07717-x.pdf +4570 -15
  13. pdfget-0.1.0/data/cache/PMC12625569_10.1186s12870-025-07737-7.pdf +3822 -16
  14. pdfget-0.1.0/data/cache/PMC12638397_10.1007s00423-025-03883-6.pdf +29451 -97
  15. pdfget-0.1.0/data/cache/PMC12642243_10.1186s12870-025-07791-1.pdf +3601 -13
  16. pdfget-0.1.0/data/cache/PMC12645687_10.1186s12866-025-04485-4.pdf +21230 -64
  17. pdfget-0.1.0/data/cache/PMC12652641_10.3390ijms262210839.pdf +0 -0
  18. pdfget-0.1.0/data/cache/PMC12654256_10.3390microorganisms13112456.pdf +0 -0
  19. pdfget-0.1.0/data/cache/PMC12658294_10.1017qpb.2025.10024.pdf +0 -0
  20. pdfget-0.1.0/data/cache/PMC12661703_10.1186s12870-025-07807-w.pdf +3777 -15
  21. pdfget-0.1.0/data/cache/PMC7809815_10.1186s12870-020-02802-9.pdf +2781 -6
  22. pdfget-0.1.0/data/example_dois.csv +4 -0
  23. pdfget-0.1.0/data/example_dois.txt +3 -0
  24. pdfget-0.1.0/data/pdfs/download_results.json +63 -0
  25. pdfget-0.1.0/data/pdfs/search_results_1765097051.json +0 -0
  26. pdfget-0.1.0/data/pdfs/search_results_1765097059.json +0 -0
  27. pdfget-0.1.0/data/pdfs/search_results_1765097078.json +0 -0
  28. pdfget-0.1.0/data/pdfs/search_results_1765097089.json +66 -0
  29. pdfget-0.1.0/data/pdfs/search_results_1765097095.json +39 -0
  30. pdfget-0.1.0/data/pdfs/search_results_1765097099.json +39 -0
  31. pdfget-0.1.0/data/pdfs/search_results_1765097230.json +48 -0
  32. pdfget-0.1.0/data/pdfs/search_results_1765097236.json +21 -0
  33. pdfget-0.1.0/data/pdfs/search_results_1765097241.json +27 -0
  34. pdfget-0.1.0/data/pdfs/search_results_1765097294.json +45 -0
  35. pdfget-0.1.0/data/pdfs/search_results_1765097505.json +257 -0
  36. pdfget-0.1.0/data/pdfs/search_results_1765097514.json +257 -0
  37. pdfget-0.1.0/data/pdfs/search_results_1765097746.json +95 -0
  38. pdfget-0.1.0/data/pdfs/search_results_1765097756.json +161 -0
  39. pdfget-0.1.0/data/pdfs/search_results_1765097906.json +357 -0
  40. pdfget-0.1.0/data/pdfs/search_results_1765097955.json +167 -0
  41. pdfget-0.1.0/data/pdfs/search_results_1765097995.json +183 -0
  42. pdfget-0.1.0/data/pdfs/search_results_1765098011.json +247 -0
  43. pdfget-0.1.0/data/pdfs/search_results_1765098173.json +206 -0
  44. pdfget-0.1.0/data/pdfs/search_results_1765098189.json +143 -0
  45. pdfget-0.1.0/data/pdfs/search_results_1765112541.json +2090 -0
  46. pdfget-0.1.0/data/pdfs/search_results_1765112944.json +1037 -0
  47. pdfget-0.1.0/data/pdfs/search_results_1765114438.json +17 -0
  48. pdfget-0.1.0/pyproject.toml +90 -0
  49. pdfget-0.1.0/pytest.ini +21 -0
  50. pdfget-0.1.0/src/pdfget/__init__.py +12 -0
  51. pdfget-0.1.0/src/pdfget/__main__.py +6 -0
  52. pdfget-0.1.0/src/pdfget/config.py +30 -0
  53. pdfget-0.1.0/src/pdfget/downloader.py +308 -0
  54. pdfget-0.1.0/src/pdfget/fetcher.py +415 -0
  55. pdfget-0.1.0/src/pdfget/main.py +282 -0
  56. pdfget-0.1.0/tests/__init__.py +3 -0
  57. pdfget-0.1.0/tests/conftest.py +140 -0
  58. pdfget-0.1.0/tests/test_basic_functionality.py +100 -0
  59. pdfget-0.1.0/tests/test_config.py +86 -0
  60. pdfget-0.1.0/tests/test_fetcher_basic.py +175 -0
  61. pdfget-0.1.0/uv.lock +1142 -0
@@ -0,0 +1,59 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, develop ]
6
+ pull_request:
7
+ branches: [ main, develop ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.12", "3.13"]
15
+
16
+ steps:
17
+ - name: Checkout code
18
+ uses: actions/checkout@v4
19
+
20
+ - name: Set up Python ${{ matrix.python-version }}
21
+ uses: actions/setup-python@v4
22
+ with:
23
+ python-version: ${{ matrix.python-version }}
24
+
25
+ - name: Install uv
26
+ uses: astral-sh/setup-uv@v2
27
+
28
+ - name: Install dependencies
29
+ run: |
30
+ uv sync --dev
31
+
32
+ - name: Run code quality checks
33
+ run: |
34
+ uv run ruff check .
35
+ uv run ruff format --check .
36
+
37
+ - name: Run type checking
38
+ run: |
39
+ uv run mypy src/
40
+
41
+ - name: Run tests
42
+ run: |
43
+ uv run pytest tests/ -v --cov=pdfget --cov-report=xml
44
+
45
+ - name: Upload coverage to Codecov
46
+ uses: codecov/codecov-action@v4
47
+ with:
48
+ file: ./coverage.xml
49
+ flags: unittests
50
+ name: codecov-umbrella
51
+ fail_ci_if_error: false
52
+
53
+ - name: Build package
54
+ run: |
55
+ uv build
56
+
57
+ - name: Check package
58
+ run: |
59
+ uv run twine check dist/*
@@ -0,0 +1,141 @@
1
+ name: Release & Publish
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*' # 触发条件:推送以 v 开头的标签
7
+
8
+ permissions:
9
+ contents: write # 允许创建release
10
+
11
+ jobs:
12
+ release-and-publish:
13
+ runs-on: ubuntu-latest
14
+
15
+ steps:
16
+ - name: Checkout code
17
+ uses: actions/checkout@v6.0.1
18
+
19
+ - name: Set up Python 3.12
20
+ uses: actions/setup-python@v5
21
+ with:
22
+ python-version: "3.12"
23
+
24
+ - name: Install uv
25
+ uses: astral-sh/setup-uv@v7.1.4
26
+
27
+ - name: Install dependencies
28
+ run: |
29
+ uv sync --dev
30
+
31
+ - name: Run tests
32
+ run: |
33
+ uv run pytest tests/ -v --cov=src --cov-report=xml
34
+
35
+ - name: Run code quality checks
36
+ run: |
37
+ uv run ruff check .
38
+ uv run ruff format --check .
39
+ uv run mypy src/
40
+
41
+ - name: Build package
42
+ run: |
43
+ uv build
44
+
45
+ - name: Check package
46
+ run: |
47
+ uv run twine check dist/*
48
+
49
+ - name: Generate release notes
50
+ id: release_notes
51
+ run: |
52
+ # 从CHANGELOG.md提取最新版本信息
53
+ python << 'EOF'
54
+ import re
55
+
56
+ with open('CHANGELOG.md', 'r', encoding='utf-8') as f:
57
+ content = f.read()
58
+
59
+ # 查找最新的版本信息
60
+ version_match = re.search(r'^## \[([0-9.]+)\] - ([0-9-]+)', content, re.MULTILINE)
61
+ if version_match:
62
+ version = version_match.group(1)
63
+ date = version_match.group(2)
64
+
65
+ # 提取该版本的主要内容
66
+ version_start = version_match.start()
67
+ next_version_match = re.search(r'\n## \[', content[version_start + 1:])
68
+ if next_version_match:
69
+ version_content = content[version_start:version_start + next_version_match.start()]
70
+ else:
71
+ version_content = content[version_start:]
72
+
73
+ # 提取主要功能点
74
+ features = []
75
+ lines = version_content.split('\n')
76
+ for line in lines:
77
+ if '新增功能' in line:
78
+ continue
79
+ if line.startswith('- **') and (':' in line or ':' in line):
80
+ features.append(line.strip())
81
+
82
+ # 设置输出
83
+ print(f"::set-output name=version::{version}")
84
+ print(f"::set-output name=date::{date}")
85
+ print(f"::set-output name=features::{'\\n'.join(features[:8])}") # 限制显示数量
86
+ else:
87
+ print("::set-output name=version::unknown")
88
+ print("::set-output name=features::No release notes found")
89
+ EOF
90
+
91
+ - name: Create GitHub Release
92
+ uses: softprops/action-gh-release@v2.5.0
93
+ with:
94
+ tag_name: ${{ github.ref_name }}
95
+ name: Release ${{ steps.release_notes.outputs.version }}
96
+ body: |
97
+ 🎉 PDFGet ${{ steps.release_notes.outputs.version }} 发布!
98
+
99
+ ## 📦 安装方法
100
+ ```bash
101
+ pip install pdfget==${{ steps.release_notes.outputs.version }}
102
+ # 或使用uv
103
+ uv add pdfget==${{ steps.release_notes.outputs.version }}
104
+ ```
105
+
106
+ ## 🚀 主要特性
107
+ ${{ steps.release_notes.outputs.features }}
108
+
109
+ ## 📖 使用示例
110
+ ```bash
111
+ # 搜索并下载文献
112
+ pdfget -s "machine learning" -l 20 -d -t 5
113
+
114
+ # 高级检索
115
+ pdfget -s "cancer AND immunotherapy NOT review" -l 30
116
+
117
+ # 批量下载
118
+ pdfget -i dois.csv -d -t 3
119
+ ```
120
+
121
+ ## 📋 完整更新日志
122
+ 请查看 [CHANGELOG.md](https://github.com/${{ github.repository }}/blob/main/CHANGELOG.md)
123
+
124
+ ---
125
+
126
+ 🤖 自动发布于 ${{ steps.release_notes.outputs.date }}
127
+ draft: false
128
+ prerelease: false
129
+
130
+ - name: Publish to PyPI
131
+ env:
132
+ PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
133
+ run: |
134
+ uv publish --token $PYPI_API_TOKEN
135
+
136
+ - name: Release Confirmation
137
+ if: success()
138
+ run: |
139
+ echo "✅ Release ${{ steps.release_notes.outputs.version }} published successfully!"
140
+ echo "📦 PyPI: https://pypi.org/project/pdfget/${{ steps.release_notes.outputs.version }}/"
141
+ echo "🔗 GitHub: https://github.com/${{ github.repository }}/releases/tag/${{ github.ref_name }}"
@@ -0,0 +1,163 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # uv
132
+ .uv_cache/
133
+
134
+ # IDE
135
+ .vscode/
136
+ .idea/
137
+ *.swp
138
+ *.swo
139
+
140
+ # Data files
141
+ data/pdfs/*.pdf
142
+ data/cache/*.json
143
+ data/logs/*.log
144
+
145
+ # OS
146
+ .DS_Store
147
+ Thumbs.db
148
+
149
+ # Environment variables
150
+ .env
151
+ .env.local
152
+
153
+ # Project specific
154
+ temp/
155
+ tmp/
156
+
157
+ # Test temporary directories
158
+ MagicMock/
159
+ PaperFetcher().cache_dir/
160
+ unittest.mock/
161
+ temp_*/
162
+ test_*/
163
+ *.tmp
@@ -0,0 +1,26 @@
1
+ repos:
2
+ - repo: https://github.com/astral-sh/ruff-pre-commit
3
+ # Ruff version.
4
+ rev: v0.1.14
5
+ hooks:
6
+ # Run the linter.
7
+ - id: ruff
8
+ args: [--fix]
9
+ # Run the formatter.
10
+ - id: ruff-format
11
+
12
+ - repo: local
13
+ hooks:
14
+ - id: mypy-local
15
+ name: mypy
16
+ entry: uv run mypy src/
17
+ language: system
18
+ pass_filenames: false
19
+
20
+ - repo: https://github.com/pre-commit/pre-commit-hooks
21
+ rev: v5.0.0
22
+ hooks:
23
+ - id: trailing-whitespace
24
+ - id: end-of-file-fixer
25
+ - id: check-yaml
26
+ - id: check-added-large-files
@@ -0,0 +1,122 @@
1
+ # 更新日志
2
+
3
+ 本文档记录了PDFGet项目的所有重要更改。
4
+
5
+ 格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/),
6
+ 并且本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。
7
+
8
+ ## [0.1.0] - 2025-12-07
9
+
10
+ ### 🎉 首次发布
11
+
12
+ #### ✨ 新增功能
13
+ - **高级文献搜索**:支持布尔运算符(AND、OR、NOT)、字段检索(title:、author:、journal:)、短语检索
14
+ - **智能缓存系统**:24小时自动过期,避免重复下载
15
+ - **并发下载**:多线程并行下载,3-5倍速度提升
16
+ - **批量处理**:支持CSV/TXT文件批量下载
17
+ - **丰富元数据**:包含作者、单位、期刊、摘要、引用等10+个字段
18
+ - **简洁命令行**:单字母参数(-s, -l, -t, -d, -v),易于使用
19
+
20
+ #### 🔧 核心特性
21
+ - **Europe PMC API集成**:权威学术数据源
22
+ - **线程安全设计**:并发环境下的数据一致性
23
+ - **智能重试机制**:网络错误自动重试
24
+ - **优雅降级**:PDF不可用时自动获取HTML全文
25
+
26
+ #### 📊 性能表现
27
+ | 文献数量 | 单线程耗时 | 并发耗时 | 性能提升 |
28
+ |---------|-----------|----------|----------|
29
+ | 5篇 | ~25秒 | ~8秒 | 3x |
30
+ | 20篇 | ~100秒 | ~25秒 | 4x |
31
+ | 50篇 | ~250秒 | ~60秒 | 4x |
32
+
33
+ #### 🛠️ 技术实现
34
+ - **Python 3.12+**:现代Python特性和类型注解
35
+ - **ThreadPoolExecutor**:高效的线程池管理
36
+ - **智能缓存**:24小时自动过期
37
+ - **模块化设计**:清晰的代码结构
38
+ - **自动化代码质量**:pre-commit hooks自动检查和修复
39
+
40
+ #### 📦 包结构
41
+ ```
42
+ pdfget/
43
+ ├── src/pdfget/
44
+ │ ├── __init__.py # 包初始化
45
+ │ ├── __main__.py # 命令行入口
46
+ │ ├── main.py # 主程序逻辑
47
+ │ ├── fetcher.py # 核心文献获取器
48
+ │ ├── concurrent_downloader.py # 并发下载器
49
+ │ └── config.py # 配置文件
50
+ ├── tests/ # 测试文件
51
+ ├── data/ # 数据目录
52
+ │ ├── pdfs/ # 下载的PDF
53
+ │ └── cache/ # 缓存文件
54
+ ├── tests/ # pytest测试
55
+ ├── README.md # 项目文档
56
+ ├── CHANGELOG.md # 更新日志
57
+ ├── pyproject.toml # 项目配置
58
+ └── pytest.ini # 测试配置
59
+ ```
60
+
61
+ #### 🧪 测试覆盖
62
+ - **28个测试用例**:100%通过率
63
+ - **核心功能覆盖**:60%+代码覆盖率
64
+ - **Mock测试**:避免实际网络请求
65
+ - **并发测试**:验证多线程安全性
66
+
67
+ #### 📖 使用示例
68
+ ```bash
69
+ # 搜索文献
70
+ uv run pdfget -s "machine learning" -l 20
71
+
72
+ # 高级检索
73
+ uv run pdfget -s "cancer AND immunotherapy NOT review" -l 30
74
+
75
+ # 并发下载
76
+ uv run pdfget -s "deep learning" -l 50 -d -t 5
77
+
78
+ # 单篇下载
79
+ uv run pdfget --doi 10.1016/j.cell.2020.01.021
80
+
81
+ # 批量下载
82
+ uv run pdfget -i dois.csv -d -t 3
83
+ ```
84
+
85
+ #### 🔍 高级检索语法
86
+ ```bash
87
+ # 布尔运算符
88
+ uv run pdfget -s "cancer AND immunotherapy" -l 30
89
+
90
+ # 字段检索
91
+ uv run pdfget -s 'title:"deep learning" AND author:hinton'
92
+
93
+ # 短语检索
94
+ uv run pdfget -s '"quantum computing"' -l 10
95
+ ```
96
+
97
+ #### 🏗️ 开发工具集成
98
+ - **ruff**:代码规范检查
99
+ - **pytest**:单元测试框架
100
+ - **pytest-cov**:测试覆盖率
101
+ - **uv**:现代Python包管理
102
+
103
+ #### 📄 许可证
104
+ - MIT License - 允许自由使用和修改
105
+
106
+ ---
107
+
108
+ ## 版本说明
109
+
110
+ - **主版本**:不兼容的API修改
111
+ - **次版本**:向下兼容的功能性新增
112
+ - **修订版本**:向下兼容的问题修正
113
+
114
+ ## 贡献指南
115
+
116
+ 欢迎提交Issue和Pull Request来改进这个工具!
117
+
118
+ ## 联系方式
119
+
120
+ 如有问题或建议,请通过以下方式联系:
121
+ - 提交GitHub Issue
122
+ - 发送邮件至 gqy (qingyu_ge@foxmail.com)
pdfget-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,200 @@
1
+ Metadata-Version: 2.4
2
+ Name: pdfget
3
+ Version: 0.1.0
4
+ Summary: 智能文献搜索与批量下载工具,支持高级检索和并发下载
5
+ Author-email: gqy <qingyu_ge@foxmail.com>
6
+ License: MIT
7
+ Requires-Python: >=3.12
8
+ Requires-Dist: pandas>=2.0.0
9
+ Requires-Dist: requests>=2.31.0
10
+ Provides-Extra: dev
11
+ Requires-Dist: black>=24.0.0; extra == 'dev'
12
+ Requires-Dist: isort>=5.13.0; extra == 'dev'
13
+ Requires-Dist: mypy>=1.9.0; extra == 'dev'
14
+ Requires-Dist: pytest-cov>=5.0.0; extra == 'dev'
15
+ Requires-Dist: pytest>=8.0.0; extra == 'dev'
16
+ Requires-Dist: ruff>=0.5.0; extra == 'dev'
17
+ Description-Content-Type: text/markdown
18
+
19
+ # PDFGet - 高效文献下载工具
20
+
21
+ 智能文献搜索与批量下载工具,支持高级检索和并发下载。
22
+
23
+ ## 1. 项目概述
24
+
25
+ PDFGet是一个专为科研工作者设计的智能文献搜索与批量下载工具,集成了Europe PMC等权威学术数据库,提供高效的文献获取和管理功能。
26
+
27
+ ### 1.1 主要特性
28
+
29
+ - 🔍 **高级搜索**:支持布尔运算符、字段检索、短语检索
30
+ - 🚀 **并发下载**:多线程并行下载,3-5倍速度提升
31
+ - 📊 **丰富元数据**:包含作者、单位、期刊、摘要、引用等完整信息
32
+ - 💾 **智能缓存**:24小时缓存,避免重复下载
33
+ - 📄 **批量处理**:支持CSV/TXT文件批量下载
34
+
35
+ ## 2. 安装与配置
36
+
37
+ ### 2.1 系统要求
38
+
39
+ 详细的系统要求和依赖信息请查看 [pyproject.toml](pyproject.toml) 文件。
40
+
41
+ ### 2.2 安装方法
42
+
43
+ ```bash
44
+ # 使用pip安装
45
+ pip install pdfget
46
+
47
+ # 使用uv安装
48
+ uv add pdfget
49
+
50
+ # 或从源码安装
51
+ git clone https://github.com/gqy20/pdfget.git
52
+ cd pdfget
53
+ pip install -e .
54
+ ```
55
+
56
+ ### 2.3 快速开始
57
+
58
+ 安装完成后,您可以直接使用 `pdfget` 命令:
59
+
60
+ ```bash
61
+ # 搜索文献
62
+ pdfget -s "machine learning" -l 20
63
+
64
+ # 搜索并下载
65
+ pdfget -s "cancer immunotherapy" -d
66
+
67
+ # 并发下载(5线程)
68
+ pdfget -s "deep learning" -l 50 -d -t 5
69
+
70
+ # 单篇文献下载
71
+ pdfget --doi 10.1016/j.cell.2020.01.021
72
+
73
+ # 批量下载
74
+ pdfget -i dois.csv -d -t 3
75
+ ```
76
+
77
+ 如果您使用 uv 作为包管理器,也可以:
78
+ ```bash
79
+ # 使用uv运行
80
+ uv run pdfget -s "machine learning" -l 20
81
+ ```
82
+
83
+ ## 3. 高级检索语法
84
+
85
+ ### 3.1 布尔运算符
86
+ ```bash
87
+ # AND: 同时包含多个关键词
88
+ pdfget -s "cancer AND immunotherapy" -l 30
89
+
90
+ # OR: 包含任意关键词
91
+ pdfget -s "machine OR deep learning" -l 20
92
+
93
+ # NOT: 排除特定词汇
94
+ pdfget -s "cancer AND immunotherapy NOT review" -l 30
95
+
96
+ # 复杂组合
97
+ pdfget -s "(cancer OR tumor) AND immunotherapy NOT mice" -l 25
98
+ ```
99
+
100
+ ### 3.2 字段检索
101
+ ```bash
102
+ # 标题检索
103
+ pdfget -s 'title:"deep learning"' -l 15
104
+
105
+ # 作者检索
106
+ pdfget -s 'author:hinton AND title:"neural networks"' -l 10
107
+
108
+ # 期刊检索
109
+ pdfget -s 'journal:nature AND cancer' -l 20
110
+
111
+ # 年份检索
112
+ pdfget -s 'cancer AND year:2023' -l 15
113
+ ```
114
+
115
+ ### 3.3 短语和精确匹配
116
+ ```bash
117
+ # 短语检索(用双引号)
118
+ pdfget -s '"quantum computing"' -l 10
119
+
120
+ # 混合使用
121
+ pdfget -s '"gene expression" AND (cancer OR tumor) NOT review' -l 20
122
+ ```
123
+
124
+ ### 3.4 实用检索技巧
125
+ - 使用括号分组复杂的布尔逻辑
126
+ - 短语用双引号确保精确匹配
127
+ - 可以组合多个字段进行精确检索
128
+ - 使用 NOT 过滤掉不相关的结果(如综述、评论等)
129
+
130
+ ## 4. 性能优势
131
+
132
+ ### 4.1 并发下载效率对比
133
+
134
+ | 文献数量 | 单线程耗时 | 并发耗时 | 性能提升 |
135
+ |---------|-----------|----------|----------|
136
+ | 5篇 | ~25秒 | ~8秒 | 3x |
137
+ | 20篇 | ~100秒 | ~25秒 | 4x |
138
+ | 50篇 | ~250秒 | ~60秒 | 4x |
139
+
140
+ ## 5. 命令行参数详解
141
+
142
+ ### 5.1 核心参数
143
+ - `-s QUERY` : 搜索文献
144
+ - `--doi DOI` : 下载单个文献
145
+ - `-i FILE` : 批量输入文件
146
+ - `-d` : 下载PDF
147
+
148
+ ### 5.2 优化参数
149
+ - `-l NUM` : 搜索结果数量(默认50)
150
+ - `-t NUM` : 并发线程数(默认3)
151
+ - `-v` : 详细输出
152
+
153
+ ## 6. 输出格式与文件结构
154
+
155
+ ### 6.1 搜索结果格式
156
+ ```json
157
+ {
158
+ "query": "关键词",
159
+ "total": 10,
160
+ "results": [
161
+ {
162
+ "title": "文献标题",
163
+ "authors": ["作者1", "作者2"],
164
+ "journal": "期刊名称",
165
+ "year": "2025",
166
+ "doi": "10.1016/xxx",
167
+ "affiliation": "作者单位",
168
+ "citedBy": 0,
169
+ "keywords": ["关键词1", "关键词2"]
170
+ }
171
+ ]
172
+ }
173
+ ```
174
+
175
+ ### 6.2 文件目录结构
176
+ ```
177
+ data/
178
+ ├── pdfs/ # 下载的PDF文件
179
+ ├── cache/ # 缓存文件
180
+ └── download_results.json # 下载结果记录
181
+ ```
182
+
183
+ ## 7. 许可证
184
+
185
+ 本项目采用 MIT License,允许自由使用和修改。
186
+
187
+ ## 📚 更新日志
188
+
189
+ <details>
190
+ <summary><strong>📋 查看版本更新历史</strong></summary>
191
+
192
+ - 🔗 **完整更新日志**: [CHANGELOG.md](CHANGELOG.md)
193
+ - ✨ **最新版本 (v0.1.0)**: 高级文献搜索 + 并发下载 + 智能缓存
194
+
195
+ </details>
196
+
197
+ ## 🔗 相关链接
198
+
199
+ - **项目源码**: [GitHub Repository](https://github.com/gqy20/pdfget)
200
+ - **问题反馈**: [GitHub Issues](https://github.com/gqy20/pdfget/issues)