docpull 1.0.1__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docpull-1.1.0/PKG-INFO +221 -0
- docpull-1.1.0/README.md +154 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/__init__.py +1 -1
- {docpull-1.0.1 → docpull-1.1.0}/docpull/cli.py +83 -12
- docpull-1.1.0/docpull/doctor.py +188 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/async_fetcher.py +8 -8
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/base.py +1 -3
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/plaid.py +3 -3
- docpull-1.1.0/docpull.egg-info/PKG-INFO +221 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/SOURCES.txt +1 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/requires.txt +1 -1
- {docpull-1.0.1 → docpull-1.1.0}/pyproject.toml +6 -2
- docpull-1.0.1/PKG-INFO +0 -440
- docpull-1.0.1/README.md +0 -373
- docpull-1.0.1/docpull.egg-info/PKG-INFO +0 -440
- {docpull-1.0.1 → docpull-1.1.0}/LICENSE +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/__main__.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/config.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/__init__.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/bun.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/d3.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/generic.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/generic_async.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/nextjs.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/react.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/stripe.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/tailwind.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/turborepo.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/__init__.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/base.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/bun.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/d3.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/nextjs.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/plaid.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/react.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/stripe.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/tailwind.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/turborepo.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/py.typed +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/utils/__init__.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/utils/file_utils.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull/utils/logging_config.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/entry_points.txt +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/top_level.txt +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/setup.cfg +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/tests/test_async_fetcher.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/tests/test_config.py +0 -0
- {docpull-1.0.1 → docpull-1.1.0}/tests/test_fetchers.py +0 -0
docpull-1.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: docpull
|
|
3
|
+
Version: 1.1.0
|
|
4
|
+
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
|
+
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
|
+
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/raintree-technology/docpull
|
|
9
|
+
Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/raintree-technology/docpull
|
|
11
|
+
Project-URL: Source Code, https://github.com/raintree-technology/docpull
|
|
12
|
+
Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
|
|
13
|
+
Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
|
|
14
|
+
Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
|
|
15
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
16
|
+
Classifier: Intended Audience :: Developers
|
|
17
|
+
Classifier: Intended Audience :: Information Technology
|
|
18
|
+
Classifier: Intended Audience :: Science/Research
|
|
19
|
+
Classifier: Intended Audience :: Education
|
|
20
|
+
Classifier: Environment :: Console
|
|
21
|
+
Classifier: Topic :: Documentation
|
|
22
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
23
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: HTML
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
26
|
+
Classifier: Topic :: Utilities
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
28
|
+
Classifier: Natural Language :: English
|
|
29
|
+
Classifier: Operating System :: OS Independent
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
32
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
33
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
36
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
37
|
+
Classifier: Typing :: Typed
|
|
38
|
+
Requires-Python: >=3.9
|
|
39
|
+
Description-Content-Type: text/markdown
|
|
40
|
+
License-File: LICENSE
|
|
41
|
+
Requires-Dist: requests>=2.31.0
|
|
42
|
+
Requires-Dist: beautifulsoup4>=4.12.0
|
|
43
|
+
Requires-Dist: html2text>=2020.1.16
|
|
44
|
+
Requires-Dist: defusedxml>=0.7.1
|
|
45
|
+
Requires-Dist: aiohttp>=3.9.0
|
|
46
|
+
Requires-Dist: rich>=13.0.0
|
|
47
|
+
Provides-Extra: yaml
|
|
48
|
+
Requires-Dist: pyyaml>=6.0; extra == "yaml"
|
|
49
|
+
Provides-Extra: js
|
|
50
|
+
Requires-Dist: playwright>=1.40.0; extra == "js"
|
|
51
|
+
Provides-Extra: all
|
|
52
|
+
Requires-Dist: pyyaml>=6.0; extra == "all"
|
|
53
|
+
Requires-Dist: playwright>=1.40.0; extra == "all"
|
|
54
|
+
Provides-Extra: dev
|
|
55
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
56
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
57
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
58
|
+
Requires-Dist: black>=23.0.0; extra == "dev"
|
|
59
|
+
Requires-Dist: mypy>=1.0.0; extra == "dev"
|
|
60
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
61
|
+
Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
62
|
+
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
63
|
+
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
64
|
+
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
65
|
+
Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
|
|
66
|
+
Dynamic: license-file
|
|
67
|
+
|
|
68
|
+
# docpull
|
|
69
|
+
|
|
70
|
+
**Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
|
|
71
|
+
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
72
|
+
|
|
73
|
+
[](https://www.python.org/downloads/)
|
|
74
|
+
[](https://badge.fury.io/py/docpull)
|
|
75
|
+
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
76
|
+
[](https://github.com/psf/black)
|
|
77
|
+
[](http://mypy-lang.org/)
|
|
78
|
+
[](https://github.com/PyCQA/bandit)
|
|
79
|
+
|
|
80
|
+
## Why docpull?
|
|
81
|
+
|
|
82
|
+
Unlike tools such as wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
|
|
83
|
+
|
|
84
|
+
## Key Features
|
|
85
|
+
|
|
86
|
+
- Works on any documentation site
|
|
87
|
+
- Smart extraction of main content
|
|
88
|
+
- Async + parallel fetching (up to 10× faster)
|
|
89
|
+
- Optional JavaScript rendering via Playwright
|
|
90
|
+
- Sitemap + link crawling
|
|
91
|
+
- URL-based filtering (include/exclude)
|
|
92
|
+
- Rate limiting, timeouts, content-type checks
|
|
93
|
+
- Saves docs in structured Markdown with YAML metadata
|
|
94
|
+
- Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
|
|
95
|
+
|
|
96
|
+
## Quick Start
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pip install docpull
|
|
100
|
+
docpull --doctor # verify installation
|
|
101
|
+
docpull https://aptos.dev
|
|
102
|
+
docpull stripe # use a built-in profile
|
|
103
|
+
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### JavaScript-heavy sites
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install docpull[js]
|
|
110
|
+
python -m playwright install chromium
|
|
111
|
+
docpull https://site.com --js
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## Python API
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
from docpull import GenericAsyncFetcher
|
|
118
|
+
|
|
119
|
+
fetcher = GenericAsyncFetcher(
|
|
120
|
+
url_or_profile="https://aptos.dev",
|
|
121
|
+
output_dir="./docs",
|
|
122
|
+
max_pages=100,
|
|
123
|
+
max_concurrent=20,
|
|
124
|
+
)
|
|
125
|
+
fetcher.fetch()
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Common Options
|
|
129
|
+
|
|
130
|
+
- `--doctor` – verify installation and dependencies
|
|
131
|
+
- `--max-pages N` – limit crawl size
|
|
132
|
+
- `--max-depth N` – restrict link depth
|
|
133
|
+
- `--max-concurrent N` – control parallel fetches
|
|
134
|
+
- `--js` – enable Playwright rendering
|
|
135
|
+
- `--output-dir DIR`
|
|
136
|
+
- `--rate-limit X`
|
|
137
|
+
- `--no-skip-existing`
|
|
138
|
+
- `--dry-run`
|
|
139
|
+
|
|
140
|
+
## Performance
|
|
141
|
+
|
|
142
|
+
Async fetching drastically reduces runtime:
|
|
143
|
+
|
|
144
|
+
| Pages | Sync | Async | Speedup |
|
|
145
|
+
|-------|------|-------|---------|
|
|
146
|
+
| 50 | ~50s | ~6s | 8× faster |
|
|
147
|
+
|
|
148
|
+
Higher concurrency yields even better results.
|
|
149
|
+
|
|
150
|
+
## Output Format
|
|
151
|
+
|
|
152
|
+
Each downloaded page becomes a Markdown file:
|
|
153
|
+
|
|
154
|
+
```markdown
|
|
155
|
+
---
|
|
156
|
+
url: https://stripe.com/docs/payments
|
|
157
|
+
fetched: 2025-11-13
|
|
158
|
+
---
|
|
159
|
+
# Payment Intents
|
|
160
|
+
...
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
Directory layout mirrors the target site's structure.
|
|
164
|
+
|
|
165
|
+
## Configuration File (Optional)
|
|
166
|
+
|
|
167
|
+
```yaml
|
|
168
|
+
output_dir: ./docs
|
|
169
|
+
rate_limit: 0.5
|
|
170
|
+
sources:
|
|
171
|
+
- stripe
|
|
172
|
+
- nextjs
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Run with:
|
|
176
|
+
```bash
|
|
177
|
+
docpull --config config.yaml
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## Custom Profiles
|
|
181
|
+
|
|
182
|
+
Easily define profiles for frequently scraped sites.
|
|
183
|
+
|
|
184
|
+
```python
|
|
185
|
+
from docpull.profiles.base import SiteProfile
|
|
186
|
+
|
|
187
|
+
MY_PROFILE = SiteProfile(
|
|
188
|
+
name="mysite",
|
|
189
|
+
domains={"docs.mysite.com"},
|
|
190
|
+
include_patterns=["/docs/", "/api/"],
|
|
191
|
+
)
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
## Security
|
|
195
|
+
|
|
196
|
+
- HTTPS-only
|
|
197
|
+
- Blocks private network IPs
|
|
198
|
+
- 50MB page size limit
|
|
199
|
+
- Timeout controls
|
|
200
|
+
- Validates content-type
|
|
201
|
+
- Playwright sandboxing
|
|
202
|
+
|
|
203
|
+
## Troubleshooting
|
|
204
|
+
|
|
205
|
+
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
206
|
+
- **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
|
|
207
|
+
- **Site requires JS**: install Playwright + `--js`
|
|
208
|
+
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
209
|
+
- **Large sites**: set `--max-pages`
|
|
210
|
+
|
|
211
|
+
For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
|
|
212
|
+
|
|
213
|
+
## Links
|
|
214
|
+
|
|
215
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
216
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
217
|
+
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
218
|
+
|
|
219
|
+
## License
|
|
220
|
+
|
|
221
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
docpull-1.1.0/README.md
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# docpull
|
|
2
|
+
|
|
3
|
+
**Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
|
|
4
|
+
Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
|
|
5
|
+
|
|
6
|
+
[](https://www.python.org/downloads/)
|
|
7
|
+
[](https://badge.fury.io/py/docpull)
|
|
8
|
+
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
9
|
+
[](https://github.com/psf/black)
|
|
10
|
+
[](http://mypy-lang.org/)
|
|
11
|
+
[](https://github.com/PyCQA/bandit)
|
|
12
|
+
|
|
13
|
+
## Why docpull?
|
|
14
|
+
|
|
15
|
+
Unlike tools such as wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
|
|
16
|
+
|
|
17
|
+
## Key Features
|
|
18
|
+
|
|
19
|
+
- Works on any documentation site
|
|
20
|
+
- Smart extraction of main content
|
|
21
|
+
- Async + parallel fetching (up to 10× faster)
|
|
22
|
+
- Optional JavaScript rendering via Playwright
|
|
23
|
+
- Sitemap + link crawling
|
|
24
|
+
- URL-based filtering (include/exclude)
|
|
25
|
+
- Rate limiting, timeouts, content-type checks
|
|
26
|
+
- Saves docs in structured Markdown with YAML metadata
|
|
27
|
+
- Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
|
|
28
|
+
|
|
29
|
+
## Quick Start
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install docpull
|
|
33
|
+
docpull --doctor # verify installation
|
|
34
|
+
docpull https://aptos.dev
|
|
35
|
+
docpull stripe # use a built-in profile
|
|
36
|
+
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
### JavaScript-heavy sites
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install docpull[js]
|
|
43
|
+
python -m playwright install chromium
|
|
44
|
+
docpull https://site.com --js
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Python API
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from docpull import GenericAsyncFetcher
|
|
51
|
+
|
|
52
|
+
fetcher = GenericAsyncFetcher(
|
|
53
|
+
url_or_profile="https://aptos.dev",
|
|
54
|
+
output_dir="./docs",
|
|
55
|
+
max_pages=100,
|
|
56
|
+
max_concurrent=20,
|
|
57
|
+
)
|
|
58
|
+
fetcher.fetch()
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Common Options
|
|
62
|
+
|
|
63
|
+
- `--doctor` – verify installation and dependencies
|
|
64
|
+
- `--max-pages N` – limit crawl size
|
|
65
|
+
- `--max-depth N` – restrict link depth
|
|
66
|
+
- `--max-concurrent N` – control parallel fetches
|
|
67
|
+
- `--js` – enable Playwright rendering
|
|
68
|
+
- `--output-dir DIR`
|
|
69
|
+
- `--rate-limit X`
|
|
70
|
+
- `--no-skip-existing`
|
|
71
|
+
- `--dry-run`
|
|
72
|
+
|
|
73
|
+
## Performance
|
|
74
|
+
|
|
75
|
+
Async fetching drastically reduces runtime:
|
|
76
|
+
|
|
77
|
+
| Pages | Sync | Async | Speedup |
|
|
78
|
+
|-------|------|-------|---------|
|
|
79
|
+
| 50 | ~50s | ~6s | 8× faster |
|
|
80
|
+
|
|
81
|
+
Higher concurrency yields even better results.
|
|
82
|
+
|
|
83
|
+
## Output Format
|
|
84
|
+
|
|
85
|
+
Each downloaded page becomes a Markdown file:
|
|
86
|
+
|
|
87
|
+
```markdown
|
|
88
|
+
---
|
|
89
|
+
url: https://stripe.com/docs/payments
|
|
90
|
+
fetched: 2025-11-13
|
|
91
|
+
---
|
|
92
|
+
# Payment Intents
|
|
93
|
+
...
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
Directory layout mirrors the target site's structure.
|
|
97
|
+
|
|
98
|
+
## Configuration File (Optional)
|
|
99
|
+
|
|
100
|
+
```yaml
|
|
101
|
+
output_dir: ./docs
|
|
102
|
+
rate_limit: 0.5
|
|
103
|
+
sources:
|
|
104
|
+
- stripe
|
|
105
|
+
- nextjs
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
Run with:
|
|
109
|
+
```bash
|
|
110
|
+
docpull --config config.yaml
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Custom Profiles
|
|
114
|
+
|
|
115
|
+
Easily define profiles for frequently scraped sites.
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from docpull.profiles.base import SiteProfile
|
|
119
|
+
|
|
120
|
+
MY_PROFILE = SiteProfile(
|
|
121
|
+
name="mysite",
|
|
122
|
+
domains={"docs.mysite.com"},
|
|
123
|
+
include_patterns=["/docs/", "/api/"],
|
|
124
|
+
)
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
## Security
|
|
128
|
+
|
|
129
|
+
- HTTPS-only
|
|
130
|
+
- Blocks private network IPs
|
|
131
|
+
- 50MB page size limit
|
|
132
|
+
- Timeout controls
|
|
133
|
+
- Validates content-type
|
|
134
|
+
- Playwright sandboxing
|
|
135
|
+
|
|
136
|
+
## Troubleshooting
|
|
137
|
+
|
|
138
|
+
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
139
|
+
- **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
|
|
140
|
+
- **Site requires JS**: install Playwright + `--js`
|
|
141
|
+
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
142
|
+
- **Large sites**: set `--max-pages`
|
|
143
|
+
|
|
144
|
+
For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
|
|
145
|
+
|
|
146
|
+
## Links
|
|
147
|
+
|
|
148
|
+
- [PyPI](https://pypi.org/project/docpull/)
|
|
149
|
+
- [GitHub](https://github.com/raintree-technology/docpull)
|
|
150
|
+
- [Issues](https://github.com/raintree-technology/docpull/issues)
|
|
151
|
+
|
|
152
|
+
## License
|
|
153
|
+
|
|
154
|
+
MIT License - see [LICENSE](LICENSE) file for details
|
|
@@ -3,6 +3,40 @@ import sys
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
|
+
# Handle --doctor before the dependency check below, so users can still
# diagnose a broken installation when third-party imports would fail.
if "--doctor" in sys.argv:
    from .doctor import run_doctor

    # The argument parser is only built later, so the output directory is
    # picked out of sys.argv by hand. Both the "--output-dir PATH" / "-o PATH"
    # and the "--output-dir=PATH" spellings are recognised.
    output_dir = None
    for position, argument in enumerate(sys.argv):
        if argument in ("--output-dir", "-o"):
            if position + 1 < len(sys.argv):
                output_dir = Path(sys.argv[position + 1])
            break
        if argument.startswith("--output-dir="):
            value = argument.split("=", 1)[1]
            if value:
                output_dir = Path(value)
            break
    sys.exit(run_doctor(output_dir=output_dir))

# Verify core dependencies are available; exit with guidance if one is missing.
try:
    import aiohttp  # noqa: F401
    import bs4  # noqa: F401
    import defusedxml  # noqa: F401
    import html2text  # noqa: F401
    import requests  # noqa: F401
    import rich  # noqa: F401
except ImportError as e:
    print(f"\nERROR: Missing required dependency: {e.name}", file=sys.stderr)
    print("\nDocpull requires all core dependencies to be installed.", file=sys.stderr)
    print("\nRecommended fixes:", file=sys.stderr)
    print(" 1. For pipx users: pipx reinstall docpull --force", file=sys.stderr)
    print(" 2. For pip users: pip install --upgrade --force-reinstall docpull", file=sys.stderr)
    print(" 3. For development: pip install -e .[dev]", file=sys.stderr)
    print("\nTo diagnose issues, run: docpull --doctor", file=sys.stderr)
    sys.exit(1)
|
|
39
|
+
|
|
6
40
|
from . import __version__
|
|
7
41
|
from .config import FetcherConfig
|
|
8
42
|
from .fetchers import (
|
|
@@ -185,6 +219,12 @@ Examples:
|
|
|
185
219
|
version=f"%(prog)s {__version__}",
|
|
186
220
|
)
|
|
187
221
|
|
|
222
|
+
parser.add_argument(
|
|
223
|
+
"--doctor",
|
|
224
|
+
action="store_true",
|
|
225
|
+
help="Run diagnostic checks to verify installation",
|
|
226
|
+
)
|
|
227
|
+
|
|
188
228
|
return parser
|
|
189
229
|
|
|
190
230
|
|
|
@@ -200,17 +240,31 @@ def generate_sample_config(output_path: Path) -> None:
|
|
|
200
240
|
# Determine format from extension
|
|
201
241
|
suffix = output_path.suffix.lower()
|
|
202
242
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
243
|
+
try:
|
|
244
|
+
if suffix in [".yaml", ".yml"]:
|
|
245
|
+
config.save_yaml(output_path)
|
|
246
|
+
print(f"Sample YAML config generated: {output_path}")
|
|
247
|
+
elif suffix == ".json":
|
|
248
|
+
config.save_json(output_path)
|
|
249
|
+
print(f"Sample JSON config generated: {output_path}")
|
|
250
|
+
else:
|
|
251
|
+
# Try YAML first, fall back to JSON if PyYAML not available
|
|
252
|
+
try:
|
|
253
|
+
print(f"Warning: Unknown extension {suffix}, generating YAML")
|
|
254
|
+
output_path = output_path.with_suffix(".yaml")
|
|
255
|
+
config.save_yaml(output_path)
|
|
256
|
+
print(f"Sample YAML config generated: {output_path}")
|
|
257
|
+
except ImportError:
|
|
258
|
+
print("PyYAML not installed, generating JSON instead")
|
|
259
|
+
output_path = output_path.with_suffix(".json")
|
|
260
|
+
config.save_json(output_path)
|
|
261
|
+
print(f"Sample JSON config generated: {output_path}")
|
|
262
|
+
except ImportError:
|
|
263
|
+
print("\nERROR: PyYAML is required for YAML config files")
|
|
264
|
+
print("Install it with: pip install docpull[yaml]")
|
|
265
|
+
print("\nAlternatively, use JSON format:")
|
|
266
|
+
print(f" docpull --generate-config {output_path.with_suffix('.json')}")
|
|
267
|
+
raise
|
|
214
268
|
|
|
215
269
|
|
|
216
270
|
def get_config(args: argparse.Namespace) -> FetcherConfig:
|
|
@@ -224,7 +278,17 @@ def get_config(args: argparse.Namespace) -> FetcherConfig:
|
|
|
224
278
|
FetcherConfig instance
|
|
225
279
|
"""
|
|
226
280
|
# Load from config file if provided
|
|
227
|
-
|
|
281
|
+
if args.config:
|
|
282
|
+
try:
|
|
283
|
+
config = FetcherConfig.from_file(args.config)
|
|
284
|
+
except ImportError as e:
|
|
285
|
+
print(f"\nERROR: Error loading config file: {e}")
|
|
286
|
+
if "yaml" in str(e).lower() or "pyyaml" in str(e).lower():
|
|
287
|
+
print("Install PyYAML with: pip install docpull[yaml]")
|
|
288
|
+
print("\nAlternatively, convert your config to JSON format")
|
|
289
|
+
raise
|
|
290
|
+
else:
|
|
291
|
+
config = FetcherConfig()
|
|
228
292
|
|
|
229
293
|
# Override with command-line arguments
|
|
230
294
|
if args.output_dir is not None:
|
|
@@ -411,6 +475,13 @@ def main(argv: Optional[list[str]] = None) -> int:
|
|
|
411
475
|
parser = create_parser()
|
|
412
476
|
args = parser.parse_args(argv)
|
|
413
477
|
|
|
478
|
+
# Handle --doctor
|
|
479
|
+
if args.doctor:
|
|
480
|
+
from .doctor import run_doctor
|
|
481
|
+
|
|
482
|
+
output_dir = Path(args.output_dir) if args.output_dir else None
|
|
483
|
+
return run_doctor(output_dir=output_dir)
|
|
484
|
+
|
|
414
485
|
# Handle --generate-config
|
|
415
486
|
if args.generate_config:
|
|
416
487
|
try:
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Diagnostic tool for verifying docpull installation and dependencies."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from importlib import import_module
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
RICH_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
RICH_AVAILABLE = False
|
|
15
|
+
Console = None # type: ignore
|
|
16
|
+
Table = None # type: ignore
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def check_dependency(
    module_name: str, package_name: Optional[str] = None, optional: bool = False
) -> tuple[bool, str]:
    """
    Check if a Python module is importable.

    Args:
        module_name: Name of the module to import
        package_name: Display name of the package (defaults to module_name)
        optional: Whether this is an optional dependency

    Returns:
        Tuple of (success: bool, message: str). The message starts with
        "[OK]" on success, "[WARN]" for an optional dependency that could not
        be imported, "[MISSING]" for a required dependency that is not
        installed, and "[FAIL]" for a required dependency that is installed
        but raised an error while importing.
    """
    display_name = package_name or module_name

    try:
        import_module(module_name)
    except ImportError:
        if optional:
            return False, f"[WARN] {display_name} (optional - not installed)"
        return False, f"[MISSING] {display_name}"
    except Exception as exc:
        # A present-but-broken package can raise anything at import time;
        # a diagnostic tool must report that rather than crash on it.
        detail = f"{type(exc).__name__}: {exc}"
        if optional:
            return False, f"[WARN] {display_name} (optional - failed to import: {detail})"
        return False, f"[FAIL] {display_name} (failed to import: {detail})"

    return True, f"[OK] {display_name}"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def check_network(host: str = "www.google.com") -> tuple[bool, str]:
    """
    Check basic network connectivity by resolving a hostname.

    Args:
        host: Hostname to resolve. Defaults to "www.google.com"; pass another
            name when that host is not a suitable probe on your network.

    Returns:
        Tuple of (success: bool, message: str)
    """
    # Imported outside the try block so the except clause below can always
    # refer to socket.gaierror.
    import socket

    try:
        socket.gethostbyname(host)
    except socket.gaierror:
        return False, "[FAIL] Network connectivity - DNS resolution failed"
    except Exception as e:
        return False, f"[WARN] Network connectivity - {str(e)}"

    return True, "[OK] Network connectivity"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def check_output_dir(output_dir: Optional[Path] = None) -> tuple[bool, str]:
    """
    Check if output directory is writable.

    The directory is created if it does not exist. Writability is probed with
    a uniquely named temporary file that is removed again, so no existing file
    in the directory is overwritten or deleted.

    Args:
        output_dir: Directory to check (defaults to ./docs)

    Returns:
        Tuple of (success: bool, message: str)
    """
    import tempfile

    test_dir = output_dir or Path("./docs")

    try:
        # Create directory if it doesn't exist
        test_dir.mkdir(parents=True, exist_ok=True)

        # The context manager deletes the probe file even if the write fails.
        with tempfile.NamedTemporaryFile(
            mode="w", dir=test_dir, prefix=".docpull_test"
        ) as probe:
            probe.write("test")
    except PermissionError:
        return False, f"[FAIL] Output directory - permission denied ({test_dir})"
    except Exception as e:
        return False, f"[FAIL] Output directory - {str(e)} ({test_dir})"

    return True, f"[OK] Output directory writable ({test_dir})"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def run_doctor(output_dir: Optional[Path] = None, use_rich: bool = True) -> int:
    """
    Run diagnostic checks and display results.

    Checks the core dependencies, the optional dependencies, network
    connectivity and output-directory writability, prints one section per
    category, then prints a summary. Only the core dependency results affect
    the exit code.

    Args:
        output_dir: Output directory to check for writability
        use_rich: Whether to use rich formatting (if available)

    Returns:
        Exit code (0 if all core dependencies OK, 1 if any core dependency missing)
    """
    # Determine if we can use rich formatting
    use_rich = use_rich and RICH_AVAILABLE

    print("Running docpull diagnostics...\n")

    # Core dependencies: (module to import, package display name)
    core_checks = [
        ("requests", "requests"),
        ("bs4", "beautifulsoup4"),
        ("html2text", "html2text"),
        ("defusedxml", "defusedxml"),
        ("aiohttp", "aiohttp"),
        ("rich", "rich"),
    ]

    # Optional dependencies: (module, package display name, install hint
    # printed in the summary when that dependency is missing)
    optional_checks = [
        ("yaml", "pyyaml", " - YAML config support: pip install docpull[yaml]"),
        ("playwright.async_api", "playwright", " - JavaScript rendering: pip install docpull[js]"),
    ]

    # Other checks
    system_checks = [
        check_network(),
        check_output_dir(output_dir),
    ]

    # Run dependency checks
    core_results = [check_dependency(mod, pkg) for mod, pkg in core_checks]
    optional_results = [
        check_dependency(mod, pkg, True) for mod, pkg, _hint in optional_checks
    ]

    all_checks = {
        "Core Dependencies": core_results,
        "Optional Dependencies": optional_results,
        "System": system_checks,
    }

    # Display results
    if use_rich:
        console = Console()

        for category, results in all_checks.items():
            table = Table(title=category, show_header=False, box=None)
            table.add_column("Status", style="bold")

            for success, message in results:
                # Failed optional checks are shown in yellow, other failures in red.
                style = "green" if success else ("yellow" if "optional" in message else "red")
                table.add_row(message, style=style)

            console.print(table)
            console.print()
    else:
        # Fallback to plain text
        for category, results in all_checks.items():
            print(f"{category}:")
            for _success, message in results:
                print(f" {message}")
            print()

    # Any failed core dependency means a non-zero exit code.
    if any(not success for success, _ in core_results):
        print("\nWARNING: Some core dependencies are missing!")
        print("\nRecommended fixes:")
        print(" 1. For pipx users: pipx reinstall docpull --force")
        print(" 2. For pip users: pip install --upgrade --force-reinstall docpull")
        print(" 3. For development: pip install -e .[dev]")
        return 1

    print("\nAll core dependencies installed correctly!")

    # Only suggest the extras that are actually missing.
    missing_hints = [
        hint
        for (_mod, _pkg, hint), (success, _msg) in zip(optional_checks, optional_results)
        if not success
    ]
    if missing_hints:
        print("\nOptional features available:")
        for hint in missing_hints:
            print(hint)
        print(" - All optional features: pip install docpull[all]")

    return 0
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
if __name__ == "__main__":
|
|
188
|
+
sys.exit(run_doctor())
|