docpull 1.0.1__tar.gz → 1.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. docpull-1.1.0/PKG-INFO +221 -0
  2. docpull-1.1.0/README.md +154 -0
  3. {docpull-1.0.1 → docpull-1.1.0}/docpull/__init__.py +1 -1
  4. {docpull-1.0.1 → docpull-1.1.0}/docpull/cli.py +83 -12
  5. docpull-1.1.0/docpull/doctor.py +188 -0
  6. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/async_fetcher.py +8 -8
  7. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/base.py +1 -3
  8. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/plaid.py +3 -3
  9. docpull-1.1.0/docpull.egg-info/PKG-INFO +221 -0
  10. {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/SOURCES.txt +1 -0
  11. {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/requires.txt +1 -1
  12. {docpull-1.0.1 → docpull-1.1.0}/pyproject.toml +6 -2
  13. docpull-1.0.1/PKG-INFO +0 -440
  14. docpull-1.0.1/README.md +0 -373
  15. docpull-1.0.1/docpull.egg-info/PKG-INFO +0 -440
  16. {docpull-1.0.1 → docpull-1.1.0}/LICENSE +0 -0
  17. {docpull-1.0.1 → docpull-1.1.0}/docpull/__main__.py +0 -0
  18. {docpull-1.0.1 → docpull-1.1.0}/docpull/config.py +0 -0
  19. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/__init__.py +0 -0
  20. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/bun.py +0 -0
  21. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/d3.py +0 -0
  22. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/generic.py +0 -0
  23. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/generic_async.py +0 -0
  24. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/nextjs.py +0 -0
  25. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/parallel_base.py +0 -0
  26. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/react.py +0 -0
  27. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/stripe.py +0 -0
  28. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/tailwind.py +0 -0
  29. {docpull-1.0.1 → docpull-1.1.0}/docpull/fetchers/turborepo.py +0 -0
  30. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/__init__.py +0 -0
  31. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/base.py +0 -0
  32. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/bun.py +0 -0
  33. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/d3.py +0 -0
  34. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/nextjs.py +0 -0
  35. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/plaid.py +0 -0
  36. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/react.py +0 -0
  37. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/stripe.py +0 -0
  38. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/tailwind.py +0 -0
  39. {docpull-1.0.1 → docpull-1.1.0}/docpull/profiles/turborepo.py +0 -0
  40. {docpull-1.0.1 → docpull-1.1.0}/docpull/py.typed +0 -0
  41. {docpull-1.0.1 → docpull-1.1.0}/docpull/utils/__init__.py +0 -0
  42. {docpull-1.0.1 → docpull-1.1.0}/docpull/utils/file_utils.py +0 -0
  43. {docpull-1.0.1 → docpull-1.1.0}/docpull/utils/logging_config.py +0 -0
  44. {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/dependency_links.txt +0 -0
  45. {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/entry_points.txt +0 -0
  46. {docpull-1.0.1 → docpull-1.1.0}/docpull.egg-info/top_level.txt +0 -0
  47. {docpull-1.0.1 → docpull-1.1.0}/setup.cfg +0 -0
  48. {docpull-1.0.1 → docpull-1.1.0}/tests/test_async_fetcher.py +0 -0
  49. {docpull-1.0.1 → docpull-1.1.0}/tests/test_config.py +0 -0
  50. {docpull-1.0.1 → docpull-1.1.0}/tests/test_fetchers.py +0 -0
docpull-1.1.0/PKG-INFO ADDED
@@ -0,0 +1,221 @@
1
+ Metadata-Version: 2.4
2
+ Name: docpull
3
+ Version: 1.1.0
4
+ Summary: Pull documentation from the web and convert to clean markdown
5
+ Author-email: Zachary Roth <support@raintree.technology>
6
+ Maintainer-email: Raintree Technology <support@raintree.technology>
7
+ License-Expression: MIT
8
+ Project-URL: Homepage, https://github.com/raintree-technology/docpull
9
+ Project-URL: Documentation, https://github.com/raintree-technology/docpull#readme
10
+ Project-URL: Repository, https://github.com/raintree-technology/docpull
11
+ Project-URL: Source Code, https://github.com/raintree-technology/docpull
12
+ Project-URL: Bug Tracker, https://github.com/raintree-technology/docpull/issues
13
+ Project-URL: Changelog, https://github.com/raintree-technology/docpull/blob/main/CHANGELOG.md
14
+ Keywords: python,markdown,documentation,web-scraping,developer-tools,claude,ai-training-data
15
+ Classifier: Development Status :: 5 - Production/Stable
16
+ Classifier: Intended Audience :: Developers
17
+ Classifier: Intended Audience :: Information Technology
18
+ Classifier: Intended Audience :: Science/Research
19
+ Classifier: Intended Audience :: Education
20
+ Classifier: Environment :: Console
21
+ Classifier: Topic :: Documentation
22
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
23
+ Classifier: Topic :: Software Development :: Documentation
24
+ Classifier: Topic :: Text Processing :: Markup :: HTML
25
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
26
+ Classifier: Topic :: Utilities
27
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
28
+ Classifier: Natural Language :: English
29
+ Classifier: Operating System :: OS Independent
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: Programming Language :: Python :: 3.9
32
+ Classifier: Programming Language :: Python :: 3.10
33
+ Classifier: Programming Language :: Python :: 3.11
34
+ Classifier: Programming Language :: Python :: 3.12
35
+ Classifier: Programming Language :: Python :: 3.13
36
+ Classifier: Programming Language :: Python :: 3 :: Only
37
+ Classifier: Typing :: Typed
38
+ Requires-Python: >=3.9
39
+ Description-Content-Type: text/markdown
40
+ License-File: LICENSE
41
+ Requires-Dist: requests>=2.31.0
42
+ Requires-Dist: beautifulsoup4>=4.12.0
43
+ Requires-Dist: html2text>=2020.1.16
44
+ Requires-Dist: defusedxml>=0.7.1
45
+ Requires-Dist: aiohttp>=3.9.0
46
+ Requires-Dist: rich>=13.0.0
47
+ Provides-Extra: yaml
48
+ Requires-Dist: pyyaml>=6.0; extra == "yaml"
49
+ Provides-Extra: js
50
+ Requires-Dist: playwright>=1.40.0; extra == "js"
51
+ Provides-Extra: all
52
+ Requires-Dist: pyyaml>=6.0; extra == "all"
53
+ Requires-Dist: playwright>=1.40.0; extra == "all"
54
+ Provides-Extra: dev
55
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
56
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
57
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
58
+ Requires-Dist: black>=23.0.0; extra == "dev"
59
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
60
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
61
+ Requires-Dist: bandit>=1.7.0; extra == "dev"
62
+ Requires-Dist: pip-audit>=2.0.0; extra == "dev"
63
+ Requires-Dist: types-requests>=2.31.0; extra == "dev"
64
+ Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
65
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
66
+ Dynamic: license-file
67
+
68
+ # docpull
69
+
70
+ **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
71
+ Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
72
+
73
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
74
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
75
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
76
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
77
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
78
+ [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
79
+
80
+ ## Why docpull?
81
+
82
+ Unlike tools like wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
83
+
84
+ ## Key Features
85
+
86
+ - Works on any documentation site
87
+ - Smart extraction of main content
88
+ - Async + parallel fetching (up to 10× faster)
89
+ - Optional JavaScript rendering via Playwright
90
+ - Sitemap + link crawling
91
+ - URL-based filtering (include/exclude)
92
+ - Rate limiting, timeouts, content-type checks
93
+ - Saves docs in structured Markdown with YAML metadata
94
+ - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
95
+
96
+ ## Quick Start
97
+
98
+ ```bash
99
+ pip install docpull
100
+ docpull --doctor # verify installation
101
+ docpull https://aptos.dev
102
+ docpull stripe # use a built-in profile
103
+ docpull https://site.com/docs --max-pages 100 --max-concurrent 20
104
+ ```
105
+
106
+ ### JavaScript-heavy sites
107
+
108
+ ```bash
109
+ pip install docpull[js]
110
+ python -m playwright install chromium
111
+ docpull https://site.com --js
112
+ ```
113
+
114
+ ## Python API
115
+
116
+ ```python
117
+ from docpull import GenericAsyncFetcher
118
+
119
+ fetcher = GenericAsyncFetcher(
120
+ url_or_profile="https://aptos.dev",
121
+ output_dir="./docs",
122
+ max_pages=100,
123
+ max_concurrent=20,
124
+ )
125
+ fetcher.fetch()
126
+ ```
127
+
128
+ ## Common Options
129
+
130
+ - `--doctor` – verify installation and dependencies
131
+ - `--max-pages N` – limit crawl size
132
+ - `--max-depth N` – restrict link depth
133
+ - `--max-concurrent N` – control parallel fetches
134
+ - `--js` – enable Playwright rendering
135
+ - `--output-dir DIR`
136
+ - `--rate-limit X`
137
+ - `--no-skip-existing`
138
+ - `--dry-run`
139
+
140
+ ## Performance
141
+
142
+ Async fetching drastically reduces runtime:
143
+
144
+ | Pages | Sync | Async | Speedup |
145
+ |-------|------|-------|---------|
146
+ | 50 | ~50s | ~6s | 8× faster |
147
+
148
+ Higher concurrency yields even better results.
149
+
150
+ ## Output Format
151
+
152
+ Each downloaded page becomes a Markdown file:
153
+
154
+ ```markdown
155
+ ---
156
+ url: https://stripe.com/docs/payments
157
+ fetched: 2025-11-13
158
+ ---
159
+ # Payment Intents
160
+ ...
161
+ ```
162
+
163
+ Directory layout mirrors the target site's structure.
164
+
165
+ ## Configuration File (Optional)
166
+
167
+ ```yaml
168
+ output_dir: ./docs
169
+ rate_limit: 0.5
170
+ sources:
171
+ - stripe
172
+ - nextjs
173
+ ```
174
+
175
+ Run with:
176
+ ```bash
177
+ docpull --config config.yaml
178
+ ```
179
+
180
+ ## Custom Profiles
181
+
182
+ Easily define profiles for frequently scraped sites.
183
+
184
+ ```python
185
+ from docpull.profiles.base import SiteProfile
186
+
187
+ MY_PROFILE = SiteProfile(
188
+ name="mysite",
189
+ domains={"docs.mysite.com"},
190
+ include_patterns=["/docs/", "/api/"],
191
+ )
192
+ ```
193
+
194
+ ## Security
195
+
196
+ - HTTPS-only
197
+ - Blocks private network IPs
198
+ - 50MB page size limit
199
+ - Timeout controls
200
+ - Validates content-type
201
+ - Playwright sandboxing
202
+
203
+ ## Troubleshooting
204
+
205
+ - **Installation issues**: Run `docpull --doctor` to diagnose problems
206
+ - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
207
+ - **Site requires JS**: install Playwright + `--js`
208
+ - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
209
+ - **Large sites**: set `--max-pages`
210
+
211
+ For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
212
+
213
+ ## Links
214
+
215
+ - [PyPI](https://pypi.org/project/docpull/)
216
+ - [GitHub](https://github.com/raintree-technology/docpull)
217
+ - [Issues](https://github.com/raintree-technology/docpull/issues)
218
+
219
+ ## License
220
+
221
+ MIT License - see [LICENSE](LICENSE) file for details
@@ -0,0 +1,154 @@
1
+ # docpull
2
+
3
+ **Pulls documentation from any website and converts it into clean, AI-ready Markdown.**
4
+ Fast, type-safe, secure, and optimized for building knowledge bases or training datasets.
5
+
6
+ [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
7
+ [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
8
+ [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
9
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
10
+ [![Type checked: mypy](https://img.shields.io/badge/type%20checked-mypy-blue.svg)](http://mypy-lang.org/)
11
+ [![Security: bandit](https://img.shields.io/badge/security-bandit-yellow.svg)](https://github.com/PyCQA/bandit)
12
+
13
+ ## Why docpull?
14
+
15
+ Unlike tools like wget or httrack, docpull extracts only the main content, removing ads, navbars, and clutter. Output is clean Markdown with optional YAML frontmatter—ideal for RAG systems, offline docs, or ML pipelines.
16
+
17
+ ## Key Features
18
+
19
+ - Works on any documentation site
20
+ - Smart extraction of main content
21
+ - Async + parallel fetching (up to 10× faster)
22
+ - Optional JavaScript rendering via Playwright
23
+ - Sitemap + link crawling
24
+ - URL-based filtering (include/exclude)
25
+ - Rate limiting, timeouts, content-type checks
26
+ - Saves docs in structured Markdown with YAML metadata
27
+ - Optimized profiles for popular platforms (Stripe, Next.js, React, Plaid, Tailwind, etc.)
28
+
29
+ ## Quick Start
30
+
31
+ ```bash
32
+ pip install docpull
33
+ docpull --doctor # verify installation
34
+ docpull https://aptos.dev
35
+ docpull stripe # use a built-in profile
36
+ docpull https://site.com/docs --max-pages 100 --max-concurrent 20
37
+ ```
38
+
39
+ ### JavaScript-heavy sites
40
+
41
+ ```bash
42
+ pip install docpull[js]
43
+ python -m playwright install chromium
44
+ docpull https://site.com --js
45
+ ```
46
+
47
+ ## Python API
48
+
49
+ ```python
50
+ from docpull import GenericAsyncFetcher
51
+
52
+ fetcher = GenericAsyncFetcher(
53
+ url_or_profile="https://aptos.dev",
54
+ output_dir="./docs",
55
+ max_pages=100,
56
+ max_concurrent=20,
57
+ )
58
+ fetcher.fetch()
59
+ ```
60
+
61
+ ## Common Options
62
+
63
+ - `--doctor` – verify installation and dependencies
64
+ - `--max-pages N` – limit crawl size
65
+ - `--max-depth N` – restrict link depth
66
+ - `--max-concurrent N` – control parallel fetches
67
+ - `--js` – enable Playwright rendering
68
+ - `--output-dir DIR`
69
+ - `--rate-limit X`
70
+ - `--no-skip-existing`
71
+ - `--dry-run`
72
+
73
+ ## Performance
74
+
75
+ Async fetching drastically reduces runtime:
76
+
77
+ | Pages | Sync | Async | Speedup |
78
+ |-------|------|-------|---------|
79
+ | 50 | ~50s | ~6s | 8× faster |
80
+
81
+ Higher concurrency yields even better results.
82
+
83
+ ## Output Format
84
+
85
+ Each downloaded page becomes a Markdown file:
86
+
87
+ ```markdown
88
+ ---
89
+ url: https://stripe.com/docs/payments
90
+ fetched: 2025-11-13
91
+ ---
92
+ # Payment Intents
93
+ ...
94
+ ```
95
+
96
+ Directory layout mirrors the target site's structure.
97
+
98
+ ## Configuration File (Optional)
99
+
100
+ ```yaml
101
+ output_dir: ./docs
102
+ rate_limit: 0.5
103
+ sources:
104
+ - stripe
105
+ - nextjs
106
+ ```
107
+
108
+ Run with:
109
+ ```bash
110
+ docpull --config config.yaml
111
+ ```
112
+
113
+ ## Custom Profiles
114
+
115
+ Easily define profiles for frequently scraped sites.
116
+
117
+ ```python
118
+ from docpull.profiles.base import SiteProfile
119
+
120
+ MY_PROFILE = SiteProfile(
121
+ name="mysite",
122
+ domains={"docs.mysite.com"},
123
+ include_patterns=["/docs/", "/api/"],
124
+ )
125
+ ```
126
+
127
+ ## Security
128
+
129
+ - HTTPS-only
130
+ - Blocks private network IPs
131
+ - 50MB page size limit
132
+ - Timeout controls
133
+ - Validates content-type
134
+ - Playwright sandboxing
135
+
136
+ ## Troubleshooting
137
+
138
+ - **Installation issues**: Run `docpull --doctor` to diagnose problems
139
+ - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
140
+ - **Site requires JS**: install Playwright + `--js`
141
+ - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
142
+ - **Large sites**: set `--max-pages`
143
+
144
+ For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
145
+
146
+ ## Links
147
+
148
+ - [PyPI](https://pypi.org/project/docpull/)
149
+ - [GitHub](https://github.com/raintree-technology/docpull)
150
+ - [Issues](https://github.com/raintree-technology/docpull/issues)
151
+
152
+ ## License
153
+
154
+ MIT License - see [LICENSE](LICENSE) file for details
@@ -1,4 +1,4 @@
1
- __version__ = "1.0.1"
1
+ __version__ = "1.1.0"
2
2
 
3
3
  from .fetchers.base import BaseFetcher
4
4
  from .fetchers.bun import BunFetcher
@@ -3,6 +3,40 @@ import sys
3
3
  from pathlib import Path
4
4
  from typing import Optional
5
5
 
6
+ # Check if --doctor flag is present before checking dependencies
7
+ # This allows users to diagnose issues even when dependencies are missing
8
+ if "--doctor" in sys.argv:
9
+ from .doctor import run_doctor
10
+
11
+ # Parse output dir if provided
12
+ output_dir = None
13
+ if "--output-dir" in sys.argv or "-o" in sys.argv:
14
+ try:
15
+ flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
16
+ if flag_idx + 1 < len(sys.argv):
17
+ output_dir = Path(sys.argv[flag_idx + 1])
18
+ except (ValueError, IndexError):
19
+ pass
20
+ sys.exit(run_doctor(output_dir=output_dir))
21
+
22
+ # Verify core dependencies are available
23
+ try:
24
+ import aiohttp # noqa: F401
25
+ import bs4 # noqa: F401
26
+ import defusedxml # noqa: F401
27
+ import html2text # noqa: F401
28
+ import requests # noqa: F401
29
+ import rich # noqa: F401
30
+ except ImportError as e:
31
+ print(f"\nERROR: Missing required dependency: {e.name}", file=sys.stderr)
32
+ print("\nDocpull requires all core dependencies to be installed.", file=sys.stderr)
33
+ print("\nRecommended fixes:", file=sys.stderr)
34
+ print(" 1. For pipx users: pipx reinstall docpull --force", file=sys.stderr)
35
+ print(" 2. For pip users: pip install --upgrade --force-reinstall docpull", file=sys.stderr)
36
+ print(" 3. For development: pip install -e .[dev]", file=sys.stderr)
37
+ print("\nTo diagnose issues, run: docpull --doctor", file=sys.stderr)
38
+ sys.exit(1)
39
+
6
40
  from . import __version__
7
41
  from .config import FetcherConfig
8
42
  from .fetchers import (
@@ -185,6 +219,12 @@ Examples:
185
219
  version=f"%(prog)s {__version__}",
186
220
  )
187
221
 
222
+ parser.add_argument(
223
+ "--doctor",
224
+ action="store_true",
225
+ help="Run diagnostic checks to verify installation",
226
+ )
227
+
188
228
  return parser
189
229
 
190
230
 
@@ -200,17 +240,31 @@ def generate_sample_config(output_path: Path) -> None:
200
240
  # Determine format from extension
201
241
  suffix = output_path.suffix.lower()
202
242
 
203
- if suffix in [".yaml", ".yml"]:
204
- config.save_yaml(output_path)
205
- print(f"Sample YAML config generated: {output_path}")
206
- elif suffix == ".json":
207
- config.save_json(output_path)
208
- print(f"Sample JSON config generated: {output_path}")
209
- else:
210
- print(f"Warning: Unknown extension {suffix}, generating YAML")
211
- output_path = output_path.with_suffix(".yaml")
212
- config.save_yaml(output_path)
213
- print(f"Sample YAML config generated: {output_path}")
243
+ try:
244
+ if suffix in [".yaml", ".yml"]:
245
+ config.save_yaml(output_path)
246
+ print(f"Sample YAML config generated: {output_path}")
247
+ elif suffix == ".json":
248
+ config.save_json(output_path)
249
+ print(f"Sample JSON config generated: {output_path}")
250
+ else:
251
+ # Try YAML first, fall back to JSON if PyYAML not available
252
+ try:
253
+ print(f"Warning: Unknown extension {suffix}, generating YAML")
254
+ output_path = output_path.with_suffix(".yaml")
255
+ config.save_yaml(output_path)
256
+ print(f"Sample YAML config generated: {output_path}")
257
+ except ImportError:
258
+ print("PyYAML not installed, generating JSON instead")
259
+ output_path = output_path.with_suffix(".json")
260
+ config.save_json(output_path)
261
+ print(f"Sample JSON config generated: {output_path}")
262
+ except ImportError:
263
+ print("\nERROR: PyYAML is required for YAML config files")
264
+ print("Install it with: pip install docpull[yaml]")
265
+ print("\nAlternatively, use JSON format:")
266
+ print(f" docpull --generate-config {output_path.with_suffix('.json')}")
267
+ raise
214
268
 
215
269
 
216
270
  def get_config(args: argparse.Namespace) -> FetcherConfig:
@@ -224,7 +278,17 @@ def get_config(args: argparse.Namespace) -> FetcherConfig:
224
278
  FetcherConfig instance
225
279
  """
226
280
  # Load from config file if provided
227
- config = FetcherConfig.from_file(args.config) if args.config else FetcherConfig()
281
+ if args.config:
282
+ try:
283
+ config = FetcherConfig.from_file(args.config)
284
+ except ImportError as e:
285
+ print(f"\nERROR: Error loading config file: {e}")
286
+ if "yaml" in str(e).lower() or "pyyaml" in str(e).lower():
287
+ print("Install PyYAML with: pip install docpull[yaml]")
288
+ print("\nAlternatively, convert your config to JSON format")
289
+ raise
290
+ else:
291
+ config = FetcherConfig()
228
292
 
229
293
  # Override with command-line arguments
230
294
  if args.output_dir is not None:
@@ -411,6 +475,13 @@ def main(argv: Optional[list[str]] = None) -> int:
411
475
  parser = create_parser()
412
476
  args = parser.parse_args(argv)
413
477
 
478
+ # Handle --doctor
479
+ if args.doctor:
480
+ from .doctor import run_doctor
481
+
482
+ output_dir = Path(args.output_dir) if args.output_dir else None
483
+ return run_doctor(output_dir=output_dir)
484
+
414
485
  # Handle --generate-config
415
486
  if args.generate_config:
416
487
  try:
@@ -0,0 +1,188 @@
1
+ """Diagnostic tool for verifying docpull installation and dependencies."""
2
+
3
+ import sys
4
+ from importlib import import_module
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ try:
9
+ from rich.console import Console
10
+ from rich.table import Table
11
+
12
+ RICH_AVAILABLE = True
13
+ except ImportError:
14
+ RICH_AVAILABLE = False
15
+ Console = None # type: ignore
16
+ Table = None # type: ignore
17
+
18
+
19
def check_dependency(
    module_name: str, package_name: Optional[str] = None, optional: bool = False
) -> tuple[bool, str]:
    """
    Check if a Python module is importable.

    Args:
        module_name: Name of the module to import
        package_name: Display name of the package (defaults to module_name)
        optional: Whether this is an optional dependency

    Returns:
        Tuple of (success: bool, message: str). The message starts with
        "[OK]" on success, "[MISSING]" when a required module is not
        installed, "[FAIL]" when a required module is installed but raises
        while being imported, and "[WARN]" for any problem with an optional
        module.
    """
    display_name = package_name or module_name

    try:
        import_module(module_name)
    except ImportError:
        # Module (or one of its own imports) could not be found.
        if optional:
            return False, f"[WARN] {display_name} (optional - not installed)"
        return False, f"[MISSING] {display_name}"
    except Exception as e:
        # A present-but-broken package can raise anything at import time.
        # This is a diagnostic tool, so report the failure instead of
        # crashing. The word "optional" is kept in the optional message
        # because the rich renderer picks the yellow style from it.
        detail = f"import failed: {type(e).__name__}: {e}"
        if optional:
            return False, f"[WARN] {display_name} (optional - {detail})"
        return False, f"[FAIL] {display_name} ({detail})"

    return True, f"[OK] {display_name}"
43
+
44
+
45
def check_network(host: str = "www.google.com") -> tuple[bool, str]:
    """
    Check basic network connectivity by resolving a host name.

    Only name resolution is attempted; no connection is opened.

    Args:
        host: Host name to resolve (defaults to a well-known public name)

    Returns:
        Tuple of (success: bool, message: str)
    """
    # Imported before the try block so the except clause below can always
    # resolve socket.gaierror.
    import socket

    try:
        # getaddrinfo handles both IPv4 and IPv6, whereas gethostbyname is
        # IPv4-only and would report a false failure on IPv6-only networks.
        socket.getaddrinfo(host, None)
    except socket.gaierror:
        return False, "[FAIL] Network connectivity - DNS resolution failed"
    except Exception as e:
        return False, f"[WARN] Network connectivity - {str(e)}"

    return True, "[OK] Network connectivity"
62
+
63
+
64
def check_output_dir(output_dir: Optional[Path] = None) -> tuple[bool, str]:
    """
    Check if output directory is writable.

    The directory (and any missing parents) is created if it does not exist
    and is left in place afterwards. Writability is probed with a uniquely
    named temporary file that is always removed, so existing files in the
    directory are never overwritten or deleted.

    Args:
        output_dir: Directory to check (defaults to ./docs)

    Returns:
        Tuple of (success: bool, message: str)
    """
    import tempfile

    test_dir = output_dir or Path("./docs")

    try:
        # Create directory if it doesn't exist
        test_dir.mkdir(parents=True, exist_ok=True)

        # A unique probe name avoids clobbering a user's file, and the
        # context manager removes the probe even if the write fails.
        with tempfile.NamedTemporaryFile(
            mode="w", dir=test_dir, prefix=".docpull_test_"
        ) as probe:
            probe.write("test")
            probe.flush()
    except PermissionError:
        return False, f"[FAIL] Output directory - permission denied ({test_dir})"
    except Exception as e:
        return False, f"[FAIL] Output directory - {str(e)} ({test_dir})"

    return True, f"[OK] Output directory writable ({test_dir})"
90
+
91
+
92
def run_doctor(output_dir: Optional[Path] = None, use_rich: bool = True) -> int:
    """
    Run every diagnostic check, print a report, and return an exit code.

    Args:
        output_dir: Output directory to check for writability
        use_rich: Whether to use rich formatting (if available)

    Returns:
        Exit code (0 if all core dependencies OK, 1 if any core dependency missing)
    """
    # Rich output needs both the caller's consent and a successful import.
    rich_enabled = use_rich and RICH_AVAILABLE

    print("Running docpull diagnostics...\n")

    # (module to import, package name to display)
    core_specs = [
        ("requests", "requests"),
        ("bs4", "beautifulsoup4"),
        ("html2text", "html2text"),
        ("defusedxml", "defusedxml"),
        ("aiohttp", "aiohttp"),
        ("rich", "rich"),
    ]

    # (module to import, package name to display, optional flag)
    optional_specs = [
        ("yaml", "pyyaml", True),
        ("playwright.async_api", "playwright", True),
    ]

    # System probes run before the import checks.
    system_results = [
        check_network(),
        check_output_dir(output_dir),
    ]
    core_results = [check_dependency(module, package) for module, package in core_specs]
    optional_results = [
        check_dependency(module, package, is_optional)
        for module, package, is_optional in optional_specs
    ]

    report = {
        "Core Dependencies": core_results,
        "Optional Dependencies": optional_results,
        "System": system_results,
    }

    if rich_enabled:
        console = Console()

        for heading, entries in report.items():
            table = Table(title=heading, show_header=False, box=None)
            table.add_column("Status", style="bold")

            for ok, text in entries:
                # Green for success, yellow for optional problems, red otherwise.
                if ok:
                    colour = "green"
                elif "optional" in text:
                    colour = "yellow"
                else:
                    colour = "red"
                table.add_row(text, style=colour)

            console.print(table)
            console.print()
    else:
        # Plain-text fallback when rich is unavailable or disabled.
        for heading, entries in report.items():
            print(f"{heading}:")
            for _ok, text in entries:
                print(f" {text}")
            print()

    # Only core dependency failures affect the exit code.
    if not all(ok for ok, _ in core_results):
        print("\nWARNING: Some core dependencies are missing!")
        print("\nRecommended fixes:")
        print(" 1. For pipx users: pipx reinstall docpull --force")
        print(" 2. For pip users: pip install --upgrade --force-reinstall docpull")
        print(" 3. For development: pip install -e .[dev]")
        return 1

    print("\nAll core dependencies installed correctly!")

    # Point at the extras when at least one optional dependency is absent.
    if not all(ok for ok, _ in optional_results):
        print("\nOptional features available:")
        print(" - YAML config support: pip install docpull[yaml]")
        print(" - JavaScript rendering: pip install docpull[js]")
        print(" - All optional features: pip install docpull[all]")

    return 0
185
+
186
+
187
+ if __name__ == "__main__":
188
+ sys.exit(run_doctor())