docpull 1.0.2__tar.gz → 1.1.0__tar.gz

This diff compares the contents of two publicly released versions of the package, exactly as they were published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in that registry.
Files changed (47):
  1. {docpull-1.0.2 → docpull-1.1.0}/PKG-INFO +8 -2
  2. {docpull-1.0.2 → docpull-1.1.0}/README.md +6 -0
  3. {docpull-1.0.2 → docpull-1.1.0}/docpull/__init__.py +1 -1
  4. {docpull-1.0.2 → docpull-1.1.0}/docpull/cli.py +83 -12
  5. docpull-1.1.0/docpull/doctor.py +188 -0
  6. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/async_fetcher.py +8 -8
  7. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/base.py +1 -3
  8. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/plaid.py +3 -3
  9. {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/PKG-INFO +8 -2
  10. {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/SOURCES.txt +1 -0
  11. {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/requires.txt +1 -1
  12. {docpull-1.0.2 → docpull-1.1.0}/pyproject.toml +6 -2
  13. {docpull-1.0.2 → docpull-1.1.0}/LICENSE +0 -0
  14. {docpull-1.0.2 → docpull-1.1.0}/docpull/__main__.py +0 -0
  15. {docpull-1.0.2 → docpull-1.1.0}/docpull/config.py +0 -0
  16. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/__init__.py +0 -0
  17. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/bun.py +0 -0
  18. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/d3.py +0 -0
  19. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/generic.py +0 -0
  20. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/generic_async.py +0 -0
  21. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/nextjs.py +0 -0
  22. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/parallel_base.py +0 -0
  23. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/react.py +0 -0
  24. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/stripe.py +0 -0
  25. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/tailwind.py +0 -0
  26. {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/turborepo.py +0 -0
  27. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/__init__.py +0 -0
  28. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/base.py +0 -0
  29. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/bun.py +0 -0
  30. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/d3.py +0 -0
  31. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/nextjs.py +0 -0
  32. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/plaid.py +0 -0
  33. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/react.py +0 -0
  34. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/stripe.py +0 -0
  35. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/tailwind.py +0 -0
  36. {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/turborepo.py +0 -0
  37. {docpull-1.0.2 → docpull-1.1.0}/docpull/py.typed +0 -0
  38. {docpull-1.0.2 → docpull-1.1.0}/docpull/utils/__init__.py +0 -0
  39. {docpull-1.0.2 → docpull-1.1.0}/docpull/utils/file_utils.py +0 -0
  40. {docpull-1.0.2 → docpull-1.1.0}/docpull/utils/logging_config.py +0 -0
  41. {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/dependency_links.txt +0 -0
  42. {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/entry_points.txt +0 -0
  43. {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/top_level.txt +0 -0
  44. {docpull-1.0.2 → docpull-1.1.0}/setup.cfg +0 -0
  45. {docpull-1.0.2 → docpull-1.1.0}/tests/test_async_fetcher.py +0 -0
  46. {docpull-1.0.2 → docpull-1.1.0}/tests/test_config.py +0 -0
  47. {docpull-1.0.2 → docpull-1.1.0}/tests/test_fetchers.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 1.0.2
3
+ Version: 1.1.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -62,7 +62,7 @@ Requires-Dist: bandit>=1.7.0; extra == "dev"
62
62
  Requires-Dist: pip-audit>=2.0.0; extra == "dev"
63
63
  Requires-Dist: types-requests>=2.31.0; extra == "dev"
64
64
  Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
65
- Requires-Dist: types-aiohttp>=3.9.0; extra == "dev"
65
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
66
66
  Dynamic: license-file
67
67
 
68
68
  # docpull
@@ -97,6 +97,7 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
97
97
 
98
98
  ```bash
99
99
  pip install docpull
100
+ docpull --doctor # verify installation
100
101
  docpull https://aptos.dev
101
102
  docpull stripe # use a built-in profile
102
103
  docpull https://site.com/docs --max-pages 100 --max-concurrent 20
@@ -126,6 +127,7 @@ fetcher.fetch()
126
127
 
127
128
  ## Common Options
128
129
 
130
+ - `--doctor` – verify installation and dependencies
129
131
  - `--max-pages N` – limit crawl size
130
132
  - `--max-depth N` – restrict link depth
131
133
  - `--max-concurrent N` – control parallel fetches
@@ -200,10 +202,14 @@ MY_PROFILE = SiteProfile(
200
202
 
201
203
  ## Troubleshooting
202
204
 
205
+ - **Installation issues**: Run `docpull --doctor` to diagnose problems
206
+ - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
203
207
  - **Site requires JS**: install Playwright + `--js`
204
208
  - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
205
209
  - **Large sites**: set `--max-pages`
206
210
 
211
+ For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
212
+
207
213
  ## Links
208
214
 
209
215
  - [PyPI](https://pypi.org/project/docpull/)
@@ -30,6 +30,7 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
30
30
 
31
31
  ```bash
32
32
  pip install docpull
33
+ docpull --doctor # verify installation
33
34
  docpull https://aptos.dev
34
35
  docpull stripe # use a built-in profile
35
36
  docpull https://site.com/docs --max-pages 100 --max-concurrent 20
@@ -59,6 +60,7 @@ fetcher.fetch()
59
60
 
60
61
  ## Common Options
61
62
 
63
+ - `--doctor` – verify installation and dependencies
62
64
  - `--max-pages N` – limit crawl size
63
65
  - `--max-depth N` – restrict link depth
64
66
  - `--max-concurrent N` – control parallel fetches
@@ -133,10 +135,14 @@ MY_PROFILE = SiteProfile(
133
135
 
134
136
  ## Troubleshooting
135
137
 
138
+ - **Installation issues**: Run `docpull --doctor` to diagnose problems
139
+ - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
136
140
  - **Site requires JS**: install Playwright + `--js`
137
141
  - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
138
142
  - **Large sites**: set `--max-pages`
139
143
 
144
+ For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
145
+
140
146
  ## Links
141
147
 
142
148
  - [PyPI](https://pypi.org/project/docpull/)
@@ -1,4 +1,4 @@
1
- __version__ = "1.0.2"
1
+ __version__ = "1.1.0"
2
2
 
3
3
  from .fetchers.base import BaseFetcher
4
4
  from .fetchers.bun import BunFetcher
@@ -3,6 +3,40 @@ import sys
3
3
  from pathlib import Path
4
4
  from typing import Optional
5
5
 
6
+ # Check if --doctor flag is present before checking dependencies
7
+ # This allows users to diagnose issues even when dependencies are missing
8
+ if "--doctor" in sys.argv:
9
+ from .doctor import run_doctor
10
+
11
+ # Parse output dir if provided
12
+ output_dir = None
13
+ if "--output-dir" in sys.argv or "-o" in sys.argv:
14
+ try:
15
+ flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
16
+ if flag_idx + 1 < len(sys.argv):
17
+ output_dir = Path(sys.argv[flag_idx + 1])
18
+ except (ValueError, IndexError):
19
+ pass
20
+ sys.exit(run_doctor(output_dir=output_dir))
21
+
22
+ # Verify core dependencies are available
23
+ try:
24
+ import aiohttp # noqa: F401
25
+ import bs4 # noqa: F401
26
+ import defusedxml # noqa: F401
27
+ import html2text # noqa: F401
28
+ import requests # noqa: F401
29
+ import rich # noqa: F401
30
+ except ImportError as e:
31
+ print(f"\nERROR: Missing required dependency: {e.name}", file=sys.stderr)
32
+ print("\nDocpull requires all core dependencies to be installed.", file=sys.stderr)
33
+ print("\nRecommended fixes:", file=sys.stderr)
34
+ print(" 1. For pipx users: pipx reinstall docpull --force", file=sys.stderr)
35
+ print(" 2. For pip users: pip install --upgrade --force-reinstall docpull", file=sys.stderr)
36
+ print(" 3. For development: pip install -e .[dev]", file=sys.stderr)
37
+ print("\nTo diagnose issues, run: docpull --doctor", file=sys.stderr)
38
+ sys.exit(1)
39
+
6
40
  from . import __version__
7
41
  from .config import FetcherConfig
8
42
  from .fetchers import (
@@ -185,6 +219,12 @@ Examples:
185
219
  version=f"%(prog)s {__version__}",
186
220
  )
187
221
 
222
+ parser.add_argument(
223
+ "--doctor",
224
+ action="store_true",
225
+ help="Run diagnostic checks to verify installation",
226
+ )
227
+
188
228
  return parser
189
229
 
190
230
 
@@ -200,17 +240,31 @@ def generate_sample_config(output_path: Path) -> None:
200
240
  # Determine format from extension
201
241
  suffix = output_path.suffix.lower()
202
242
 
203
- if suffix in [".yaml", ".yml"]:
204
- config.save_yaml(output_path)
205
- print(f"Sample YAML config generated: {output_path}")
206
- elif suffix == ".json":
207
- config.save_json(output_path)
208
- print(f"Sample JSON config generated: {output_path}")
209
- else:
210
- print(f"Warning: Unknown extension {suffix}, generating YAML")
211
- output_path = output_path.with_suffix(".yaml")
212
- config.save_yaml(output_path)
213
- print(f"Sample YAML config generated: {output_path}")
243
+ try:
244
+ if suffix in [".yaml", ".yml"]:
245
+ config.save_yaml(output_path)
246
+ print(f"Sample YAML config generated: {output_path}")
247
+ elif suffix == ".json":
248
+ config.save_json(output_path)
249
+ print(f"Sample JSON config generated: {output_path}")
250
+ else:
251
+ # Try YAML first, fall back to JSON if PyYAML not available
252
+ try:
253
+ print(f"Warning: Unknown extension {suffix}, generating YAML")
254
+ output_path = output_path.with_suffix(".yaml")
255
+ config.save_yaml(output_path)
256
+ print(f"Sample YAML config generated: {output_path}")
257
+ except ImportError:
258
+ print("PyYAML not installed, generating JSON instead")
259
+ output_path = output_path.with_suffix(".json")
260
+ config.save_json(output_path)
261
+ print(f"Sample JSON config generated: {output_path}")
262
+ except ImportError:
263
+ print("\nERROR: PyYAML is required for YAML config files")
264
+ print("Install it with: pip install docpull[yaml]")
265
+ print("\nAlternatively, use JSON format:")
266
+ print(f" docpull --generate-config {output_path.with_suffix('.json')}")
267
+ raise
214
268
 
215
269
 
216
270
  def get_config(args: argparse.Namespace) -> FetcherConfig:
@@ -224,7 +278,17 @@ def get_config(args: argparse.Namespace) -> FetcherConfig:
224
278
  FetcherConfig instance
225
279
  """
226
280
  # Load from config file if provided
227
- config = FetcherConfig.from_file(args.config) if args.config else FetcherConfig()
281
+ if args.config:
282
+ try:
283
+ config = FetcherConfig.from_file(args.config)
284
+ except ImportError as e:
285
+ print(f"\nERROR: Error loading config file: {e}")
286
+ if "yaml" in str(e).lower() or "pyyaml" in str(e).lower():
287
+ print("Install PyYAML with: pip install docpull[yaml]")
288
+ print("\nAlternatively, convert your config to JSON format")
289
+ raise
290
+ else:
291
+ config = FetcherConfig()
228
292
 
229
293
  # Override with command-line arguments
230
294
  if args.output_dir is not None:
@@ -411,6 +475,13 @@ def main(argv: Optional[list[str]] = None) -> int:
411
475
  parser = create_parser()
412
476
  args = parser.parse_args(argv)
413
477
 
478
+ # Handle --doctor
479
+ if args.doctor:
480
+ from .doctor import run_doctor
481
+
482
+ output_dir = Path(args.output_dir) if args.output_dir else None
483
+ return run_doctor(output_dir=output_dir)
484
+
414
485
  # Handle --generate-config
415
486
  if args.generate_config:
416
487
  try:
@@ -0,0 +1,188 @@
1
+ """Diagnostic tool for verifying docpull installation and dependencies."""
2
+
3
+ import sys
4
+ from importlib import import_module
5
+ from pathlib import Path
6
+ from typing import Optional
7
+
8
+ try:
9
+ from rich.console import Console
10
+ from rich.table import Table
11
+
12
+ RICH_AVAILABLE = True
13
+ except ImportError:
14
+ RICH_AVAILABLE = False
15
+ Console = None # type: ignore
16
+ Table = None # type: ignore
17
+
18
+
19
+ def check_dependency(
20
+ module_name: str, package_name: Optional[str] = None, optional: bool = False
21
+ ) -> tuple[bool, str]:
22
+ """
23
+ Check if a Python module is importable.
24
+
25
+ Args:
26
+ module_name: Name of the module to import
27
+ package_name: Display name of the package (defaults to module_name)
28
+ optional: Whether this is an optional dependency
29
+
30
+ Returns:
31
+ Tuple of (success: bool, message: str)
32
+ """
33
+ display_name = package_name or module_name
34
+
35
+ try:
36
+ import_module(module_name)
37
+ return True, f"[OK] {display_name}"
38
+ except ImportError:
39
+ if optional:
40
+ return False, f"[WARN] {display_name} (optional - not installed)"
41
+ else:
42
+ return False, f"[MISSING] {display_name}"
43
+
44
+
45
+ def check_network() -> tuple[bool, str]:
46
+ """
47
+ Check basic network connectivity.
48
+
49
+ Returns:
50
+ Tuple of (success: bool, message: str)
51
+ """
52
+ try:
53
+ import socket
54
+
55
+ # Try to resolve a common DNS name
56
+ socket.gethostbyname("www.google.com")
57
+ return True, "[OK] Network connectivity"
58
+ except socket.gaierror:
59
+ return False, "[FAIL] Network connectivity - DNS resolution failed"
60
+ except Exception as e:
61
+ return False, f"[WARN] Network connectivity - {str(e)}"
62
+
63
+
64
+ def check_output_dir(output_dir: Optional[Path] = None) -> tuple[bool, str]:
65
+ """
66
+ Check if output directory is writable.
67
+
68
+ Args:
69
+ output_dir: Directory to check (defaults to ./docs)
70
+
71
+ Returns:
72
+ Tuple of (success: bool, message: str)
73
+ """
74
+ test_dir = output_dir or Path("./docs")
75
+
76
+ try:
77
+ # Create directory if it doesn't exist
78
+ test_dir.mkdir(parents=True, exist_ok=True)
79
+
80
+ # Try to write a test file
81
+ test_file = test_dir / ".docpull_test"
82
+ test_file.write_text("test")
83
+ test_file.unlink()
84
+
85
+ return True, f"[OK] Output directory writable ({test_dir})"
86
+ except PermissionError:
87
+ return False, f"[FAIL] Output directory - permission denied ({test_dir})"
88
+ except Exception as e:
89
+ return False, f"[FAIL] Output directory - {str(e)} ({test_dir})"
90
+
91
+
92
+ def run_doctor(output_dir: Optional[Path] = None, use_rich: bool = True) -> int:
93
+ """
94
+ Run diagnostic checks and display results.
95
+
96
+ Args:
97
+ output_dir: Output directory to check for writability
98
+ use_rich: Whether to use rich formatting (if available)
99
+
100
+ Returns:
101
+ Exit code (0 if all core dependencies OK, 1 if any core dependency missing)
102
+ """
103
+ # Determine if we can use rich formatting
104
+ use_rich = use_rich and RICH_AVAILABLE
105
+
106
+ print("Running docpull diagnostics...\n")
107
+
108
+ # Core dependencies
109
+ core_checks = [
110
+ ("requests", "requests"),
111
+ ("bs4", "beautifulsoup4"),
112
+ ("html2text", "html2text"),
113
+ ("defusedxml", "defusedxml"),
114
+ ("aiohttp", "aiohttp"),
115
+ ("rich", "rich"),
116
+ ]
117
+
118
+ # Optional dependencies
119
+ optional_checks = [
120
+ ("yaml", "pyyaml", True),
121
+ ("playwright.async_api", "playwright", True),
122
+ ]
123
+
124
+ # Other checks
125
+ system_checks = [
126
+ check_network(),
127
+ check_output_dir(output_dir),
128
+ ]
129
+
130
+ # Run core dependency checks
131
+ core_results = [check_dependency(mod, pkg) for mod, pkg in core_checks]
132
+ optional_results = [check_dependency(mod, pkg, opt) for mod, pkg, opt in optional_checks]
133
+
134
+ all_checks = {
135
+ "Core Dependencies": core_results,
136
+ "Optional Dependencies": optional_results,
137
+ "System": system_checks,
138
+ }
139
+
140
+ # Display results
141
+ if use_rich:
142
+ console = Console()
143
+
144
+ for category, results in all_checks.items():
145
+ table = Table(title=category, show_header=False, box=None)
146
+ table.add_column("Status", style="bold")
147
+
148
+ for success, message in results:
149
+ style = "green" if success else ("yellow" if "optional" in message else "red")
150
+ table.add_row(message, style=style)
151
+
152
+ console.print(table)
153
+ console.print()
154
+ else:
155
+ # Fallback to plain text
156
+ for category, results in all_checks.items():
157
+ print(f"{category}:")
158
+ for _success, message in results:
159
+ print(f" {message}")
160
+ print()
161
+
162
+ # Check if any core dependencies failed
163
+ core_failed = any(not success for success, _ in core_results)
164
+
165
+ # Print summary
166
+ if core_failed:
167
+ print("\nWARNING: Some core dependencies are missing!")
168
+ print("\nRecommended fixes:")
169
+ print(" 1. For pipx users: pipx reinstall docpull --force")
170
+ print(" 2. For pip users: pip install --upgrade --force-reinstall docpull")
171
+ print(" 3. For development: pip install -e .[dev]")
172
+ return 1
173
+ else:
174
+ print("\nAll core dependencies installed correctly!")
175
+
176
+ # Check if optional dependencies are missing
177
+ optional_missing = [msg for success, msg in optional_results if not success]
178
+ if optional_missing:
179
+ print("\nOptional features available:")
180
+ print(" - YAML config support: pip install docpull[yaml]")
181
+ print(" - JavaScript rendering: pip install docpull[js]")
182
+ print(" - All optional features: pip install docpull[all]")
183
+
184
+ return 0
185
+
186
+
187
+ if __name__ == "__main__":
188
+ sys.exit(run_doctor())
@@ -3,7 +3,7 @@
3
3
  import asyncio
4
4
  import time
5
5
  from pathlib import Path
6
- from typing import TYPE_CHECKING, Any, List, Optional, Tuple
6
+ from typing import Any, Optional
7
7
 
8
8
  import aiohttp
9
9
  from bs4 import BeautifulSoup
@@ -12,15 +12,15 @@ from ..utils.file_utils import ensure_dir, validate_output_path
12
12
  from .base import BaseFetcher
13
13
 
14
14
  # Optional Playwright support
15
- if TYPE_CHECKING:
16
- from playwright.async_api import Browser, Page, Playwright
17
-
18
15
  try:
19
- from playwright.async_api import async_playwright
16
+ from playwright.async_api import Browser, Playwright, async_playwright
20
17
 
21
18
  PLAYWRIGHT_AVAILABLE = True
22
19
  except ImportError:
23
20
  PLAYWRIGHT_AVAILABLE = False
21
+ # Fallback types for when playwright is not installed
22
+ Browser = Any # type: ignore[misc,assignment]
23
+ Playwright = Any # type: ignore[misc,assignment]
24
24
 
25
25
 
26
26
  class AsyncFetcher:
@@ -68,8 +68,8 @@ class AsyncFetcher:
68
68
  self.rate_limit_delay = base_fetcher.rate_limit
69
69
 
70
70
  # Browser instance (if using JS)
71
- self.browser: Optional["Browser"] = None
72
- self.playwright: Optional["Playwright"] = None
71
+ self.browser: Optional[Browser] = None # type: ignore[no-any-unimported]
72
+ self.playwright: Optional[Playwright] = None # type: ignore[no-any-unimported]
73
73
 
74
74
  if use_js and not PLAYWRIGHT_AVAILABLE:
75
75
  self.logger.warning("Playwright not installed. Install with: pip install docpull[js]")
@@ -152,7 +152,7 @@ class AsyncFetcher:
152
152
  )
153
153
 
154
154
  # Get rendered HTML
155
- content = await page.content()
155
+ content: str = await page.content()
156
156
 
157
157
  return content
158
158
 
@@ -84,9 +84,7 @@ class BaseFetcher(ABC):
84
84
  self.validator_func = validator_func
85
85
  super().__init__(*args, **kwargs)
86
86
 
87
- def send( # type: ignore[override]
88
- self, request: PreparedRequest, **kwargs: Any
89
- ) -> Response:
87
+ def send(self, request: PreparedRequest, **kwargs: Any) -> Response: # type: ignore[override]
90
88
  if request.url is None:
91
89
  raise ValueError("Request URL is None")
92
90
  if not self.validator_func(request.url):
@@ -57,9 +57,9 @@ class PlaidFetcher(BaseFetcher):
57
57
  sitemap_urls = self.fetch_sitemap(self.sitemap_url)
58
58
 
59
59
  for url in sitemap_urls:
60
- if (
61
- "/docs/" in url or "/api/" in url
62
- ) and not any(x in url for x in ["/blog/", "/resources/", "/company/", "/customers/"]):
60
+ if ("/docs/" in url or "/api/" in url) and not any(
61
+ x in url for x in ["/blog/", "/resources/", "/company/", "/customers/"]
62
+ ):
63
63
  doc_urls.add(url.split("#")[0].split("?")[0])
64
64
 
65
65
  doc_urls_list = sorted(doc_urls)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 1.0.2
3
+ Version: 1.1.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -62,7 +62,7 @@ Requires-Dist: bandit>=1.7.0; extra == "dev"
62
62
  Requires-Dist: pip-audit>=2.0.0; extra == "dev"
63
63
  Requires-Dist: types-requests>=2.31.0; extra == "dev"
64
64
  Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
65
- Requires-Dist: types-aiohttp>=3.9.0; extra == "dev"
65
+ Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
66
66
  Dynamic: license-file
67
67
 
68
68
  # docpull
@@ -97,6 +97,7 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
97
97
 
98
98
  ```bash
99
99
  pip install docpull
100
+ docpull --doctor # verify installation
100
101
  docpull https://aptos.dev
101
102
  docpull stripe # use a built-in profile
102
103
  docpull https://site.com/docs --max-pages 100 --max-concurrent 20
@@ -126,6 +127,7 @@ fetcher.fetch()
126
127
 
127
128
  ## Common Options
128
129
 
130
+ - `--doctor` – verify installation and dependencies
129
131
  - `--max-pages N` – limit crawl size
130
132
  - `--max-depth N` – restrict link depth
131
133
  - `--max-concurrent N` – control parallel fetches
@@ -200,10 +202,14 @@ MY_PROFILE = SiteProfile(
200
202
 
201
203
  ## Troubleshooting
202
204
 
205
+ - **Installation issues**: Run `docpull --doctor` to diagnose problems
206
+ - **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
203
207
  - **Site requires JS**: install Playwright + `--js`
204
208
  - **Slow or rate limited**: lower concurrency or raise `--rate-limit`
205
209
  - **Large sites**: set `--max-pages`
206
210
 
211
+ For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
212
+
207
213
  ## Links
208
214
 
209
215
  - [PyPI](https://pypi.org/project/docpull/)
@@ -5,6 +5,7 @@ docpull/__init__.py
5
5
  docpull/__main__.py
6
6
  docpull/cli.py
7
7
  docpull/config.py
8
+ docpull/doctor.py
8
9
  docpull/py.typed
9
10
  docpull.egg-info/PKG-INFO
10
11
  docpull.egg-info/SOURCES.txt
@@ -20,7 +20,7 @@ bandit>=1.7.0
20
20
  pip-audit>=2.0.0
21
21
  types-requests>=2.31.0
22
22
  types-beautifulsoup4>=4.12.0
23
- types-aiohttp>=3.9.0
23
+ types-defusedxml>=0.7.0
24
24
 
25
25
  [js]
26
26
  playwright>=1.40.0
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "1.0.2"
7
+ version = "1.1.0"
8
8
  description = "Pull documentation from the web and convert to clean markdown"
9
9
  readme = {file = "README.md", content-type = "text/markdown"}
10
10
  requires-python = ">=3.9"
@@ -89,7 +89,7 @@ dev = [
89
89
  "pip-audit>=2.0.0",
90
90
  "types-requests>=2.31.0",
91
91
  "types-beautifulsoup4>=4.12.0",
92
- "types-aiohttp>=3.9.0",
92
+ "types-defusedxml>=0.7.0",
93
93
  ]
94
94
 
95
95
  [project.scripts]
@@ -132,6 +132,10 @@ no_implicit_optional = true
132
132
  strict_equality = true
133
133
  warn_redundant_casts = true
134
134
 
135
+ [[tool.mypy.overrides]]
136
+ module = "playwright.*"
137
+ ignore_missing_imports = true
138
+
135
139
  [tool.pytest.ini_options]
136
140
  minversion = "7.0"
137
141
  testpaths = ["tests"]
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes