docpull 1.0.2__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-1.0.2 → docpull-1.1.0}/PKG-INFO +8 -2
- {docpull-1.0.2 → docpull-1.1.0}/README.md +6 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/__init__.py +1 -1
- {docpull-1.0.2 → docpull-1.1.0}/docpull/cli.py +83 -12
- docpull-1.1.0/docpull/doctor.py +188 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/async_fetcher.py +8 -8
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/base.py +1 -3
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/plaid.py +3 -3
- {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/PKG-INFO +8 -2
- {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/SOURCES.txt +1 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/requires.txt +1 -1
- {docpull-1.0.2 → docpull-1.1.0}/pyproject.toml +6 -2
- {docpull-1.0.2 → docpull-1.1.0}/LICENSE +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/__main__.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/config.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/__init__.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/bun.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/d3.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/generic.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/generic_async.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/nextjs.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/parallel_base.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/react.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/stripe.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/tailwind.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/fetchers/turborepo.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/__init__.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/base.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/bun.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/d3.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/nextjs.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/plaid.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/react.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/stripe.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/tailwind.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/profiles/turborepo.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/py.typed +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/utils/__init__.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/utils/file_utils.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull/utils/logging_config.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/entry_points.txt +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/docpull.egg-info/top_level.txt +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/setup.cfg +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/tests/test_async_fetcher.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/tests/test_config.py +0 -0
- {docpull-1.0.2 → docpull-1.1.0}/tests/test_fetchers.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -62,7 +62,7 @@ Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
|
62
62
|
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
63
63
|
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
64
64
|
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
65
|
-
Requires-Dist: types-
|
|
65
|
+
Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
|
|
66
66
|
Dynamic: license-file
|
|
67
67
|
|
|
68
68
|
# docpull
|
|
@@ -97,6 +97,7 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
97
97
|
|
|
98
98
|
```bash
|
|
99
99
|
pip install docpull
|
|
100
|
+
docpull --doctor # verify installation
|
|
100
101
|
docpull https://aptos.dev
|
|
101
102
|
docpull stripe # use a built-in profile
|
|
102
103
|
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
@@ -126,6 +127,7 @@ fetcher.fetch()
|
|
|
126
127
|
|
|
127
128
|
## Common Options
|
|
128
129
|
|
|
130
|
+
- `--doctor` – verify installation and dependencies
|
|
129
131
|
- `--max-pages N` – limit crawl size
|
|
130
132
|
- `--max-depth N` – restrict link depth
|
|
131
133
|
- `--max-concurrent N` – control parallel fetches
|
|
@@ -200,10 +202,14 @@ MY_PROFILE = SiteProfile(
|
|
|
200
202
|
|
|
201
203
|
## Troubleshooting
|
|
202
204
|
|
|
205
|
+
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
206
|
+
- **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
|
|
203
207
|
- **Site requires JS**: install Playwright + `--js`
|
|
204
208
|
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
205
209
|
- **Large sites**: set `--max-pages`
|
|
206
210
|
|
|
211
|
+
For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
|
|
212
|
+
|
|
207
213
|
## Links
|
|
208
214
|
|
|
209
215
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
@@ -30,6 +30,7 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
32
|
pip install docpull
|
|
33
|
+
docpull --doctor # verify installation
|
|
33
34
|
docpull https://aptos.dev
|
|
34
35
|
docpull stripe # use a built-in profile
|
|
35
36
|
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
@@ -59,6 +60,7 @@ fetcher.fetch()
|
|
|
59
60
|
|
|
60
61
|
## Common Options
|
|
61
62
|
|
|
63
|
+
- `--doctor` – verify installation and dependencies
|
|
62
64
|
- `--max-pages N` – limit crawl size
|
|
63
65
|
- `--max-depth N` – restrict link depth
|
|
64
66
|
- `--max-concurrent N` – control parallel fetches
|
|
@@ -133,10 +135,14 @@ MY_PROFILE = SiteProfile(
|
|
|
133
135
|
|
|
134
136
|
## Troubleshooting
|
|
135
137
|
|
|
138
|
+
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
139
|
+
- **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
|
|
136
140
|
- **Site requires JS**: install Playwright + `--js`
|
|
137
141
|
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
138
142
|
- **Large sites**: set `--max-pages`
|
|
139
143
|
|
|
144
|
+
For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
|
|
145
|
+
|
|
140
146
|
## Links
|
|
141
147
|
|
|
142
148
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
@@ -3,6 +3,40 @@ import sys
|
|
|
3
3
|
from pathlib import Path
|
|
4
4
|
from typing import Optional
|
|
5
5
|
|
|
6
|
+
# Check if --doctor flag is present before checking dependencies
|
|
7
|
+
# This allows users to diagnose issues even when dependencies are missing
|
|
8
|
+
if "--doctor" in sys.argv:
|
|
9
|
+
from .doctor import run_doctor
|
|
10
|
+
|
|
11
|
+
# Parse output dir if provided
|
|
12
|
+
output_dir = None
|
|
13
|
+
if "--output-dir" in sys.argv or "-o" in sys.argv:
|
|
14
|
+
try:
|
|
15
|
+
flag_idx = sys.argv.index("--output-dir") if "--output-dir" in sys.argv else sys.argv.index("-o")
|
|
16
|
+
if flag_idx + 1 < len(sys.argv):
|
|
17
|
+
output_dir = Path(sys.argv[flag_idx + 1])
|
|
18
|
+
except (ValueError, IndexError):
|
|
19
|
+
pass
|
|
20
|
+
sys.exit(run_doctor(output_dir=output_dir))
|
|
21
|
+
|
|
22
|
+
# Verify core dependencies are available
|
|
23
|
+
try:
|
|
24
|
+
import aiohttp # noqa: F401
|
|
25
|
+
import bs4 # noqa: F401
|
|
26
|
+
import defusedxml # noqa: F401
|
|
27
|
+
import html2text # noqa: F401
|
|
28
|
+
import requests # noqa: F401
|
|
29
|
+
import rich # noqa: F401
|
|
30
|
+
except ImportError as e:
|
|
31
|
+
print(f"\nERROR: Missing required dependency: {e.name}", file=sys.stderr)
|
|
32
|
+
print("\nDocpull requires all core dependencies to be installed.", file=sys.stderr)
|
|
33
|
+
print("\nRecommended fixes:", file=sys.stderr)
|
|
34
|
+
print(" 1. For pipx users: pipx reinstall docpull --force", file=sys.stderr)
|
|
35
|
+
print(" 2. For pip users: pip install --upgrade --force-reinstall docpull", file=sys.stderr)
|
|
36
|
+
print(" 3. For development: pip install -e .[dev]", file=sys.stderr)
|
|
37
|
+
print("\nTo diagnose issues, run: docpull --doctor", file=sys.stderr)
|
|
38
|
+
sys.exit(1)
|
|
39
|
+
|
|
6
40
|
from . import __version__
|
|
7
41
|
from .config import FetcherConfig
|
|
8
42
|
from .fetchers import (
|
|
@@ -185,6 +219,12 @@ Examples:
|
|
|
185
219
|
version=f"%(prog)s {__version__}",
|
|
186
220
|
)
|
|
187
221
|
|
|
222
|
+
parser.add_argument(
|
|
223
|
+
"--doctor",
|
|
224
|
+
action="store_true",
|
|
225
|
+
help="Run diagnostic checks to verify installation",
|
|
226
|
+
)
|
|
227
|
+
|
|
188
228
|
return parser
|
|
189
229
|
|
|
190
230
|
|
|
@@ -200,17 +240,31 @@ def generate_sample_config(output_path: Path) -> None:
|
|
|
200
240
|
# Determine format from extension
|
|
201
241
|
suffix = output_path.suffix.lower()
|
|
202
242
|
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
243
|
+
try:
|
|
244
|
+
if suffix in [".yaml", ".yml"]:
|
|
245
|
+
config.save_yaml(output_path)
|
|
246
|
+
print(f"Sample YAML config generated: {output_path}")
|
|
247
|
+
elif suffix == ".json":
|
|
248
|
+
config.save_json(output_path)
|
|
249
|
+
print(f"Sample JSON config generated: {output_path}")
|
|
250
|
+
else:
|
|
251
|
+
# Try YAML first, fall back to JSON if PyYAML not available
|
|
252
|
+
try:
|
|
253
|
+
print(f"Warning: Unknown extension {suffix}, generating YAML")
|
|
254
|
+
output_path = output_path.with_suffix(".yaml")
|
|
255
|
+
config.save_yaml(output_path)
|
|
256
|
+
print(f"Sample YAML config generated: {output_path}")
|
|
257
|
+
except ImportError:
|
|
258
|
+
print("PyYAML not installed, generating JSON instead")
|
|
259
|
+
output_path = output_path.with_suffix(".json")
|
|
260
|
+
config.save_json(output_path)
|
|
261
|
+
print(f"Sample JSON config generated: {output_path}")
|
|
262
|
+
except ImportError:
|
|
263
|
+
print("\nERROR: PyYAML is required for YAML config files")
|
|
264
|
+
print("Install it with: pip install docpull[yaml]")
|
|
265
|
+
print("\nAlternatively, use JSON format:")
|
|
266
|
+
print(f" docpull --generate-config {output_path.with_suffix('.json')}")
|
|
267
|
+
raise
|
|
214
268
|
|
|
215
269
|
|
|
216
270
|
def get_config(args: argparse.Namespace) -> FetcherConfig:
|
|
@@ -224,7 +278,17 @@ def get_config(args: argparse.Namespace) -> FetcherConfig:
|
|
|
224
278
|
FetcherConfig instance
|
|
225
279
|
"""
|
|
226
280
|
# Load from config file if provided
|
|
227
|
-
|
|
281
|
+
if args.config:
|
|
282
|
+
try:
|
|
283
|
+
config = FetcherConfig.from_file(args.config)
|
|
284
|
+
except ImportError as e:
|
|
285
|
+
print(f"\nERROR: Error loading config file: {e}")
|
|
286
|
+
if "yaml" in str(e).lower() or "pyyaml" in str(e).lower():
|
|
287
|
+
print("Install PyYAML with: pip install docpull[yaml]")
|
|
288
|
+
print("\nAlternatively, convert your config to JSON format")
|
|
289
|
+
raise
|
|
290
|
+
else:
|
|
291
|
+
config = FetcherConfig()
|
|
228
292
|
|
|
229
293
|
# Override with command-line arguments
|
|
230
294
|
if args.output_dir is not None:
|
|
@@ -411,6 +475,13 @@ def main(argv: Optional[list[str]] = None) -> int:
|
|
|
411
475
|
parser = create_parser()
|
|
412
476
|
args = parser.parse_args(argv)
|
|
413
477
|
|
|
478
|
+
# Handle --doctor
|
|
479
|
+
if args.doctor:
|
|
480
|
+
from .doctor import run_doctor
|
|
481
|
+
|
|
482
|
+
output_dir = Path(args.output_dir) if args.output_dir else None
|
|
483
|
+
return run_doctor(output_dir=output_dir)
|
|
484
|
+
|
|
414
485
|
# Handle --generate-config
|
|
415
486
|
if args.generate_config:
|
|
416
487
|
try:
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Diagnostic tool for verifying docpull installation and dependencies."""
|
|
2
|
+
|
|
3
|
+
import sys
|
|
4
|
+
from importlib import import_module
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Optional
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.table import Table
|
|
11
|
+
|
|
12
|
+
RICH_AVAILABLE = True
|
|
13
|
+
except ImportError:
|
|
14
|
+
RICH_AVAILABLE = False
|
|
15
|
+
Console = None # type: ignore
|
|
16
|
+
Table = None # type: ignore
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def check_dependency(
|
|
20
|
+
module_name: str, package_name: Optional[str] = None, optional: bool = False
|
|
21
|
+
) -> tuple[bool, str]:
|
|
22
|
+
"""
|
|
23
|
+
Check if a Python module is importable.
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
module_name: Name of the module to import
|
|
27
|
+
package_name: Display name of the package (defaults to module_name)
|
|
28
|
+
optional: Whether this is an optional dependency
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
Tuple of (success: bool, message: str)
|
|
32
|
+
"""
|
|
33
|
+
display_name = package_name or module_name
|
|
34
|
+
|
|
35
|
+
try:
|
|
36
|
+
import_module(module_name)
|
|
37
|
+
return True, f"[OK] {display_name}"
|
|
38
|
+
except ImportError:
|
|
39
|
+
if optional:
|
|
40
|
+
return False, f"[WARN] {display_name} (optional - not installed)"
|
|
41
|
+
else:
|
|
42
|
+
return False, f"[MISSING] {display_name}"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def check_network() -> tuple[bool, str]:
|
|
46
|
+
"""
|
|
47
|
+
Check basic network connectivity.
|
|
48
|
+
|
|
49
|
+
Returns:
|
|
50
|
+
Tuple of (success: bool, message: str)
|
|
51
|
+
"""
|
|
52
|
+
try:
|
|
53
|
+
import socket
|
|
54
|
+
|
|
55
|
+
# Try to resolve a common DNS name
|
|
56
|
+
socket.gethostbyname("www.google.com")
|
|
57
|
+
return True, "[OK] Network connectivity"
|
|
58
|
+
except socket.gaierror:
|
|
59
|
+
return False, "[FAIL] Network connectivity - DNS resolution failed"
|
|
60
|
+
except Exception as e:
|
|
61
|
+
return False, f"[WARN] Network connectivity - {str(e)}"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def check_output_dir(output_dir: Optional[Path] = None) -> tuple[bool, str]:
|
|
65
|
+
"""
|
|
66
|
+
Check if output directory is writable.
|
|
67
|
+
|
|
68
|
+
Args:
|
|
69
|
+
output_dir: Directory to check (defaults to ./docs)
|
|
70
|
+
|
|
71
|
+
Returns:
|
|
72
|
+
Tuple of (success: bool, message: str)
|
|
73
|
+
"""
|
|
74
|
+
test_dir = output_dir or Path("./docs")
|
|
75
|
+
|
|
76
|
+
try:
|
|
77
|
+
# Create directory if it doesn't exist
|
|
78
|
+
test_dir.mkdir(parents=True, exist_ok=True)
|
|
79
|
+
|
|
80
|
+
# Try to write a test file
|
|
81
|
+
test_file = test_dir / ".docpull_test"
|
|
82
|
+
test_file.write_text("test")
|
|
83
|
+
test_file.unlink()
|
|
84
|
+
|
|
85
|
+
return True, f"[OK] Output directory writable ({test_dir})"
|
|
86
|
+
except PermissionError:
|
|
87
|
+
return False, f"[FAIL] Output directory - permission denied ({test_dir})"
|
|
88
|
+
except Exception as e:
|
|
89
|
+
return False, f"[FAIL] Output directory - {str(e)} ({test_dir})"
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def run_doctor(output_dir: Optional[Path] = None, use_rich: bool = True) -> int:
|
|
93
|
+
"""
|
|
94
|
+
Run diagnostic checks and display results.
|
|
95
|
+
|
|
96
|
+
Args:
|
|
97
|
+
output_dir: Output directory to check for writability
|
|
98
|
+
use_rich: Whether to use rich formatting (if available)
|
|
99
|
+
|
|
100
|
+
Returns:
|
|
101
|
+
Exit code (0 if all core dependencies OK, 1 if any core dependency missing)
|
|
102
|
+
"""
|
|
103
|
+
# Determine if we can use rich formatting
|
|
104
|
+
use_rich = use_rich and RICH_AVAILABLE
|
|
105
|
+
|
|
106
|
+
print("Running docpull diagnostics...\n")
|
|
107
|
+
|
|
108
|
+
# Core dependencies
|
|
109
|
+
core_checks = [
|
|
110
|
+
("requests", "requests"),
|
|
111
|
+
("bs4", "beautifulsoup4"),
|
|
112
|
+
("html2text", "html2text"),
|
|
113
|
+
("defusedxml", "defusedxml"),
|
|
114
|
+
("aiohttp", "aiohttp"),
|
|
115
|
+
("rich", "rich"),
|
|
116
|
+
]
|
|
117
|
+
|
|
118
|
+
# Optional dependencies
|
|
119
|
+
optional_checks = [
|
|
120
|
+
("yaml", "pyyaml", True),
|
|
121
|
+
("playwright.async_api", "playwright", True),
|
|
122
|
+
]
|
|
123
|
+
|
|
124
|
+
# Other checks
|
|
125
|
+
system_checks = [
|
|
126
|
+
check_network(),
|
|
127
|
+
check_output_dir(output_dir),
|
|
128
|
+
]
|
|
129
|
+
|
|
130
|
+
# Run core dependency checks
|
|
131
|
+
core_results = [check_dependency(mod, pkg) for mod, pkg in core_checks]
|
|
132
|
+
optional_results = [check_dependency(mod, pkg, opt) for mod, pkg, opt in optional_checks]
|
|
133
|
+
|
|
134
|
+
all_checks = {
|
|
135
|
+
"Core Dependencies": core_results,
|
|
136
|
+
"Optional Dependencies": optional_results,
|
|
137
|
+
"System": system_checks,
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
# Display results
|
|
141
|
+
if use_rich:
|
|
142
|
+
console = Console()
|
|
143
|
+
|
|
144
|
+
for category, results in all_checks.items():
|
|
145
|
+
table = Table(title=category, show_header=False, box=None)
|
|
146
|
+
table.add_column("Status", style="bold")
|
|
147
|
+
|
|
148
|
+
for success, message in results:
|
|
149
|
+
style = "green" if success else ("yellow" if "optional" in message else "red")
|
|
150
|
+
table.add_row(message, style=style)
|
|
151
|
+
|
|
152
|
+
console.print(table)
|
|
153
|
+
console.print()
|
|
154
|
+
else:
|
|
155
|
+
# Fallback to plain text
|
|
156
|
+
for category, results in all_checks.items():
|
|
157
|
+
print(f"{category}:")
|
|
158
|
+
for _success, message in results:
|
|
159
|
+
print(f" {message}")
|
|
160
|
+
print()
|
|
161
|
+
|
|
162
|
+
# Check if any core dependencies failed
|
|
163
|
+
core_failed = any(not success for success, _ in core_results)
|
|
164
|
+
|
|
165
|
+
# Print summary
|
|
166
|
+
if core_failed:
|
|
167
|
+
print("\nWARNING: Some core dependencies are missing!")
|
|
168
|
+
print("\nRecommended fixes:")
|
|
169
|
+
print(" 1. For pipx users: pipx reinstall docpull --force")
|
|
170
|
+
print(" 2. For pip users: pip install --upgrade --force-reinstall docpull")
|
|
171
|
+
print(" 3. For development: pip install -e .[dev]")
|
|
172
|
+
return 1
|
|
173
|
+
else:
|
|
174
|
+
print("\nAll core dependencies installed correctly!")
|
|
175
|
+
|
|
176
|
+
# Check if optional dependencies are missing
|
|
177
|
+
optional_missing = [msg for success, msg in optional_results if not success]
|
|
178
|
+
if optional_missing:
|
|
179
|
+
print("\nOptional features available:")
|
|
180
|
+
print(" - YAML config support: pip install docpull[yaml]")
|
|
181
|
+
print(" - JavaScript rendering: pip install docpull[js]")
|
|
182
|
+
print(" - All optional features: pip install docpull[all]")
|
|
183
|
+
|
|
184
|
+
return 0
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
if __name__ == "__main__":
|
|
188
|
+
sys.exit(run_doctor())
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import time
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import
|
|
6
|
+
from typing import Any, Optional
|
|
7
7
|
|
|
8
8
|
import aiohttp
|
|
9
9
|
from bs4 import BeautifulSoup
|
|
@@ -12,15 +12,15 @@ from ..utils.file_utils import ensure_dir, validate_output_path
|
|
|
12
12
|
from .base import BaseFetcher
|
|
13
13
|
|
|
14
14
|
# Optional Playwright support
|
|
15
|
-
if TYPE_CHECKING:
|
|
16
|
-
from playwright.async_api import Browser, Page, Playwright
|
|
17
|
-
|
|
18
15
|
try:
|
|
19
|
-
from playwright.async_api import async_playwright
|
|
16
|
+
from playwright.async_api import Browser, Playwright, async_playwright
|
|
20
17
|
|
|
21
18
|
PLAYWRIGHT_AVAILABLE = True
|
|
22
19
|
except ImportError:
|
|
23
20
|
PLAYWRIGHT_AVAILABLE = False
|
|
21
|
+
# Fallback types for when playwright is not installed
|
|
22
|
+
Browser = Any # type: ignore[misc,assignment]
|
|
23
|
+
Playwright = Any # type: ignore[misc,assignment]
|
|
24
24
|
|
|
25
25
|
|
|
26
26
|
class AsyncFetcher:
|
|
@@ -68,8 +68,8 @@ class AsyncFetcher:
|
|
|
68
68
|
self.rate_limit_delay = base_fetcher.rate_limit
|
|
69
69
|
|
|
70
70
|
# Browser instance (if using JS)
|
|
71
|
-
self.browser: Optional[
|
|
72
|
-
self.playwright: Optional[
|
|
71
|
+
self.browser: Optional[Browser] = None # type: ignore[no-any-unimported]
|
|
72
|
+
self.playwright: Optional[Playwright] = None # type: ignore[no-any-unimported]
|
|
73
73
|
|
|
74
74
|
if use_js and not PLAYWRIGHT_AVAILABLE:
|
|
75
75
|
self.logger.warning("Playwright not installed. Install with: pip install docpull[js]")
|
|
@@ -152,7 +152,7 @@ class AsyncFetcher:
|
|
|
152
152
|
)
|
|
153
153
|
|
|
154
154
|
# Get rendered HTML
|
|
155
|
-
content = await page.content()
|
|
155
|
+
content: str = await page.content()
|
|
156
156
|
|
|
157
157
|
return content
|
|
158
158
|
|
|
@@ -84,9 +84,7 @@ class BaseFetcher(ABC):
|
|
|
84
84
|
self.validator_func = validator_func
|
|
85
85
|
super().__init__(*args, **kwargs)
|
|
86
86
|
|
|
87
|
-
def send( # type: ignore[override]
|
|
88
|
-
self, request: PreparedRequest, **kwargs: Any
|
|
89
|
-
) -> Response:
|
|
87
|
+
def send(self, request: PreparedRequest, **kwargs: Any) -> Response: # type: ignore[override]
|
|
90
88
|
if request.url is None:
|
|
91
89
|
raise ValueError("Request URL is None")
|
|
92
90
|
if not self.validator_func(request.url):
|
|
@@ -57,9 +57,9 @@ class PlaidFetcher(BaseFetcher):
|
|
|
57
57
|
sitemap_urls = self.fetch_sitemap(self.sitemap_url)
|
|
58
58
|
|
|
59
59
|
for url in sitemap_urls:
|
|
60
|
-
if (
|
|
61
|
-
|
|
62
|
-
)
|
|
60
|
+
if ("/docs/" in url or "/api/" in url) and not any(
|
|
61
|
+
x in url for x in ["/blog/", "/resources/", "/company/", "/customers/"]
|
|
62
|
+
):
|
|
63
63
|
doc_urls.add(url.split("#")[0].split("?")[0])
|
|
64
64
|
|
|
65
65
|
doc_urls_list = sorted(doc_urls)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 1.0.2
|
|
3
|
+
Version: 1.1.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -62,7 +62,7 @@ Requires-Dist: bandit>=1.7.0; extra == "dev"
|
|
|
62
62
|
Requires-Dist: pip-audit>=2.0.0; extra == "dev"
|
|
63
63
|
Requires-Dist: types-requests>=2.31.0; extra == "dev"
|
|
64
64
|
Requires-Dist: types-beautifulsoup4>=4.12.0; extra == "dev"
|
|
65
|
-
Requires-Dist: types-
|
|
65
|
+
Requires-Dist: types-defusedxml>=0.7.0; extra == "dev"
|
|
66
66
|
Dynamic: license-file
|
|
67
67
|
|
|
68
68
|
# docpull
|
|
@@ -97,6 +97,7 @@ Unlike tools like wget or httrack, docpull extracts only the main content, remov
|
|
|
97
97
|
|
|
98
98
|
```bash
|
|
99
99
|
pip install docpull
|
|
100
|
+
docpull --doctor # verify installation
|
|
100
101
|
docpull https://aptos.dev
|
|
101
102
|
docpull stripe # use a built-in profile
|
|
102
103
|
docpull https://site.com/docs --max-pages 100 --max-concurrent 20
|
|
@@ -126,6 +127,7 @@ fetcher.fetch()
|
|
|
126
127
|
|
|
127
128
|
## Common Options
|
|
128
129
|
|
|
130
|
+
- `--doctor` – verify installation and dependencies
|
|
129
131
|
- `--max-pages N` – limit crawl size
|
|
130
132
|
- `--max-depth N` – restrict link depth
|
|
131
133
|
- `--max-concurrent N` – control parallel fetches
|
|
@@ -200,10 +202,14 @@ MY_PROFILE = SiteProfile(
|
|
|
200
202
|
|
|
201
203
|
## Troubleshooting
|
|
202
204
|
|
|
205
|
+
- **Installation issues**: Run `docpull --doctor` to diagnose problems
|
|
206
|
+
- **Missing dependencies**: See [TROUBLESHOOTING.md](TROUBLESHOOTING.md) for common fixes
|
|
203
207
|
- **Site requires JS**: install Playwright + `--js`
|
|
204
208
|
- **Slow or rate limited**: lower concurrency or raise `--rate-limit`
|
|
205
209
|
- **Large sites**: set `--max-pages`
|
|
206
210
|
|
|
211
|
+
For detailed troubleshooting, see [TROUBLESHOOTING.md](TROUBLESHOOTING.md).
|
|
212
|
+
|
|
207
213
|
## Links
|
|
208
214
|
|
|
209
215
|
- [PyPI](https://pypi.org/project/docpull/)
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "1.0.2"
|
|
7
|
+
version = "1.1.0"
|
|
8
8
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
9
9
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
10
10
|
requires-python = ">=3.9"
|
|
@@ -89,7 +89,7 @@ dev = [
|
|
|
89
89
|
"pip-audit>=2.0.0",
|
|
90
90
|
"types-requests>=2.31.0",
|
|
91
91
|
"types-beautifulsoup4>=4.12.0",
|
|
92
|
-
"types-
|
|
92
|
+
"types-defusedxml>=0.7.0",
|
|
93
93
|
]
|
|
94
94
|
|
|
95
95
|
[project.scripts]
|
|
@@ -132,6 +132,10 @@ no_implicit_optional = true
|
|
|
132
132
|
strict_equality = true
|
|
133
133
|
warn_redundant_casts = true
|
|
134
134
|
|
|
135
|
+
[[tool.mypy.overrides]]
|
|
136
|
+
module = "playwright.*"
|
|
137
|
+
ignore_missing_imports = true
|
|
138
|
+
|
|
135
139
|
[tool.pytest.ini_options]
|
|
136
140
|
minversion = "7.0"
|
|
137
141
|
testpaths = ["tests"]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|