docpull 2.0.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-2.0.0/src/docpull.egg-info → docpull-2.2.0}/PKG-INFO +3 -2
- {docpull-2.0.0 → docpull-2.2.0}/README.md +1 -0
- {docpull-2.0.0 → docpull-2.2.0}/pyproject.toml +3 -3
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/__init__.py +1 -1
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/manager.py +132 -18
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/streaming_dedup.py +6 -5
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cli.py +121 -7
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/browser_pool.py +2 -3
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/extractor.py +5 -4
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/markdown.py +6 -4
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/core/fetcher.py +169 -22
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/__init__.py +17 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/crawler.py +24 -11
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/filters.py +5 -4
- docpull-2.2.0/src/docpull/discovery/link_extractors/__init__.py +22 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/browser.py +294 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/enhanced.py +315 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/protocols.py +33 -0
- docpull-2.2.0/src/docpull/discovery/link_extractors/static.py +160 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/sitemap.py +39 -10
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/doctor.py +5 -4
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/__init__.py +2 -1
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/client.py +47 -15
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/protocols.py +4 -2
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/rate_limiter.py +115 -4
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/metadata_extractor.py +16 -14
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/__init__.py +6 -1
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/config.py +88 -13
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/events.py +17 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/profiles.py +2 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/base.py +15 -13
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/__init__.py +4 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/browser_fetch.py +1 -1
- docpull-2.2.0/src/docpull/pipeline/steps/save_json.py +191 -0
- docpull-2.2.0/src/docpull/pipeline/steps/save_sqlite.py +171 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/security/robots.py +7 -6
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/security/url_validator.py +10 -9
- {docpull-2.0.0 → docpull-2.2.0/src/docpull.egg-info}/PKG-INFO +3 -2
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/SOURCES.txt +8 -0
- docpull-2.2.0/tests/test_link_extractors.py +270 -0
- {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_conversion.py +0 -1
- {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_discovery.py +1 -0
- {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_integration.py +1 -0
- {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_pipeline.py +1 -0
- {docpull-2.0.0 → docpull-2.2.0}/LICENSE +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/setup.cfg +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/__main__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/manager.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/protocols.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/core/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/composite.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/protocols.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/logging_config.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/convert.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/fetch.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/save.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/py.typed +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull/security/__init__.py +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/requires.txt +0 -0
- {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 2.0.0
|
|
3
|
+
Version: 2.2.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -36,7 +36,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
|
36
36
|
Classifier: Programming Language :: Python :: 3.14
|
|
37
37
|
Classifier: Programming Language :: Python :: 3 :: Only
|
|
38
38
|
Classifier: Typing :: Typed
|
|
39
|
-
Requires-Python: >=3.
|
|
39
|
+
Requires-Python: >=3.10
|
|
40
40
|
Description-Content-Type: text/markdown
|
|
41
41
|
License-File: LICENSE
|
|
42
42
|
Requires-Dist: requests>=2.31.0
|
|
@@ -81,6 +81,7 @@ Dynamic: license-file
|
|
|
81
81
|
|
|
82
82
|
[](https://www.python.org/downloads/)
|
|
83
83
|
[](https://badge.fury.io/py/docpull)
|
|
84
|
+
[](https://pepy.tech/project/docpull)
|
|
84
85
|
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
85
86
|
|
|
86
87
|
## Install
|
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
[](https://www.python.org/downloads/)
|
|
6
6
|
[](https://badge.fury.io/py/docpull)
|
|
7
|
+
[](https://pepy.tech/project/docpull)
|
|
7
8
|
[](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
|
|
8
9
|
|
|
9
10
|
## Install
|
|
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "2.0.0"
|
|
7
|
+
version = "2.2.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
11
|
-
requires-python = ">=3.
|
|
11
|
+
requires-python = ">=3.10"
|
|
12
12
|
license = "MIT"
|
|
13
13
|
license-files = ["LICENSE"]
|
|
14
14
|
authors = [
|
|
@@ -137,7 +137,7 @@ select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
|
|
|
137
137
|
ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
|
|
138
138
|
|
|
139
139
|
[tool.mypy]
|
|
140
|
-
python_version = "3.
|
|
140
|
+
python_version = "3.10"
|
|
141
141
|
warn_return_any = true
|
|
142
142
|
warn_unused_configs = true
|
|
143
143
|
disallow_untyped_defs = true
|
|
@@ -1,11 +1,13 @@
|
|
|
1
1
|
"""Cache management for update detection and incremental fetching."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import hashlib
|
|
4
6
|
import json
|
|
5
7
|
import logging
|
|
6
8
|
from datetime import datetime, timedelta
|
|
7
9
|
from pathlib import Path
|
|
8
|
-
from typing import
|
|
10
|
+
from typing import TypedDict
|
|
9
11
|
|
|
10
12
|
logger = logging.getLogger(__name__)
|
|
11
13
|
|
|
@@ -29,7 +31,15 @@ class CacheState(TypedDict, total=False):
|
|
|
29
31
|
|
|
30
32
|
fetched_urls: list[str]
|
|
31
33
|
failed_urls: list[str]
|
|
32
|
-
last_run:
|
|
34
|
+
last_run: str | None
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class DiscoveredUrlsState(TypedDict, total=False):
|
|
38
|
+
"""Type for discovered URLs persistence (for resume capability)."""
|
|
39
|
+
|
|
40
|
+
start_url: str
|
|
41
|
+
discovered_at: str
|
|
42
|
+
urls: list[str]
|
|
33
43
|
|
|
34
44
|
|
|
35
45
|
class _InternalState:
|
|
@@ -38,10 +48,10 @@ class _InternalState:
|
|
|
38
48
|
def __init__(self) -> None:
|
|
39
49
|
self.fetched_urls: set[str] = set()
|
|
40
50
|
self.failed_urls: set[str] = set()
|
|
41
|
-
self.last_run:
|
|
51
|
+
self.last_run: str | None = None
|
|
42
52
|
|
|
43
53
|
@classmethod
|
|
44
|
-
def from_cache_state(cls, state: CacheState) ->
|
|
54
|
+
def from_cache_state(cls, state: CacheState) -> _InternalState:
|
|
45
55
|
"""Create internal state from serialized CacheState."""
|
|
46
56
|
internal = cls()
|
|
47
57
|
internal.fetched_urls = set(state.get("fetched_urls", []))
|
|
@@ -68,7 +78,7 @@ class CacheManager:
|
|
|
68
78
|
- Consistent hashing: Uses bytes input for SHA-256 computation
|
|
69
79
|
"""
|
|
70
80
|
|
|
71
|
-
def __init__(self, cache_dir: Path, ttl_days:
|
|
81
|
+
def __init__(self, cache_dir: Path, ttl_days: int | None = None):
|
|
72
82
|
"""Initialize cache manager.
|
|
73
83
|
|
|
74
84
|
Args:
|
|
@@ -81,6 +91,7 @@ class CacheManager:
|
|
|
81
91
|
|
|
82
92
|
self.manifest_file = self.cache_dir / "manifest.json"
|
|
83
93
|
self.state_file = self.cache_dir / "state.json"
|
|
94
|
+
self.discovered_urls_file = self.cache_dir / "discovered_urls.json"
|
|
84
95
|
|
|
85
96
|
self.manifest: dict[str, ManifestEntry] = self._load_manifest()
|
|
86
97
|
self._state: _InternalState = _InternalState.from_cache_state(self._load_state())
|
|
@@ -157,21 +168,21 @@ class CacheManager:
|
|
|
157
168
|
self._save_manifest()
|
|
158
169
|
self._save_state()
|
|
159
170
|
|
|
160
|
-
def __enter__(self) ->
|
|
171
|
+
def __enter__(self) -> CacheManager:
|
|
161
172
|
"""Context manager entry."""
|
|
162
173
|
return self
|
|
163
174
|
|
|
164
175
|
def __exit__(
|
|
165
176
|
self,
|
|
166
|
-
exc_type:
|
|
167
|
-
exc_val:
|
|
168
|
-
exc_tb:
|
|
177
|
+
exc_type: type[BaseException] | None,
|
|
178
|
+
exc_val: BaseException | None,
|
|
179
|
+
exc_tb: object | None,
|
|
169
180
|
) -> None:
|
|
170
181
|
"""Context manager exit - auto-flush on exit."""
|
|
171
182
|
self.flush()
|
|
172
183
|
|
|
173
184
|
@staticmethod
|
|
174
|
-
def compute_checksum(content:
|
|
185
|
+
def compute_checksum(content: str | bytes) -> str:
|
|
175
186
|
"""Compute SHA-256 checksum of content.
|
|
176
187
|
|
|
177
188
|
Args:
|
|
@@ -187,9 +198,9 @@ class CacheManager:
|
|
|
187
198
|
def has_changed(
|
|
188
199
|
self,
|
|
189
200
|
url: str,
|
|
190
|
-
content:
|
|
191
|
-
etag:
|
|
192
|
-
last_modified:
|
|
201
|
+
content: str | None = None,
|
|
202
|
+
etag: str | None = None,
|
|
203
|
+
last_modified: str | None = None,
|
|
193
204
|
) -> bool:
|
|
194
205
|
"""Check if content has changed since last fetch.
|
|
195
206
|
|
|
@@ -226,10 +237,10 @@ class CacheManager:
|
|
|
226
237
|
def update_cache(
|
|
227
238
|
self,
|
|
228
239
|
url: str,
|
|
229
|
-
content:
|
|
240
|
+
content: str | bytes,
|
|
230
241
|
file_path: Path,
|
|
231
|
-
etag:
|
|
232
|
-
last_modified:
|
|
242
|
+
etag: str | None = None,
|
|
243
|
+
last_modified: str | None = None,
|
|
233
244
|
) -> None:
|
|
234
245
|
"""Update cache entry for a URL.
|
|
235
246
|
|
|
@@ -317,7 +328,7 @@ class CacheManager:
|
|
|
317
328
|
self.flush()
|
|
318
329
|
logger.info("Cleared incremental state")
|
|
319
330
|
|
|
320
|
-
def get_cache_stats(self) -> dict[str,
|
|
331
|
+
def get_cache_stats(self) -> dict[str, str | int | None]:
|
|
321
332
|
"""Get cache statistics.
|
|
322
333
|
|
|
323
334
|
Returns:
|
|
@@ -330,7 +341,7 @@ class CacheManager:
|
|
|
330
341
|
"last_run": self._state.last_run,
|
|
331
342
|
}
|
|
332
343
|
|
|
333
|
-
def evict_expired(self, ttl_days:
|
|
344
|
+
def evict_expired(self, ttl_days: int | None = None) -> int:
|
|
334
345
|
"""Remove cache entries older than TTL.
|
|
335
346
|
|
|
336
347
|
Args:
|
|
@@ -386,3 +397,106 @@ class CacheManager:
|
|
|
386
397
|
True if URL failed to fetch
|
|
387
398
|
"""
|
|
388
399
|
return url in self._state.failed_urls
|
|
400
|
+
|
|
401
|
+
# Resume capability methods
|
|
402
|
+
|
|
403
|
+
def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
|
|
404
|
+
"""Save discovered URLs for resume capability.
|
|
405
|
+
|
|
406
|
+
Args:
|
|
407
|
+
urls: List of discovered URLs
|
|
408
|
+
start_url: The starting URL for this crawl
|
|
409
|
+
|
|
410
|
+
Note:
|
|
411
|
+
This is written immediately (not batched) to ensure
|
|
412
|
+
URLs are persisted before fetching begins.
|
|
413
|
+
"""
|
|
414
|
+
data: DiscoveredUrlsState = {
|
|
415
|
+
"start_url": start_url,
|
|
416
|
+
"discovered_at": datetime.now().isoformat(),
|
|
417
|
+
"urls": urls,
|
|
418
|
+
}
|
|
419
|
+
try:
|
|
420
|
+
with open(self.discovered_urls_file, "w", encoding="utf-8") as f:
|
|
421
|
+
json.dump(data, f, indent=2, ensure_ascii=False)
|
|
422
|
+
logger.info(f"Saved {len(urls)} discovered URLs for resume capability")
|
|
423
|
+
except Exception as e:
|
|
424
|
+
logger.error(f"Could not save discovered URLs: {e}")
|
|
425
|
+
|
|
426
|
+
def load_discovered_urls(self, start_url: str) -> list[str] | None:
|
|
427
|
+
"""Load previously discovered URLs if they match the start URL.
|
|
428
|
+
|
|
429
|
+
Args:
|
|
430
|
+
start_url: The starting URL to match
|
|
431
|
+
|
|
432
|
+
Returns:
|
|
433
|
+
List of discovered URLs if found and matching, None otherwise
|
|
434
|
+
"""
|
|
435
|
+
if not self.discovered_urls_file.exists():
|
|
436
|
+
return None
|
|
437
|
+
|
|
438
|
+
try:
|
|
439
|
+
with open(self.discovered_urls_file, encoding="utf-8") as f:
|
|
440
|
+
data: DiscoveredUrlsState = json.load(f)
|
|
441
|
+
|
|
442
|
+
if data.get("start_url") != start_url:
|
|
443
|
+
logger.info("Discovered URLs file exists but start_url doesn't match")
|
|
444
|
+
return None
|
|
445
|
+
|
|
446
|
+
urls = data.get("urls", [])
|
|
447
|
+
logger.info(f"Loaded {len(urls)} discovered URLs from previous run")
|
|
448
|
+
return urls
|
|
449
|
+
except Exception as e:
|
|
450
|
+
logger.warning(f"Could not load discovered URLs: {e}")
|
|
451
|
+
return None
|
|
452
|
+
|
|
453
|
+
def get_pending_urls(self, start_url: str) -> list[str] | None:
|
|
454
|
+
"""Get URLs that were discovered but not yet fetched.
|
|
455
|
+
|
|
456
|
+
Args:
|
|
457
|
+
start_url: The starting URL to match
|
|
458
|
+
|
|
459
|
+
Returns:
|
|
460
|
+
List of pending URLs, or None if no resume data available
|
|
461
|
+
"""
|
|
462
|
+
discovered = self.load_discovered_urls(start_url)
|
|
463
|
+
if discovered is None:
|
|
464
|
+
return None
|
|
465
|
+
|
|
466
|
+
# Filter out already-fetched URLs
|
|
467
|
+
fetched = self.get_fetched_urls()
|
|
468
|
+
pending = [url for url in discovered if url not in fetched]
|
|
469
|
+
logger.info(f"Found {len(pending)} pending URLs (out of {len(discovered)} discovered)")
|
|
470
|
+
return pending
|
|
471
|
+
|
|
472
|
+
def clear_discovered_urls(self) -> None:
|
|
473
|
+
"""Clear discovered URLs file (called on successful completion).
|
|
474
|
+
|
|
475
|
+
This should be called after a successful fetch to clean up
|
|
476
|
+
the resume state.
|
|
477
|
+
"""
|
|
478
|
+
if self.discovered_urls_file.exists():
|
|
479
|
+
try:
|
|
480
|
+
self.discovered_urls_file.unlink()
|
|
481
|
+
logger.info("Cleared discovered URLs file")
|
|
482
|
+
except Exception as e:
|
|
483
|
+
logger.warning(f"Could not clear discovered URLs file: {e}")
|
|
484
|
+
|
|
485
|
+
def has_resume_data(self, start_url: str) -> bool:
|
|
486
|
+
"""Check if there is resume data available for the given URL.
|
|
487
|
+
|
|
488
|
+
Args:
|
|
489
|
+
start_url: The starting URL to check
|
|
490
|
+
|
|
491
|
+
Returns:
|
|
492
|
+
True if resume data exists and matches the start URL
|
|
493
|
+
"""
|
|
494
|
+
if not self.discovered_urls_file.exists():
|
|
495
|
+
return False
|
|
496
|
+
|
|
497
|
+
try:
|
|
498
|
+
with open(self.discovered_urls_file, encoding="utf-8") as f:
|
|
499
|
+
data: DiscoveredUrlsState = json.load(f)
|
|
500
|
+
return data.get("start_url") == start_url
|
|
501
|
+
except Exception:
|
|
502
|
+
return False
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"""Streaming deduplication for real-time duplicate detection during fetch."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import asyncio
|
|
4
6
|
import hashlib
|
|
5
|
-
from typing import Optional, Union
|
|
6
7
|
|
|
7
8
|
|
|
8
9
|
class StreamingDeduplicator:
|
|
@@ -38,7 +39,7 @@ class StreamingDeduplicator:
|
|
|
38
39
|
self._duplicates_found: int = 0
|
|
39
40
|
|
|
40
41
|
@staticmethod
|
|
41
|
-
def compute_hash(content:
|
|
42
|
+
def compute_hash(content: str | bytes) -> str:
|
|
42
43
|
"""
|
|
43
44
|
Compute SHA-256 hash of content.
|
|
44
45
|
|
|
@@ -59,8 +60,8 @@ class StreamingDeduplicator:
|
|
|
59
60
|
async def check_and_register(
|
|
60
61
|
self,
|
|
61
62
|
url: str,
|
|
62
|
-
content:
|
|
63
|
-
) -> tuple[bool,
|
|
63
|
+
content: str | bytes,
|
|
64
|
+
) -> tuple[bool, str | None]:
|
|
64
65
|
"""
|
|
65
66
|
Check if content is a duplicate and register if new.
|
|
66
67
|
|
|
@@ -89,7 +90,7 @@ class StreamingDeduplicator:
|
|
|
89
90
|
self._seen[content_hash] = url
|
|
90
91
|
return (True, None)
|
|
91
92
|
|
|
92
|
-
async def is_duplicate(self, content:
|
|
93
|
+
async def is_duplicate(self, content: str | bytes) -> bool:
|
|
93
94
|
"""
|
|
94
95
|
Check if content has been seen before (read-only).
|
|
95
96
|
|
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
"""Command-line interface for docpull."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import argparse
|
|
4
6
|
import asyncio
|
|
5
7
|
import sys
|
|
6
8
|
from pathlib import Path
|
|
7
|
-
from typing import Optional
|
|
8
9
|
|
|
9
10
|
# Check if --doctor flag is present before checking dependencies
|
|
10
11
|
if "--doctor" in sys.argv:
|
|
@@ -43,7 +44,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
|
|
|
43
44
|
from . import __version__
|
|
44
45
|
from .core.fetcher import Fetcher
|
|
45
46
|
from .models.config import DocpullConfig, ProfileName
|
|
46
|
-
from .models.events import EventType
|
|
47
|
+
from .models.events import EventType, SkipReason
|
|
47
48
|
|
|
48
49
|
|
|
49
50
|
def create_parser() -> argparse.ArgumentParser:
|
|
@@ -106,6 +107,13 @@ Examples:
|
|
|
106
107
|
default=None,
|
|
107
108
|
help="Output directory (default: ./docs)",
|
|
108
109
|
)
|
|
110
|
+
parser.add_argument(
|
|
111
|
+
"--format",
|
|
112
|
+
"-f",
|
|
113
|
+
choices=["markdown", "json", "sqlite"],
|
|
114
|
+
default="markdown",
|
|
115
|
+
help="Output format (default: markdown)",
|
|
116
|
+
)
|
|
109
117
|
|
|
110
118
|
# Crawl settings
|
|
111
119
|
crawl_group = parser.add_argument_group("crawl settings")
|
|
@@ -153,6 +161,11 @@ Examples:
|
|
|
153
161
|
dest="javascript",
|
|
154
162
|
help="Enable JavaScript rendering (requires Playwright)",
|
|
155
163
|
)
|
|
164
|
+
crawl_group.add_argument(
|
|
165
|
+
"--adaptive-rate-limit",
|
|
166
|
+
action="store_true",
|
|
167
|
+
help="Automatically adjust rate limits based on server responses",
|
|
168
|
+
)
|
|
156
169
|
|
|
157
170
|
# Content filtering
|
|
158
171
|
filter_group = parser.add_argument_group("content filtering")
|
|
@@ -188,6 +201,33 @@ Examples:
|
|
|
188
201
|
help="Maximum retry attempts",
|
|
189
202
|
)
|
|
190
203
|
|
|
204
|
+
# Authentication settings
|
|
205
|
+
auth_group = parser.add_argument_group("authentication")
|
|
206
|
+
auth_group.add_argument(
|
|
207
|
+
"--auth-bearer",
|
|
208
|
+
type=str,
|
|
209
|
+
metavar="TOKEN",
|
|
210
|
+
help="Bearer token for authentication",
|
|
211
|
+
)
|
|
212
|
+
auth_group.add_argument(
|
|
213
|
+
"--auth-basic",
|
|
214
|
+
type=str,
|
|
215
|
+
metavar="USER:PASS",
|
|
216
|
+
help="Basic auth credentials (username:password)",
|
|
217
|
+
)
|
|
218
|
+
auth_group.add_argument(
|
|
219
|
+
"--auth-cookie",
|
|
220
|
+
type=str,
|
|
221
|
+
metavar="COOKIE",
|
|
222
|
+
help="Cookie string for authentication",
|
|
223
|
+
)
|
|
224
|
+
auth_group.add_argument(
|
|
225
|
+
"--auth-header",
|
|
226
|
+
nargs=2,
|
|
227
|
+
metavar=("NAME", "VALUE"),
|
|
228
|
+
help="Custom auth header (name value)",
|
|
229
|
+
)
|
|
230
|
+
|
|
191
231
|
# Cache settings
|
|
192
232
|
cache_group = parser.add_argument_group("cache settings")
|
|
193
233
|
cache_group.add_argument(
|
|
@@ -214,6 +254,11 @@ Examples:
|
|
|
214
254
|
action="store_true",
|
|
215
255
|
help="Re-fetch pages even if unchanged",
|
|
216
256
|
)
|
|
257
|
+
cache_group.add_argument(
|
|
258
|
+
"--resume",
|
|
259
|
+
action="store_true",
|
|
260
|
+
help="Resume from previous interrupted run (requires --cache)",
|
|
261
|
+
)
|
|
217
262
|
|
|
218
263
|
# Output control
|
|
219
264
|
output_group = parser.add_argument_group("output control")
|
|
@@ -222,6 +267,11 @@ Examples:
|
|
|
222
267
|
action="store_true",
|
|
223
268
|
help="Show what would be fetched without downloading",
|
|
224
269
|
)
|
|
270
|
+
output_group.add_argument(
|
|
271
|
+
"--preview-urls",
|
|
272
|
+
action="store_true",
|
|
273
|
+
help="List discovered URLs without fetching",
|
|
274
|
+
)
|
|
225
275
|
output_group.add_argument(
|
|
226
276
|
"--verbose",
|
|
227
277
|
"-v",
|
|
@@ -262,8 +312,13 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
262
312
|
}
|
|
263
313
|
|
|
264
314
|
# Output settings
|
|
315
|
+
output_kwargs: dict = {}
|
|
265
316
|
if args.output_dir:
|
|
266
|
-
|
|
317
|
+
output_kwargs["directory"] = args.output_dir
|
|
318
|
+
if args.format:
|
|
319
|
+
output_kwargs["format"] = args.format
|
|
320
|
+
if output_kwargs:
|
|
321
|
+
config_kwargs["output"] = output_kwargs
|
|
267
322
|
|
|
268
323
|
# Crawl settings
|
|
269
324
|
crawl_kwargs: dict = {}
|
|
@@ -277,6 +332,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
277
332
|
crawl_kwargs["rate_limit"] = args.rate_limit
|
|
278
333
|
if args.javascript:
|
|
279
334
|
crawl_kwargs["javascript"] = True
|
|
335
|
+
if args.adaptive_rate_limit:
|
|
336
|
+
crawl_kwargs["adaptive_rate_limit"] = True
|
|
280
337
|
if args.include_paths:
|
|
281
338
|
crawl_kwargs["include_paths"] = args.include_paths
|
|
282
339
|
if args.exclude_paths:
|
|
@@ -304,9 +361,33 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
304
361
|
if network_kwargs:
|
|
305
362
|
config_kwargs["network"] = network_kwargs
|
|
306
363
|
|
|
364
|
+
# Authentication settings
|
|
365
|
+
auth_kwargs: dict = {}
|
|
366
|
+
if args.auth_bearer:
|
|
367
|
+
auth_kwargs["type"] = "bearer"
|
|
368
|
+
auth_kwargs["token"] = args.auth_bearer
|
|
369
|
+
elif args.auth_basic:
|
|
370
|
+
auth_kwargs["type"] = "basic"
|
|
371
|
+
if ":" in args.auth_basic:
|
|
372
|
+
username, password = args.auth_basic.split(":", 1)
|
|
373
|
+
auth_kwargs["username"] = username
|
|
374
|
+
auth_kwargs["password"] = password
|
|
375
|
+
else:
|
|
376
|
+
console.print("[red]Error:[/red] --auth-basic requires format username:password")
|
|
377
|
+
return 1
|
|
378
|
+
elif args.auth_cookie:
|
|
379
|
+
auth_kwargs["type"] = "cookie"
|
|
380
|
+
auth_kwargs["cookie"] = args.auth_cookie
|
|
381
|
+
elif args.auth_header:
|
|
382
|
+
auth_kwargs["type"] = "header"
|
|
383
|
+
auth_kwargs["header_name"] = args.auth_header[0]
|
|
384
|
+
auth_kwargs["header_value"] = args.auth_header[1]
|
|
385
|
+
if auth_kwargs:
|
|
386
|
+
config_kwargs["auth"] = auth_kwargs
|
|
387
|
+
|
|
307
388
|
# Cache settings
|
|
308
389
|
cache_kwargs: dict = {}
|
|
309
|
-
if args.cache:
|
|
390
|
+
if args.cache or args.resume:
|
|
310
391
|
cache_kwargs["enabled"] = True
|
|
311
392
|
if args.cache_dir:
|
|
312
393
|
cache_kwargs["directory"] = args.cache_dir
|
|
@@ -314,6 +395,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
314
395
|
cache_kwargs["ttl_days"] = args.cache_ttl
|
|
315
396
|
if args.no_skip_unchanged:
|
|
316
397
|
cache_kwargs["skip_unchanged"] = False
|
|
398
|
+
if args.resume:
|
|
399
|
+
cache_kwargs["resume"] = True
|
|
317
400
|
if cache_kwargs:
|
|
318
401
|
config_kwargs["cache"] = cache_kwargs
|
|
319
402
|
|
|
@@ -338,9 +421,23 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
338
421
|
|
|
339
422
|
try:
|
|
340
423
|
async with Fetcher(config) as fetcher:
|
|
424
|
+
# Handle --preview-urls mode
|
|
425
|
+
if args.preview_urls:
|
|
426
|
+
urls = await fetcher.discover()
|
|
427
|
+
console.print(f"[bold]Discovered {len(urls)} URLs:[/bold]")
|
|
428
|
+
for url in urls:
|
|
429
|
+
console.print(f" {url}")
|
|
430
|
+
return 0
|
|
431
|
+
|
|
432
|
+
# Track skip reasons for summary
|
|
433
|
+
from collections import defaultdict
|
|
434
|
+
|
|
435
|
+
skip_counts: dict[SkipReason, int] = defaultdict(int)
|
|
436
|
+
|
|
341
437
|
if args.quiet:
|
|
342
|
-
async for
|
|
343
|
-
|
|
438
|
+
async for event in fetcher.run():
|
|
439
|
+
if event.type == EventType.FETCH_SKIPPED and event.skip_reason:
|
|
440
|
+
skip_counts[event.skip_reason] += 1
|
|
344
441
|
else:
|
|
345
442
|
with Progress(
|
|
346
443
|
SpinnerColumn(),
|
|
@@ -353,6 +450,10 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
353
450
|
async for event in fetcher.run():
|
|
354
451
|
if event.type == EventType.STARTED:
|
|
355
452
|
progress.update(task, description=f"[cyan]{event.message}")
|
|
453
|
+
elif event.type == EventType.RESUMED:
|
|
454
|
+
progress.update(
|
|
455
|
+
task, description=f"[yellow]Resuming with {event.total} pending URLs"
|
|
456
|
+
)
|
|
356
457
|
elif event.type == EventType.DISCOVERY_STARTED:
|
|
357
458
|
progress.update(task, description="[cyan]Discovering URLs...")
|
|
358
459
|
elif event.type == EventType.DISCOVERY_COMPLETE:
|
|
@@ -362,6 +463,12 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
362
463
|
task,
|
|
363
464
|
description=f"[cyan]Fetching {event.current}/{event.total}: {event.url}",
|
|
364
465
|
)
|
|
466
|
+
elif event.type == EventType.FETCH_SKIPPED:
|
|
467
|
+
if event.skip_reason:
|
|
468
|
+
skip_counts[event.skip_reason] += 1
|
|
469
|
+
if args.verbose:
|
|
470
|
+
reason = event.skip_reason.value if event.skip_reason else "unknown"
|
|
471
|
+
console.print(f"[dim]Skipped: {event.url} ({reason})[/dim]")
|
|
365
472
|
elif event.type == EventType.FETCH_FAILED:
|
|
366
473
|
console.print(f"[red]Failed:[/red] {event.url} - {event.error}")
|
|
367
474
|
elif event.type == EventType.COMPLETED:
|
|
@@ -378,6 +485,13 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
378
485
|
console.print(f" Pages failed: {stats.pages_failed}")
|
|
379
486
|
console.print(f" Duration: {stats.duration_seconds:.1f}s")
|
|
380
487
|
|
|
488
|
+
# Print skip reason summary if there were skips
|
|
489
|
+
if skip_counts:
|
|
490
|
+
console.print()
|
|
491
|
+
console.print("[bold]Skip Summary:[/bold]")
|
|
492
|
+
for reason, count in sorted(skip_counts.items(), key=lambda x: -x[1]):
|
|
493
|
+
console.print(f" {reason.value}: {count}")
|
|
494
|
+
|
|
381
495
|
return 0 if stats.pages_failed == 0 else 1
|
|
382
496
|
|
|
383
497
|
except Exception as e:
|
|
@@ -391,7 +505,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
|
|
|
391
505
|
return asyncio.run(run())
|
|
392
506
|
|
|
393
507
|
|
|
394
|
-
def main(argv:
|
|
508
|
+
def main(argv: list[str] | None = None) -> int:
|
|
395
509
|
"""Main entry point."""
|
|
396
510
|
parser = create_parser()
|
|
397
511
|
args = parser.parse_args(argv)
|
|
@@ -64,7 +64,7 @@ class BrowserContextPool:
|
|
|
64
64
|
"""
|
|
65
65
|
if not PLAYWRIGHT_AVAILABLE:
|
|
66
66
|
raise ImportError(
|
|
67
|
-
"Playwright is required for JavaScript rendering.
|
|
67
|
+
"Playwright is required for JavaScript rendering. Install with: pip install docpull[js]"
|
|
68
68
|
)
|
|
69
69
|
|
|
70
70
|
self._max_contexts = max_contexts
|
|
@@ -280,8 +280,7 @@ class BrowserFetcher:
|
|
|
280
280
|
|
|
281
281
|
if response is None or response.status >= 400:
|
|
282
282
|
logger.warning(
|
|
283
|
-
f"Browser fetch failed for {url}: "
|
|
284
|
-
f"status={response.status if response else 'None'}"
|
|
283
|
+
f"Browser fetch failed for {url}: status={response.status if response else 'None'}"
|
|
285
284
|
)
|
|
286
285
|
return None
|
|
287
286
|
|
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
"""Main content extraction from HTML pages."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
6
|
import re
|
|
5
|
-
from typing import Optional
|
|
6
7
|
from urllib.parse import urljoin, urlparse
|
|
7
8
|
|
|
8
9
|
from bs4 import BeautifulSoup, Tag
|
|
@@ -103,8 +104,8 @@ class MainContentExtractor:
|
|
|
103
104
|
|
|
104
105
|
def __init__(
|
|
105
106
|
self,
|
|
106
|
-
content_selectors:
|
|
107
|
-
remove_selectors:
|
|
107
|
+
content_selectors: list[str] | None = None,
|
|
108
|
+
remove_selectors: list[str] | None = None,
|
|
108
109
|
preserve_images: bool = True,
|
|
109
110
|
preserve_code_blocks: bool = True,
|
|
110
111
|
):
|
|
@@ -146,7 +147,7 @@ class MainContentExtractor:
|
|
|
146
147
|
text = html.decode("utf-8", errors="replace")
|
|
147
148
|
return BeautifulSoup(text, "html.parser")
|
|
148
149
|
|
|
149
|
-
def _find_main_content(self, soup: BeautifulSoup) ->
|
|
150
|
+
def _find_main_content(self, soup: BeautifulSoup) -> Tag | None:
|
|
150
151
|
"""Find the main content element using selectors."""
|
|
151
152
|
for selector in self._content_selectors:
|
|
152
153
|
element = soup.select_one(selector)
|
|
@@ -1,8 +1,10 @@
|
|
|
1
1
|
"""HTML to Markdown conversion."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
3
5
|
import logging
|
|
4
6
|
import re
|
|
5
|
-
from typing import Any
|
|
7
|
+
from typing import Any
|
|
6
8
|
from urllib.parse import urljoin
|
|
7
9
|
|
|
8
10
|
import html2text
|
|
@@ -153,9 +155,9 @@ class FrontmatterBuilder:
|
|
|
153
155
|
|
|
154
156
|
def build(
|
|
155
157
|
self,
|
|
156
|
-
title:
|
|
157
|
-
url:
|
|
158
|
-
description:
|
|
158
|
+
title: str | None = None,
|
|
159
|
+
url: str | None = None,
|
|
160
|
+
description: str | None = None,
|
|
159
161
|
**extra_fields: Any,
|
|
160
162
|
) -> str:
|
|
161
163
|
"""
|