docpull 2.0.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {docpull-2.0.0/src/docpull.egg-info → docpull-2.2.0}/PKG-INFO +3 -2
  2. {docpull-2.0.0 → docpull-2.2.0}/README.md +1 -0
  3. {docpull-2.0.0 → docpull-2.2.0}/pyproject.toml +3 -3
  4. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/__init__.py +1 -1
  5. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/manager.py +132 -18
  6. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/streaming_dedup.py +6 -5
  7. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cli.py +121 -7
  8. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/browser_pool.py +2 -3
  9. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/extractor.py +5 -4
  10. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/markdown.py +6 -4
  11. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/core/fetcher.py +169 -22
  12. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/__init__.py +17 -0
  13. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/crawler.py +24 -11
  14. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/filters.py +5 -4
  15. docpull-2.2.0/src/docpull/discovery/link_extractors/__init__.py +22 -0
  16. docpull-2.2.0/src/docpull/discovery/link_extractors/browser.py +294 -0
  17. docpull-2.2.0/src/docpull/discovery/link_extractors/enhanced.py +315 -0
  18. docpull-2.2.0/src/docpull/discovery/link_extractors/protocols.py +33 -0
  19. docpull-2.2.0/src/docpull/discovery/link_extractors/static.py +160 -0
  20. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/sitemap.py +39 -10
  21. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/doctor.py +5 -4
  22. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/__init__.py +2 -1
  23. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/client.py +47 -15
  24. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/protocols.py +4 -2
  25. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/http/rate_limiter.py +115 -4
  26. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/metadata_extractor.py +16 -14
  27. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/__init__.py +6 -1
  28. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/config.py +88 -13
  29. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/events.py +17 -0
  30. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/models/profiles.py +2 -0
  31. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/base.py +15 -13
  32. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/__init__.py +4 -0
  33. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/browser_fetch.py +1 -1
  34. docpull-2.2.0/src/docpull/pipeline/steps/save_json.py +191 -0
  35. docpull-2.2.0/src/docpull/pipeline/steps/save_sqlite.py +171 -0
  36. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/security/robots.py +7 -6
  37. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/security/url_validator.py +10 -9
  38. {docpull-2.0.0 → docpull-2.2.0/src/docpull.egg-info}/PKG-INFO +3 -2
  39. {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/SOURCES.txt +8 -0
  40. docpull-2.2.0/tests/test_link_extractors.py +270 -0
  41. {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_conversion.py +0 -1
  42. {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_discovery.py +1 -0
  43. {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_integration.py +1 -0
  44. {docpull-2.0.0 → docpull-2.2.0}/tests/test_v2_pipeline.py +1 -0
  45. {docpull-2.0.0 → docpull-2.2.0}/LICENSE +0 -0
  46. {docpull-2.0.0 → docpull-2.2.0}/setup.cfg +0 -0
  47. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/__main__.py +0 -0
  48. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/cache/__init__.py +0 -0
  49. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/__init__.py +0 -0
  50. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/concurrency/manager.py +0 -0
  51. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/__init__.py +0 -0
  52. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/conversion/protocols.py +0 -0
  53. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/core/__init__.py +0 -0
  54. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/composite.py +0 -0
  55. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/discovery/protocols.py +0 -0
  56. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/logging_config.py +0 -0
  57. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/__init__.py +0 -0
  58. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/convert.py +0 -0
  59. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  60. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/fetch.py +0 -0
  61. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  62. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/save.py +0 -0
  63. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/pipeline/steps/validate.py +0 -0
  64. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/py.typed +0 -0
  65. {docpull-2.0.0 → docpull-2.2.0}/src/docpull/security/__init__.py +0 -0
  66. {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  67. {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/entry_points.txt +0 -0
  68. {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/requires.txt +0 -0
  69. {docpull-2.0.0 → docpull-2.2.0}/src/docpull.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 2.0.0
3
+ Version: 2.2.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -36,7 +36,7 @@ Classifier: Programming Language :: Python :: 3.13
36
36
  Classifier: Programming Language :: Python :: 3.14
37
37
  Classifier: Programming Language :: Python :: 3 :: Only
38
38
  Classifier: Typing :: Typed
39
- Requires-Python: >=3.9
39
+ Requires-Python: >=3.10
40
40
  Description-Content-Type: text/markdown
41
41
  License-File: LICENSE
42
42
  Requires-Dist: requests>=2.31.0
@@ -81,6 +81,7 @@ Dynamic: license-file
81
81
 
82
82
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
83
83
  [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
84
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
84
85
  [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
85
86
 
86
87
  ## Install
@@ -4,6 +4,7 @@
4
4
 
5
5
  [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
6
6
  [![PyPI version](https://badge.fury.io/py/docpull.svg)](https://badge.fury.io/py/docpull)
7
+ [![Downloads](https://pepy.tech/badge/docpull)](https://pepy.tech/project/docpull)
7
8
  [![License: MIT](https://img.shields.io/github/license/raintree-technology/docpull)](https://github.com/raintree-technology/docpull/blob/main/LICENSE)
8
9
 
9
10
  ## Install
@@ -4,11 +4,11 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "2.0.0"
7
+ version = "2.2.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
11
- requires-python = ">=3.9"
11
+ requires-python = ">=3.10"
12
12
  license = "MIT"
13
13
  license-files = ["LICENSE"]
14
14
  authors = [
@@ -137,7 +137,7 @@ select = ["E", "F", "W", "I", "N", "UP", "B", "A", "C4", "SIM"]
137
137
  ignore = ["A003"] # Allow 'type' and 'format' as field names in data models
138
138
 
139
139
  [tool.mypy]
140
- python_version = "3.9"
140
+ python_version = "3.10"
141
141
  warn_return_any = true
142
142
  warn_unused_configs = true
143
143
  disallow_untyped_defs = true
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "2.0.0"
17
+ __version__ = "2.2.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .core.fetcher import Fetcher, fetch_blocking
@@ -1,11 +1,13 @@
1
1
  """Cache management for update detection and incremental fetching."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import hashlib
4
6
  import json
5
7
  import logging
6
8
  from datetime import datetime, timedelta
7
9
  from pathlib import Path
8
- from typing import Optional, TypedDict, Union
10
+ from typing import TypedDict
9
11
 
10
12
  logger = logging.getLogger(__name__)
11
13
 
@@ -29,7 +31,15 @@ class CacheState(TypedDict, total=False):
29
31
 
30
32
  fetched_urls: list[str]
31
33
  failed_urls: list[str]
32
- last_run: Optional[str]
34
+ last_run: str | None
35
+
36
+
37
+ class DiscoveredUrlsState(TypedDict, total=False):
38
+ """Type for discovered URLs persistence (for resume capability)."""
39
+
40
+ start_url: str
41
+ discovered_at: str
42
+ urls: list[str]
33
43
 
34
44
 
35
45
  class _InternalState:
@@ -38,10 +48,10 @@ class _InternalState:
38
48
  def __init__(self) -> None:
39
49
  self.fetched_urls: set[str] = set()
40
50
  self.failed_urls: set[str] = set()
41
- self.last_run: Optional[str] = None
51
+ self.last_run: str | None = None
42
52
 
43
53
  @classmethod
44
- def from_cache_state(cls, state: CacheState) -> "_InternalState":
54
+ def from_cache_state(cls, state: CacheState) -> _InternalState:
45
55
  """Create internal state from serialized CacheState."""
46
56
  internal = cls()
47
57
  internal.fetched_urls = set(state.get("fetched_urls", []))
@@ -68,7 +78,7 @@ class CacheManager:
68
78
  - Consistent hashing: Uses bytes input for SHA-256 computation
69
79
  """
70
80
 
71
- def __init__(self, cache_dir: Path, ttl_days: Optional[int] = None):
81
+ def __init__(self, cache_dir: Path, ttl_days: int | None = None):
72
82
  """Initialize cache manager.
73
83
 
74
84
  Args:
@@ -81,6 +91,7 @@ class CacheManager:
81
91
 
82
92
  self.manifest_file = self.cache_dir / "manifest.json"
83
93
  self.state_file = self.cache_dir / "state.json"
94
+ self.discovered_urls_file = self.cache_dir / "discovered_urls.json"
84
95
 
85
96
  self.manifest: dict[str, ManifestEntry] = self._load_manifest()
86
97
  self._state: _InternalState = _InternalState.from_cache_state(self._load_state())
@@ -157,21 +168,21 @@ class CacheManager:
157
168
  self._save_manifest()
158
169
  self._save_state()
159
170
 
160
- def __enter__(self) -> "CacheManager":
171
+ def __enter__(self) -> CacheManager:
161
172
  """Context manager entry."""
162
173
  return self
163
174
 
164
175
  def __exit__(
165
176
  self,
166
- exc_type: Optional[type[BaseException]],
167
- exc_val: Optional[BaseException],
168
- exc_tb: Optional[object],
177
+ exc_type: type[BaseException] | None,
178
+ exc_val: BaseException | None,
179
+ exc_tb: object | None,
169
180
  ) -> None:
170
181
  """Context manager exit - auto-flush on exit."""
171
182
  self.flush()
172
183
 
173
184
  @staticmethod
174
- def compute_checksum(content: Union[str, bytes]) -> str:
185
+ def compute_checksum(content: str | bytes) -> str:
175
186
  """Compute SHA-256 checksum of content.
176
187
 
177
188
  Args:
@@ -187,9 +198,9 @@ class CacheManager:
187
198
  def has_changed(
188
199
  self,
189
200
  url: str,
190
- content: Optional[str] = None,
191
- etag: Optional[str] = None,
192
- last_modified: Optional[str] = None,
201
+ content: str | None = None,
202
+ etag: str | None = None,
203
+ last_modified: str | None = None,
193
204
  ) -> bool:
194
205
  """Check if content has changed since last fetch.
195
206
 
@@ -226,10 +237,10 @@ class CacheManager:
226
237
  def update_cache(
227
238
  self,
228
239
  url: str,
229
- content: Union[str, bytes],
240
+ content: str | bytes,
230
241
  file_path: Path,
231
- etag: Optional[str] = None,
232
- last_modified: Optional[str] = None,
242
+ etag: str | None = None,
243
+ last_modified: str | None = None,
233
244
  ) -> None:
234
245
  """Update cache entry for a URL.
235
246
 
@@ -317,7 +328,7 @@ class CacheManager:
317
328
  self.flush()
318
329
  logger.info("Cleared incremental state")
319
330
 
320
- def get_cache_stats(self) -> dict[str, Union[str, int, None]]:
331
+ def get_cache_stats(self) -> dict[str, str | int | None]:
321
332
  """Get cache statistics.
322
333
 
323
334
  Returns:
@@ -330,7 +341,7 @@ class CacheManager:
330
341
  "last_run": self._state.last_run,
331
342
  }
332
343
 
333
- def evict_expired(self, ttl_days: Optional[int] = None) -> int:
344
+ def evict_expired(self, ttl_days: int | None = None) -> int:
334
345
  """Remove cache entries older than TTL.
335
346
 
336
347
  Args:
@@ -386,3 +397,106 @@ class CacheManager:
386
397
  True if URL failed to fetch
387
398
  """
388
399
  return url in self._state.failed_urls
400
+
401
+ # Resume capability methods
402
+
403
+ def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
404
+ """Save discovered URLs for resume capability.
405
+
406
+ Args:
407
+ urls: List of discovered URLs
408
+ start_url: The starting URL for this crawl
409
+
410
+ Note:
411
+ This is written immediately (not batched) to ensure
412
+ URLs are persisted before fetching begins.
413
+ """
414
+ data: DiscoveredUrlsState = {
415
+ "start_url": start_url,
416
+ "discovered_at": datetime.now().isoformat(),
417
+ "urls": urls,
418
+ }
419
+ try:
420
+ with open(self.discovered_urls_file, "w", encoding="utf-8") as f:
421
+ json.dump(data, f, indent=2, ensure_ascii=False)
422
+ logger.info(f"Saved {len(urls)} discovered URLs for resume capability")
423
+ except Exception as e:
424
+ logger.error(f"Could not save discovered URLs: {e}")
425
+
426
+ def load_discovered_urls(self, start_url: str) -> list[str] | None:
427
+ """Load previously discovered URLs if they match the start URL.
428
+
429
+ Args:
430
+ start_url: The starting URL to match
431
+
432
+ Returns:
433
+ List of discovered URLs if found and matching, None otherwise
434
+ """
435
+ if not self.discovered_urls_file.exists():
436
+ return None
437
+
438
+ try:
439
+ with open(self.discovered_urls_file, encoding="utf-8") as f:
440
+ data: DiscoveredUrlsState = json.load(f)
441
+
442
+ if data.get("start_url") != start_url:
443
+ logger.info("Discovered URLs file exists but start_url doesn't match")
444
+ return None
445
+
446
+ urls = data.get("urls", [])
447
+ logger.info(f"Loaded {len(urls)} discovered URLs from previous run")
448
+ return urls
449
+ except Exception as e:
450
+ logger.warning(f"Could not load discovered URLs: {e}")
451
+ return None
452
+
453
+ def get_pending_urls(self, start_url: str) -> list[str] | None:
454
+ """Get URLs that were discovered but not yet fetched.
455
+
456
+ Args:
457
+ start_url: The starting URL to match
458
+
459
+ Returns:
460
+ List of pending URLs, or None if no resume data available
461
+ """
462
+ discovered = self.load_discovered_urls(start_url)
463
+ if discovered is None:
464
+ return None
465
+
466
+ # Filter out already-fetched URLs
467
+ fetched = self.get_fetched_urls()
468
+ pending = [url for url in discovered if url not in fetched]
469
+ logger.info(f"Found {len(pending)} pending URLs (out of {len(discovered)} discovered)")
470
+ return pending
471
+
472
+ def clear_discovered_urls(self) -> None:
473
+ """Clear discovered URLs file (called on successful completion).
474
+
475
+ This should be called after a successful fetch to clean up
476
+ the resume state.
477
+ """
478
+ if self.discovered_urls_file.exists():
479
+ try:
480
+ self.discovered_urls_file.unlink()
481
+ logger.info("Cleared discovered URLs file")
482
+ except Exception as e:
483
+ logger.warning(f"Could not clear discovered URLs file: {e}")
484
+
485
+ def has_resume_data(self, start_url: str) -> bool:
486
+ """Check if there is resume data available for the given URL.
487
+
488
+ Args:
489
+ start_url: The starting URL to check
490
+
491
+ Returns:
492
+ True if resume data exists and matches the start URL
493
+ """
494
+ if not self.discovered_urls_file.exists():
495
+ return False
496
+
497
+ try:
498
+ with open(self.discovered_urls_file, encoding="utf-8") as f:
499
+ data: DiscoveredUrlsState = json.load(f)
500
+ return data.get("start_url") == start_url
501
+ except Exception:
502
+ return False
@@ -1,8 +1,9 @@
1
1
  """Streaming deduplication for real-time duplicate detection during fetch."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import asyncio
4
6
  import hashlib
5
- from typing import Optional, Union
6
7
 
7
8
 
8
9
  class StreamingDeduplicator:
@@ -38,7 +39,7 @@ class StreamingDeduplicator:
38
39
  self._duplicates_found: int = 0
39
40
 
40
41
  @staticmethod
41
- def compute_hash(content: Union[str, bytes]) -> str:
42
+ def compute_hash(content: str | bytes) -> str:
42
43
  """
43
44
  Compute SHA-256 hash of content.
44
45
 
@@ -59,8 +60,8 @@ class StreamingDeduplicator:
59
60
  async def check_and_register(
60
61
  self,
61
62
  url: str,
62
- content: Union[str, bytes],
63
- ) -> tuple[bool, Optional[str]]:
63
+ content: str | bytes,
64
+ ) -> tuple[bool, str | None]:
64
65
  """
65
66
  Check if content is a duplicate and register if new.
66
67
 
@@ -89,7 +90,7 @@ class StreamingDeduplicator:
89
90
  self._seen[content_hash] = url
90
91
  return (True, None)
91
92
 
92
- async def is_duplicate(self, content: Union[str, bytes]) -> bool:
93
+ async def is_duplicate(self, content: str | bytes) -> bool:
93
94
  """
94
95
  Check if content has been seen before (read-only).
95
96
 
@@ -1,10 +1,11 @@
1
1
  """Command-line interface for docpull."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import argparse
4
6
  import asyncio
5
7
  import sys
6
8
  from pathlib import Path
7
- from typing import Optional
8
9
 
9
10
  # Check if --doctor flag is present before checking dependencies
10
11
  if "--doctor" in sys.argv:
@@ -43,7 +44,7 @@ from rich.progress import Progress, SpinnerColumn, TextColumn
43
44
  from . import __version__
44
45
  from .core.fetcher import Fetcher
45
46
  from .models.config import DocpullConfig, ProfileName
46
- from .models.events import EventType
47
+ from .models.events import EventType, SkipReason
47
48
 
48
49
 
49
50
  def create_parser() -> argparse.ArgumentParser:
@@ -106,6 +107,13 @@ Examples:
106
107
  default=None,
107
108
  help="Output directory (default: ./docs)",
108
109
  )
110
+ parser.add_argument(
111
+ "--format",
112
+ "-f",
113
+ choices=["markdown", "json", "sqlite"],
114
+ default="markdown",
115
+ help="Output format (default: markdown)",
116
+ )
109
117
 
110
118
  # Crawl settings
111
119
  crawl_group = parser.add_argument_group("crawl settings")
@@ -153,6 +161,11 @@ Examples:
153
161
  dest="javascript",
154
162
  help="Enable JavaScript rendering (requires Playwright)",
155
163
  )
164
+ crawl_group.add_argument(
165
+ "--adaptive-rate-limit",
166
+ action="store_true",
167
+ help="Automatically adjust rate limits based on server responses",
168
+ )
156
169
 
157
170
  # Content filtering
158
171
  filter_group = parser.add_argument_group("content filtering")
@@ -188,6 +201,33 @@ Examples:
188
201
  help="Maximum retry attempts",
189
202
  )
190
203
 
204
+ # Authentication settings
205
+ auth_group = parser.add_argument_group("authentication")
206
+ auth_group.add_argument(
207
+ "--auth-bearer",
208
+ type=str,
209
+ metavar="TOKEN",
210
+ help="Bearer token for authentication",
211
+ )
212
+ auth_group.add_argument(
213
+ "--auth-basic",
214
+ type=str,
215
+ metavar="USER:PASS",
216
+ help="Basic auth credentials (username:password)",
217
+ )
218
+ auth_group.add_argument(
219
+ "--auth-cookie",
220
+ type=str,
221
+ metavar="COOKIE",
222
+ help="Cookie string for authentication",
223
+ )
224
+ auth_group.add_argument(
225
+ "--auth-header",
226
+ nargs=2,
227
+ metavar=("NAME", "VALUE"),
228
+ help="Custom auth header (name value)",
229
+ )
230
+
191
231
  # Cache settings
192
232
  cache_group = parser.add_argument_group("cache settings")
193
233
  cache_group.add_argument(
@@ -214,6 +254,11 @@ Examples:
214
254
  action="store_true",
215
255
  help="Re-fetch pages even if unchanged",
216
256
  )
257
+ cache_group.add_argument(
258
+ "--resume",
259
+ action="store_true",
260
+ help="Resume from previous interrupted run (requires --cache)",
261
+ )
217
262
 
218
263
  # Output control
219
264
  output_group = parser.add_argument_group("output control")
@@ -222,6 +267,11 @@ Examples:
222
267
  action="store_true",
223
268
  help="Show what would be fetched without downloading",
224
269
  )
270
+ output_group.add_argument(
271
+ "--preview-urls",
272
+ action="store_true",
273
+ help="List discovered URLs without fetching",
274
+ )
225
275
  output_group.add_argument(
226
276
  "--verbose",
227
277
  "-v",
@@ -262,8 +312,13 @@ def run_fetcher(args: argparse.Namespace) -> int:
262
312
  }
263
313
 
264
314
  # Output settings
315
+ output_kwargs: dict = {}
265
316
  if args.output_dir:
266
- config_kwargs["output"] = {"directory": args.output_dir}
317
+ output_kwargs["directory"] = args.output_dir
318
+ if args.format:
319
+ output_kwargs["format"] = args.format
320
+ if output_kwargs:
321
+ config_kwargs["output"] = output_kwargs
267
322
 
268
323
  # Crawl settings
269
324
  crawl_kwargs: dict = {}
@@ -277,6 +332,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
277
332
  crawl_kwargs["rate_limit"] = args.rate_limit
278
333
  if args.javascript:
279
334
  crawl_kwargs["javascript"] = True
335
+ if args.adaptive_rate_limit:
336
+ crawl_kwargs["adaptive_rate_limit"] = True
280
337
  if args.include_paths:
281
338
  crawl_kwargs["include_paths"] = args.include_paths
282
339
  if args.exclude_paths:
@@ -304,9 +361,33 @@ def run_fetcher(args: argparse.Namespace) -> int:
304
361
  if network_kwargs:
305
362
  config_kwargs["network"] = network_kwargs
306
363
 
364
+ # Authentication settings
365
+ auth_kwargs: dict = {}
366
+ if args.auth_bearer:
367
+ auth_kwargs["type"] = "bearer"
368
+ auth_kwargs["token"] = args.auth_bearer
369
+ elif args.auth_basic:
370
+ auth_kwargs["type"] = "basic"
371
+ if ":" in args.auth_basic:
372
+ username, password = args.auth_basic.split(":", 1)
373
+ auth_kwargs["username"] = username
374
+ auth_kwargs["password"] = password
375
+ else:
376
+ console.print("[red]Error:[/red] --auth-basic requires format username:password")
377
+ return 1
378
+ elif args.auth_cookie:
379
+ auth_kwargs["type"] = "cookie"
380
+ auth_kwargs["cookie"] = args.auth_cookie
381
+ elif args.auth_header:
382
+ auth_kwargs["type"] = "header"
383
+ auth_kwargs["header_name"] = args.auth_header[0]
384
+ auth_kwargs["header_value"] = args.auth_header[1]
385
+ if auth_kwargs:
386
+ config_kwargs["auth"] = auth_kwargs
387
+
307
388
  # Cache settings
308
389
  cache_kwargs: dict = {}
309
- if args.cache:
390
+ if args.cache or args.resume:
310
391
  cache_kwargs["enabled"] = True
311
392
  if args.cache_dir:
312
393
  cache_kwargs["directory"] = args.cache_dir
@@ -314,6 +395,8 @@ def run_fetcher(args: argparse.Namespace) -> int:
314
395
  cache_kwargs["ttl_days"] = args.cache_ttl
315
396
  if args.no_skip_unchanged:
316
397
  cache_kwargs["skip_unchanged"] = False
398
+ if args.resume:
399
+ cache_kwargs["resume"] = True
317
400
  if cache_kwargs:
318
401
  config_kwargs["cache"] = cache_kwargs
319
402
 
@@ -338,9 +421,23 @@ def run_fetcher(args: argparse.Namespace) -> int:
338
421
 
339
422
  try:
340
423
  async with Fetcher(config) as fetcher:
424
+ # Handle --preview-urls mode
425
+ if args.preview_urls:
426
+ urls = await fetcher.discover()
427
+ console.print(f"[bold]Discovered {len(urls)} URLs:[/bold]")
428
+ for url in urls:
429
+ console.print(f" {url}")
430
+ return 0
431
+
432
+ # Track skip reasons for summary
433
+ from collections import defaultdict
434
+
435
+ skip_counts: dict[SkipReason, int] = defaultdict(int)
436
+
341
437
  if args.quiet:
342
- async for _ in fetcher.run():
343
- pass
438
+ async for event in fetcher.run():
439
+ if event.type == EventType.FETCH_SKIPPED and event.skip_reason:
440
+ skip_counts[event.skip_reason] += 1
344
441
  else:
345
442
  with Progress(
346
443
  SpinnerColumn(),
@@ -353,6 +450,10 @@ def run_fetcher(args: argparse.Namespace) -> int:
353
450
  async for event in fetcher.run():
354
451
  if event.type == EventType.STARTED:
355
452
  progress.update(task, description=f"[cyan]{event.message}")
453
+ elif event.type == EventType.RESUMED:
454
+ progress.update(
455
+ task, description=f"[yellow]Resuming with {event.total} pending URLs"
456
+ )
356
457
  elif event.type == EventType.DISCOVERY_STARTED:
357
458
  progress.update(task, description="[cyan]Discovering URLs...")
358
459
  elif event.type == EventType.DISCOVERY_COMPLETE:
@@ -362,6 +463,12 @@ def run_fetcher(args: argparse.Namespace) -> int:
362
463
  task,
363
464
  description=f"[cyan]Fetching {event.current}/{event.total}: {event.url}",
364
465
  )
466
+ elif event.type == EventType.FETCH_SKIPPED:
467
+ if event.skip_reason:
468
+ skip_counts[event.skip_reason] += 1
469
+ if args.verbose:
470
+ reason = event.skip_reason.value if event.skip_reason else "unknown"
471
+ console.print(f"[dim]Skipped: {event.url} ({reason})[/dim]")
365
472
  elif event.type == EventType.FETCH_FAILED:
366
473
  console.print(f"[red]Failed:[/red] {event.url} - {event.error}")
367
474
  elif event.type == EventType.COMPLETED:
@@ -378,6 +485,13 @@ def run_fetcher(args: argparse.Namespace) -> int:
378
485
  console.print(f" Pages failed: {stats.pages_failed}")
379
486
  console.print(f" Duration: {stats.duration_seconds:.1f}s")
380
487
 
488
+ # Print skip reason summary if there were skips
489
+ if skip_counts:
490
+ console.print()
491
+ console.print("[bold]Skip Summary:[/bold]")
492
+ for reason, count in sorted(skip_counts.items(), key=lambda x: -x[1]):
493
+ console.print(f" {reason.value}: {count}")
494
+
381
495
  return 0 if stats.pages_failed == 0 else 1
382
496
 
383
497
  except Exception as e:
@@ -391,7 +505,7 @@ def run_fetcher(args: argparse.Namespace) -> int:
391
505
  return asyncio.run(run())
392
506
 
393
507
 
394
- def main(argv: Optional[list[str]] = None) -> int:
508
+ def main(argv: list[str] | None = None) -> int:
395
509
  """Main entry point."""
396
510
  parser = create_parser()
397
511
  args = parser.parse_args(argv)
@@ -64,7 +64,7 @@ class BrowserContextPool:
64
64
  """
65
65
  if not PLAYWRIGHT_AVAILABLE:
66
66
  raise ImportError(
67
- "Playwright is required for JavaScript rendering. " "Install with: pip install docpull[js]"
67
+ "Playwright is required for JavaScript rendering. Install with: pip install docpull[js]"
68
68
  )
69
69
 
70
70
  self._max_contexts = max_contexts
@@ -280,8 +280,7 @@ class BrowserFetcher:
280
280
 
281
281
  if response is None or response.status >= 400:
282
282
  logger.warning(
283
- f"Browser fetch failed for {url}: "
284
- f"status={response.status if response else 'None'}"
283
+ f"Browser fetch failed for {url}: status={response.status if response else 'None'}"
285
284
  )
286
285
  return None
287
286
 
@@ -1,8 +1,9 @@
1
1
  """Main content extraction from HTML pages."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import logging
4
6
  import re
5
- from typing import Optional
6
7
  from urllib.parse import urljoin, urlparse
7
8
 
8
9
  from bs4 import BeautifulSoup, Tag
@@ -103,8 +104,8 @@ class MainContentExtractor:
103
104
 
104
105
  def __init__(
105
106
  self,
106
- content_selectors: Optional[list[str]] = None,
107
- remove_selectors: Optional[list[str]] = None,
107
+ content_selectors: list[str] | None = None,
108
+ remove_selectors: list[str] | None = None,
108
109
  preserve_images: bool = True,
109
110
  preserve_code_blocks: bool = True,
110
111
  ):
@@ -146,7 +147,7 @@ class MainContentExtractor:
146
147
  text = html.decode("utf-8", errors="replace")
147
148
  return BeautifulSoup(text, "html.parser")
148
149
 
149
- def _find_main_content(self, soup: BeautifulSoup) -> Optional[Tag]:
150
+ def _find_main_content(self, soup: BeautifulSoup) -> Tag | None:
150
151
  """Find the main content element using selectors."""
151
152
  for selector in self._content_selectors:
152
153
  element = soup.select_one(selector)
@@ -1,8 +1,10 @@
1
1
  """HTML to Markdown conversion."""
2
2
 
3
+ from __future__ import annotations
4
+
3
5
  import logging
4
6
  import re
5
- from typing import Any, Optional
7
+ from typing import Any
6
8
  from urllib.parse import urljoin
7
9
 
8
10
  import html2text
@@ -153,9 +155,9 @@ class FrontmatterBuilder:
153
155
 
154
156
  def build(
155
157
  self,
156
- title: Optional[str] = None,
157
- url: Optional[str] = None,
158
- description: Optional[str] = None,
158
+ title: str | None = None,
159
+ url: str | None = None,
160
+ description: str | None = None,
159
161
  **extra_fields: Any,
160
162
  ) -> str:
161
163
  """