docpull 3.0.2__tar.gz → 4.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90):
  1. {docpull-3.0.2/src/docpull.egg-info → docpull-4.0.0}/PKG-INFO +4 -2
  2. {docpull-3.0.2 → docpull-4.0.0}/pyproject.toml +4 -2
  3. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/__init__.py +1 -1
  4. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/cache/manager.py +0 -112
  5. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/cache/streaming_dedup.py +0 -17
  6. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/markdown.py +21 -6
  7. docpull-4.0.0/src/docpull/discovery/_fetch.py +33 -0
  8. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/crawler.py +2 -28
  9. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/link_extractors/enhanced.py +2 -27
  10. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/link_extractors/static.py +2 -28
  11. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/sitemap.py +2 -1
  12. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/http/client.py +40 -46
  13. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/mcp/tools.py +11 -1
  14. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/models/config.py +0 -5
  15. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/base.py +0 -13
  16. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/fetch.py +16 -2
  17. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/security/robots.py +7 -5
  18. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/security/url_validator.py +87 -32
  19. {docpull-3.0.2 → docpull-4.0.0/src/docpull.egg-info}/PKG-INFO +4 -2
  20. {docpull-3.0.2 → docpull-4.0.0}/src/docpull.egg-info/SOURCES.txt +2 -3
  21. {docpull-3.0.2 → docpull-4.0.0}/src/docpull.egg-info/requires.txt +3 -1
  22. {docpull-3.0.2 → docpull-4.0.0}/tests/test_cache_conditional_get.py +28 -0
  23. docpull-4.0.0/tests/test_ci_policy.py +38 -0
  24. {docpull-3.0.2 → docpull-4.0.0}/tests/test_conversion.py +34 -6
  25. {docpull-3.0.2 → docpull-4.0.0}/tests/test_discovery.py +33 -0
  26. {docpull-3.0.2 → docpull-4.0.0}/tests/test_mcp_tools.py +20 -0
  27. {docpull-3.0.2 → docpull-4.0.0}/tests/test_security_hardening.py +92 -0
  28. docpull-3.0.2/src/docpull/concurrency/__init__.py +0 -7
  29. docpull-3.0.2/src/docpull/concurrency/manager.py +0 -123
  30. docpull-3.0.2/src/docpull/logging_config.py +0 -53
  31. {docpull-3.0.2 → docpull-4.0.0}/LICENSE +0 -0
  32. {docpull-3.0.2 → docpull-4.0.0}/README.md +0 -0
  33. {docpull-3.0.2 → docpull-4.0.0}/setup.cfg +0 -0
  34. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/__main__.py +0 -0
  35. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/cache/__init__.py +0 -0
  36. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/cli.py +0 -0
  37. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/__init__.py +0 -0
  38. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/chunking.py +0 -0
  39. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/extractor.py +0 -0
  40. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/protocols.py +0 -0
  41. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/special_cases.py +0 -0
  42. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  43. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/core/__init__.py +0 -0
  44. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/core/fetcher.py +0 -0
  45. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/__init__.py +0 -0
  46. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/composite.py +0 -0
  47. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/filters.py +0 -0
  48. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  49. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  50. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/discovery/protocols.py +0 -0
  51. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/doctor.py +0 -0
  52. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/http/__init__.py +0 -0
  53. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/http/protocols.py +0 -0
  54. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/http/rate_limiter.py +0 -0
  55. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/mcp/__init__.py +0 -0
  56. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/mcp/server.py +0 -0
  57. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/mcp/sources.py +0 -0
  58. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/metadata_extractor.py +0 -0
  59. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/models/__init__.py +0 -0
  60. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/models/events.py +0 -0
  61. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/models/profiles.py +0 -0
  62. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/__init__.py +0 -0
  63. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
  64. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
  65. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/convert.py +0 -0
  66. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  67. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  68. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/save.py +0 -0
  69. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
  70. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
  71. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
  72. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
  73. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/py.typed +0 -0
  74. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/security/__init__.py +0 -0
  75. {docpull-3.0.2 → docpull-4.0.0}/src/docpull/time_utils.py +0 -0
  76. {docpull-3.0.2 → docpull-4.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  77. {docpull-3.0.2 → docpull-4.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
  78. {docpull-3.0.2 → docpull-4.0.0}/src/docpull.egg-info/top_level.txt +0 -0
  79. {docpull-3.0.2 → docpull-4.0.0}/tests/test_chunking.py +0 -0
  80. {docpull-3.0.2 → docpull-4.0.0}/tests/test_cli.py +0 -0
  81. {docpull-3.0.2 → docpull-4.0.0}/tests/test_convert_step_new.py +0 -0
  82. {docpull-3.0.2 → docpull-4.0.0}/tests/test_integration.py +0 -0
  83. {docpull-3.0.2 → docpull-4.0.0}/tests/test_link_extractors.py +0 -0
  84. {docpull-3.0.2 → docpull-4.0.0}/tests/test_mcp_server.py +0 -0
  85. {docpull-3.0.2 → docpull-4.0.0}/tests/test_naming.py +0 -0
  86. {docpull-3.0.2 → docpull-4.0.0}/tests/test_pipeline.py +0 -0
  87. {docpull-3.0.2 → docpull-4.0.0}/tests/test_real_site_regressions.py +0 -0
  88. {docpull-3.0.2 → docpull-4.0.0}/tests/test_save_ndjson.py +0 -0
  89. {docpull-3.0.2 → docpull-4.0.0}/tests/test_special_cases.py +0 -0
  90. {docpull-3.0.2 → docpull-4.0.0}/tests/test_time_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 3.0.2
3
+ Version: 4.0.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -42,7 +42,7 @@ Requires-Dist: beautifulsoup4>=4.12.0
42
42
  Requires-Dist: html2text>=2020.1.16
43
43
  Requires-Dist: defusedxml>=0.7.1
44
44
  Requires-Dist: extruct>=0.15.0
45
- Requires-Dist: aiohttp>=3.9.0
45
+ Requires-Dist: aiohttp>=3.14.0
46
46
  Requires-Dist: idna>=3.15
47
47
  Requires-Dist: regex>=2024.11.6
48
48
  Requires-Dist: rich>=13.0.0
@@ -59,6 +59,7 @@ Provides-Extra: tokens
59
59
  Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
60
60
  Provides-Extra: mcp
61
61
  Requires-Dist: mcp>=1.0.0; extra == "mcp"
62
+ Requires-Dist: pyjwt>=2.13.0; extra == "mcp"
62
63
  Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
63
64
  Requires-Dist: starlette>=1.0.1; extra == "mcp"
64
65
  Provides-Extra: llm
@@ -69,6 +70,7 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
69
70
  Requires-Dist: trafilatura>=1.12.0; extra == "all"
70
71
  Requires-Dist: tiktoken>=0.7.0; extra == "all"
71
72
  Requires-Dist: mcp>=1.0.0; extra == "all"
73
+ Requires-Dist: pyjwt>=2.13.0; extra == "all"
72
74
  Requires-Dist: python-multipart>=0.0.27; extra == "all"
73
75
  Requires-Dist: starlette>=1.0.1; extra == "all"
74
76
  Provides-Extra: dev
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "3.0.2"
7
+ version = "4.0.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -66,7 +66,7 @@ dependencies = [
66
66
  "html2text>=2020.1.16",
67
67
  "defusedxml>=0.7.1",
68
68
  "extruct>=0.15.0",
69
- "aiohttp>=3.9.0",
69
+ "aiohttp>=3.14.0", # 3.14.0 fixes CVE-2026-34993 and CVE-2026-47265
70
70
  "idna>=3.15",
71
71
  "regex>=2024.11.6",
72
72
  "rich>=13.0.0",
@@ -90,6 +90,7 @@ tokens = [
90
90
  ]
91
91
  mcp = [
92
92
  "mcp>=1.0.0",
93
+ "pyjwt>=2.13.0",
93
94
  "python-multipart>=0.0.27",
94
95
  "starlette>=1.0.1",
95
96
  ]
@@ -102,6 +103,7 @@ all = [
102
103
  "trafilatura>=1.12.0",
103
104
  "tiktoken>=0.7.0",
104
105
  "mcp>=1.0.0",
106
+ "pyjwt>=2.13.0",
105
107
  "python-multipart>=0.0.27",
106
108
  "starlette>=1.0.1",
107
109
  ]
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "3.0.2"
17
+ __version__ = "4.0.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -197,45 +197,6 @@ class CacheManager:
197
197
  content = content.encode("utf-8")
198
198
  return hashlib.sha256(content).hexdigest()
199
199
 
200
- def has_changed(
201
- self,
202
- url: str,
203
- content: str | None = None,
204
- etag: str | None = None,
205
- last_modified: str | None = None,
206
- ) -> bool:
207
- """Check if content has changed since last fetch.
208
-
209
- Args:
210
- url: URL to check
211
- content: Current content (for checksum comparison)
212
- etag: HTTP ETag header
213
- last_modified: HTTP Last-Modified header
214
-
215
- Returns:
216
- True if content has changed or is new
217
- """
218
- if url not in self.manifest:
219
- return True # New URL
220
-
221
- cached = self.manifest[url]
222
-
223
- # Check ETag first (most reliable)
224
- if etag and "etag" in cached:
225
- return bool(etag != cached["etag"])
226
-
227
- # Check Last-Modified
228
- if last_modified and "last_modified" in cached:
229
- return bool(last_modified != cached["last_modified"])
230
-
231
- # Check content checksum
232
- if content and "checksum" in cached:
233
- current_checksum = self.compute_checksum(content)
234
- return bool(current_checksum != cached["checksum"])
235
-
236
- # Can't determine, assume changed
237
- return True
238
-
239
200
  def update_cache(
240
201
  self,
241
202
  url: str,
@@ -302,14 +263,6 @@ class CacheManager:
302
263
  """
303
264
  return self._state.fetched_urls.copy()
304
265
 
305
- def get_failed_urls(self) -> set[str]:
306
- """Get set of URLs that failed to fetch.
307
-
308
- Returns:
309
- Set of failed URLs (copy to prevent mutation)
310
- """
311
- return self._state.failed_urls.copy()
312
-
313
266
  def start_session(self) -> None:
314
267
  """Start a new fetch session.
315
268
 
@@ -319,30 +272,6 @@ class CacheManager:
319
272
  self._state.last_run = utc_now_iso()
320
273
  self._state_dirty = True
321
274
 
322
- def clear_state(self) -> None:
323
- """Clear incremental state (for fresh start).
324
-
325
- Note:
326
- This immediately flushes to disk.
327
- """
328
- self._state = _InternalState()
329
- self._state_dirty = True
330
- self.flush()
331
- logger.info("Cleared incremental state")
332
-
333
- def get_cache_stats(self) -> dict[str, str | int | None]:
334
- """Get cache statistics.
335
-
336
- Returns:
337
- Dict with cache stats
338
- """
339
- return {
340
- "cached_urls": len(self.manifest),
341
- "fetched_urls": len(self._state.fetched_urls),
342
- "failed_urls": len(self._state.failed_urls),
343
- "last_run": self._state.last_run,
344
- }
345
-
346
275
  def evict_expired(self, ttl_days: int | None = None) -> int:
347
276
  """Remove cache entries older than TTL.
348
277
 
@@ -378,28 +307,6 @@ class CacheManager:
378
307
 
379
308
  return len(to_remove)
380
309
 
381
- def is_fetched(self, url: str) -> bool:
382
- """Check if URL has been fetched (O(1) lookup).
383
-
384
- Args:
385
- url: URL to check
386
-
387
- Returns:
388
- True if URL was successfully fetched
389
- """
390
- return url in self._state.fetched_urls
391
-
392
- def is_failed(self, url: str) -> bool:
393
- """Check if URL has failed (O(1) lookup).
394
-
395
- Args:
396
- url: URL to check
397
-
398
- Returns:
399
- True if URL failed to fetch
400
- """
401
- return url in self._state.failed_urls
402
-
403
310
  # Resume capability methods
404
311
 
405
312
  def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
@@ -483,22 +390,3 @@ class CacheManager:
483
390
  logger.info("Cleared discovered URLs file")
484
391
  except Exception as e:
485
392
  logger.warning(f"Could not clear discovered URLs file: {e}")
486
-
487
- def has_resume_data(self, start_url: str) -> bool:
488
- """Check if there is resume data available for the given URL.
489
-
490
- Args:
491
- start_url: The starting URL to check
492
-
493
- Returns:
494
- True if resume data exists and matches the start URL
495
- """
496
- if not self.discovered_urls_file.exists():
497
- return False
498
-
499
- try:
500
- with open(self.discovered_urls_file, encoding="utf-8") as f:
501
- data: DiscoveredUrlsState = json.load(f)
502
- return data.get("start_url") == start_url
503
- except Exception:
504
- return False
@@ -90,23 +90,6 @@ class StreamingDeduplicator:
90
90
  self._seen[content_hash] = url
91
91
  return (True, None)
92
92
 
93
- async def is_duplicate(self, content: str | bytes) -> bool:
94
- """
95
- Check if content has been seen before (read-only).
96
-
97
- Unlike check_and_register, this doesn't register the content.
98
- Useful for checking without committing to save.
99
-
100
- Args:
101
- content: The content to check (str or bytes)
102
-
103
- Returns:
104
- True if content has been seen before
105
- """
106
- content_hash = self.compute_hash(content)
107
- async with self._lock:
108
- return content_hash in self._seen
109
-
110
93
  def get_stats(self) -> dict:
111
94
  """
112
95
  Get deduplication statistics.
@@ -215,6 +215,17 @@ class FrontmatterBuilder:
215
215
  )
216
216
  """
217
217
 
218
+ @staticmethod
219
+ def _inline(value: Any) -> str:
220
+ """Collapse CR/LF/NUL so an interpolated value stays on its own YAML line.
221
+
222
+ Page-supplied metadata (JSON-LD ``keywords``, OpenGraph ``article:tag``,
223
+ etc.) flows into frontmatter. Without this, a newline in a tag/keyword
224
+ would break out of the list item and inject attacker-chosen top-level
225
+ keys (e.g. ``draft: true``) into the document frontmatter.
226
+ """
227
+ return str(value).replace("\r", " ").replace("\n", " ").replace("\x00", " ")
228
+
218
229
  def build(
219
230
  self,
220
231
  title: str | None = None,
@@ -238,28 +249,32 @@ class FrontmatterBuilder:
238
249
 
239
250
  if title:
240
251
  # Escape quotes in title
241
- safe_title = title.replace('"', '\\"')
252
+ safe_title = self._inline(title).replace('"', '\\"')
242
253
  lines.append(f'title: "{safe_title}"')
243
254
 
244
255
  if url:
245
- lines.append(f"source: {url}")
256
+ lines.append(f"source: {self._inline(url)}")
246
257
 
247
258
  if description:
248
259
  # Escape quotes and truncate long descriptions
249
- safe_desc = description[:500].replace('"', '\\"')
260
+ safe_desc = self._inline(description[:500]).replace('"', '\\"')
250
261
  lines.append(f'description: "{safe_desc}"')
251
262
 
252
263
  for key, value in extra_fields.items():
253
264
  if value is not None:
254
265
  if isinstance(value, str):
255
- safe_value = value.replace('"', '\\"')
266
+ safe_value = self._inline(value).replace('"', '\\"')
256
267
  lines.append(f'{key}: "{safe_value}"')
257
268
  elif isinstance(value, (list, tuple)):
258
269
  lines.append(f"{key}:")
259
270
  for item in value:
260
- lines.append(f" - {item}")
271
+ # Quote + escape each item so a hostile tag/keyword (from
272
+ # page JSON-LD / OpenGraph) stays a single YAML string and
273
+ # cannot inject new keys or produce malformed frontmatter.
274
+ safe_item = self._inline(item).replace('"', '\\"')
275
+ lines.append(f' - "{safe_item}"')
261
276
  else:
262
- lines.append(f"{key}: {value}")
277
+ lines.append(f"{key}: {self._inline(value)}")
263
278
 
264
279
  lines.append("---")
265
280
  return "\n".join(lines) + "\n\n"
@@ -0,0 +1,33 @@
1
+ """Shared HTML fetch helper for link-discovery components."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ..http.protocols import HttpClient
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ async def fetch_html(client: HttpClient, url: str) -> bytes | None:
13
+ """Fetch ``url`` and return its body iff it is a successful HTML response.
14
+
15
+ Returns ``None`` on network error, non-200 status, or a non-HTML content
16
+ type. Shared by the crawler and the static/enhanced link extractors so the
17
+ fetch and content-type gate stay identical across all three.
18
+ """
19
+ try:
20
+ response = await client.get(url, timeout=30.0)
21
+
22
+ if response.status_code != 200:
23
+ return None
24
+
25
+ content_type = response.content_type.lower()
26
+ if "text/html" not in content_type and "application/xhtml" not in content_type:
27
+ return None
28
+
29
+ return response.content
30
+
31
+ except Exception as e:
32
+ logger.debug(f"Failed to fetch {url}: {e}")
33
+ return None
@@ -13,6 +13,7 @@ from bs4 import BeautifulSoup
13
13
  from ..http.protocols import HttpClient
14
14
  from ..security.robots import RobotsChecker
15
15
  from ..security.url_validator import UrlValidator
16
+ from ._fetch import fetch_html
16
17
  from .filters import DomainFilter, PatternFilter, SeenUrlTracker
17
18
 
18
19
  if TYPE_CHECKING:
@@ -114,33 +115,6 @@ class LinkCrawler:
114
115
 
115
116
  return links
116
117
 
117
- async def _fetch_page(self, url: str) -> bytes | None:
118
- """
119
- Fetch a page for link extraction.
120
-
121
- Args:
122
- url: URL to fetch
123
-
124
- Returns:
125
- HTML content as bytes, or None if fetch failed
126
- """
127
- try:
128
- response = await self._client.get(url, timeout=30.0)
129
-
130
- if response.status_code != 200:
131
- return None
132
-
133
- # Only process HTML content
134
- content_type = response.content_type.lower()
135
- if "text/html" not in content_type and "application/xhtml" not in content_type:
136
- return None
137
-
138
- return response.content
139
-
140
- except Exception as e:
141
- logger.debug(f"Failed to fetch {url}: {e}")
142
- return None
143
-
144
118
  def _should_crawl(self, url: str) -> bool:
145
119
  """
146
120
  Check if a URL should be crawled.
@@ -225,7 +199,7 @@ class LinkCrawler:
225
199
  links = await self._link_extractor.extract_links(current_url)
226
200
  else:
227
201
  # Built-in extraction with separate fetch
228
- html = await self._fetch_page(current_url)
202
+ html = await fetch_html(self._client, current_url)
229
203
  if html is None:
230
204
  continue
231
205
  links = self._extract_links(html, current_url)
@@ -10,6 +10,7 @@ from urllib.parse import urljoin, urlparse
10
10
  from bs4 import BeautifulSoup
11
11
 
12
12
  from ...http.protocols import HttpClient
13
+ from .._fetch import fetch_html
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
@@ -104,7 +105,7 @@ class EnhancedLinkExtractor:
104
105
  List of absolute URLs found on the page
105
106
  """
106
107
  if content is None:
107
- content = await self._fetch_content(url)
108
+ content = await fetch_html(self._client, url)
108
109
  if content is None:
109
110
  return []
110
111
 
@@ -137,32 +138,6 @@ class EnhancedLinkExtractor:
137
138
 
138
139
  return list(links)
139
140
 
140
- async def _fetch_content(self, url: str) -> bytes | None:
141
- """
142
- Fetch page content for link extraction.
143
-
144
- Args:
145
- url: URL to fetch
146
-
147
- Returns:
148
- HTML content as bytes, or None if fetch failed
149
- """
150
- try:
151
- response = await self._client.get(url, timeout=30.0)
152
-
153
- if response.status_code != 200:
154
- return None
155
-
156
- content_type = response.content_type.lower()
157
- if "text/html" not in content_type and "application/xhtml" not in content_type:
158
- return None
159
-
160
- return response.content
161
-
162
- except Exception as e:
163
- logger.debug(f"Failed to fetch {url}: {e}")
164
- return None
165
-
166
141
  def _extract_standard_links(self, soup: BeautifulSoup, base_url: str) -> list[str]:
167
142
  """Extract links from standard <a href> tags."""
168
143
  links = []
@@ -8,6 +8,7 @@ from urllib.parse import urljoin, urlparse
8
8
  from bs4 import BeautifulSoup
9
9
 
10
10
  from ...http.protocols import HttpClient
11
+ from .._fetch import fetch_html
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
@@ -56,39 +57,12 @@ class StaticLinkExtractor:
56
57
  List of absolute URLs found on the page
57
58
  """
58
59
  if content is None:
59
- content = await self._fetch_content(url)
60
+ content = await fetch_html(self._client, url)
60
61
  if content is None:
61
62
  return []
62
63
 
63
64
  return self._parse_links(content, url)
64
65
 
65
- async def _fetch_content(self, url: str) -> bytes | None:
66
- """
67
- Fetch page content for link extraction.
68
-
69
- Args:
70
- url: URL to fetch
71
-
72
- Returns:
73
- HTML content as bytes, or None if fetch failed
74
- """
75
- try:
76
- response = await self._client.get(url, timeout=30.0)
77
-
78
- if response.status_code != 200:
79
- return None
80
-
81
- # Only process HTML content
82
- content_type = response.content_type.lower()
83
- if "text/html" not in content_type and "application/xhtml" not in content_type:
84
- return None
85
-
86
- return response.content
87
-
88
- except Exception as e:
89
- logger.debug(f"Failed to fetch {url}: {e}")
90
- return None
91
-
92
66
  def _parse_links(self, html: bytes, base_url: str) -> list[str]:
93
67
  """
94
68
  Parse links from HTML content.
@@ -7,6 +7,7 @@ from collections.abc import AsyncIterator
7
7
  from urllib.parse import urlparse
8
8
 
9
9
  from defusedxml import ElementTree
10
+ from defusedxml.common import DefusedXmlException
10
11
 
11
12
  from ..http.protocols import HttpClient
12
13
  from ..security.robots import RobotsChecker
@@ -160,7 +161,7 @@ class SitemapDiscoverer:
160
161
 
161
162
  try:
162
163
  root = ElementTree.fromstring(content)
163
- except ElementTree.ParseError as e:
164
+ except (ElementTree.ParseError, DefusedXmlException) as e:
164
165
  logger.warning(f"Failed to parse sitemap XML: {e}")
165
166
  return page_urls, sitemap_urls
166
167
 
@@ -234,6 +234,38 @@ class AsyncHttpClient:
234
234
 
235
235
  return {key: value for key, value in headers.items() if key.lower() not in self.SENSITIVE_HEADERS}
236
236
 
237
+ def _next_redirect(
238
+ self,
239
+ response: aiohttp.ClientResponse,
240
+ current_url: str,
241
+ current_headers: dict[str, str],
242
+ redirect_count: int,
243
+ original_url: str,
244
+ ) -> tuple[str, dict[str, str], int] | None:
245
+ """Re-validate and follow one redirect hop, shared by GET and HEAD.
246
+
247
+ Returns the updated ``(url, headers, redirect_count)`` when ``response``
248
+ is a redirect, or ``None`` when it is not. Raises ``ValueError`` once
249
+ ``MAX_REDIRECTS`` is exceeded. Centralising this keeps GET and HEAD on
250
+ identical redirect/SSRF re-validation.
251
+ """
252
+ location = response.headers.get("Location")
253
+ if response.status in self.REDIRECT_STATUS_CODES and location:
254
+ if redirect_count >= self.MAX_REDIRECTS:
255
+ raise ValueError(f"Too many redirects while fetching {original_url}")
256
+
257
+ redirect_url = self._resolve_redirect_url(current_url, location)
258
+ new_headers = self._headers_for_url(
259
+ self._headers_for_redirect(
260
+ current_headers,
261
+ current_url,
262
+ redirect_url,
263
+ ),
264
+ redirect_url,
265
+ )
266
+ return redirect_url, new_headers, redirect_count + 1
267
+ return None
268
+
237
269
  async def __aenter__(self) -> AsyncHttpClient:
238
270
  """Enter async context and create session."""
239
271
  resolver: AbstractResolver | None = None
@@ -382,22 +414,11 @@ class AsyncHttpClient:
382
414
  allow_redirects=False,
383
415
  ) as response,
384
416
  ):
385
- location = response.headers.get("Location")
386
- if response.status in self.REDIRECT_STATUS_CODES and location:
387
- if redirect_count >= self.MAX_REDIRECTS:
388
- raise ValueError(f"Too many redirects while fetching {url}")
389
-
390
- redirect_url = self._resolve_redirect_url(current_url, location)
391
- current_headers = self._headers_for_url(
392
- self._headers_for_redirect(
393
- current_headers,
394
- current_url,
395
- redirect_url,
396
- ),
397
- redirect_url,
398
- )
399
- current_url = redirect_url
400
- redirect_count += 1
417
+ redirect = self._next_redirect(
418
+ response, current_url, current_headers, redirect_count, url
419
+ )
420
+ if redirect is not None:
421
+ current_url, current_headers, redirect_count = redirect
401
422
  continue
402
423
 
403
424
  if response.status in self.RETRYABLE_STATUS_CODES:
@@ -504,22 +525,9 @@ class AsyncHttpClient:
504
525
  allow_redirects=False,
505
526
  ) as response,
506
527
  ):
507
- location = response.headers.get("Location")
508
- if response.status in self.REDIRECT_STATUS_CODES and location:
509
- if redirect_count >= self.MAX_REDIRECTS:
510
- raise ValueError(f"Too many redirects while fetching {url}")
511
-
512
- redirect_url = self._resolve_redirect_url(current_url, location)
513
- current_headers = self._headers_for_url(
514
- self._headers_for_redirect(
515
- current_headers,
516
- current_url,
517
- redirect_url,
518
- ),
519
- redirect_url,
520
- )
521
- current_url = redirect_url
522
- redirect_count += 1
528
+ redirect = self._next_redirect(response, current_url, current_headers, redirect_count, url)
529
+ if redirect is not None:
530
+ current_url, current_headers, redirect_count = redirect
523
531
  continue
524
532
 
525
533
  return HttpResponse(
@@ -529,17 +537,3 @@ class AsyncHttpClient:
529
537
  headers=dict(response.headers),
530
538
  url=str(response.url),
531
539
  )
532
-
533
- def decode_content(self, response: HttpResponse) -> str:
534
- """
535
- Decode response content to string.
536
-
537
- Convenience method that uses intelligent encoding detection.
538
-
539
- Args:
540
- response: HttpResponse to decode
541
-
542
- Returns:
543
- Decoded string content
544
- """
545
- return self._decode_content(response.content, response.content_type)
@@ -452,12 +452,22 @@ def grep_docs(
452
452
  for root in roots:
453
453
  if not root.exists() or not root.is_dir():
454
454
  continue
455
+ resolved_root = root.resolve()
455
456
  for file in root.rglob("*.md"):
456
457
  if time.monotonic() > deadline:
457
458
  timed_out = True
458
459
  break
460
+ if file.is_symlink():
461
+ logger.debug("skip symlinked doc file: %s", file)
462
+ continue
463
+ resolved_file = file.resolve()
464
+ try:
465
+ resolved_file.relative_to(resolved_root)
466
+ except ValueError:
467
+ logger.debug("skip doc file outside library root: %s", file)
468
+ continue
459
469
  try:
460
- lines = file.read_text(errors="replace").splitlines()
470
+ lines = resolved_file.read_text(errors="replace").splitlines()
461
471
  except OSError as err:
462
472
  logger.debug("skip %s: %s", file, err)
463
473
  continue
@@ -396,8 +396,3 @@ class DocpullConfig(BaseModel):
396
396
 
397
397
  data = yaml.safe_load(yaml_str)
398
398
  return cls.model_validate(data)
399
-
400
- @classmethod
401
- def from_yaml_file(cls, path: Path) -> DocpullConfig:
402
- """Load config from YAML file."""
403
- return cls.from_yaml(path.read_text())
@@ -181,16 +181,3 @@ class FetchPipeline:
181
181
  break
182
182
 
183
183
  return ctx
184
-
185
- def add_step(self, step: FetchStep) -> FetchPipeline:
186
- """
187
- Add a step to the pipeline (fluent API).
188
-
189
- Args:
190
- step: The step to add
191
-
192
- Returns:
193
- Self for chaining
194
- """
195
- self.steps.append(step)
196
- return self