docpull 3.0.1__tar.gz → 4.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90) hide show
  1. {docpull-3.0.1/src/docpull.egg-info → docpull-4.0.0}/PKG-INFO +15 -3
  2. {docpull-3.0.1 → docpull-4.0.0}/README.md +11 -1
  3. {docpull-3.0.1 → docpull-4.0.0}/pyproject.toml +4 -2
  4. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/__init__.py +1 -1
  5. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cache/manager.py +0 -112
  6. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cache/streaming_dedup.py +0 -17
  7. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/markdown.py +21 -6
  8. docpull-4.0.0/src/docpull/discovery/_fetch.py +33 -0
  9. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/crawler.py +2 -28
  10. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/enhanced.py +2 -27
  11. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/static.py +2 -28
  12. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/sitemap.py +2 -1
  13. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/client.py +40 -46
  14. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/tools.py +11 -1
  15. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/config.py +0 -5
  16. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/base.py +0 -13
  17. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/fetch.py +16 -2
  18. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/security/robots.py +7 -5
  19. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/security/url_validator.py +87 -32
  20. {docpull-3.0.1 → docpull-4.0.0/src/docpull.egg-info}/PKG-INFO +15 -3
  21. {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/SOURCES.txt +3 -3
  22. {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/requires.txt +3 -1
  23. {docpull-3.0.1 → docpull-4.0.0}/tests/test_cache_conditional_get.py +28 -0
  24. docpull-4.0.0/tests/test_ci_policy.py +38 -0
  25. {docpull-3.0.1 → docpull-4.0.0}/tests/test_cli.py +7 -0
  26. {docpull-3.0.1 → docpull-4.0.0}/tests/test_conversion.py +34 -6
  27. {docpull-3.0.1 → docpull-4.0.0}/tests/test_discovery.py +33 -0
  28. docpull-4.0.0/tests/test_mcp_server.py +51 -0
  29. {docpull-3.0.1 → docpull-4.0.0}/tests/test_mcp_tools.py +20 -0
  30. {docpull-3.0.1 → docpull-4.0.0}/tests/test_security_hardening.py +92 -0
  31. docpull-3.0.1/src/docpull/concurrency/__init__.py +0 -7
  32. docpull-3.0.1/src/docpull/concurrency/manager.py +0 -123
  33. docpull-3.0.1/src/docpull/logging_config.py +0 -53
  34. {docpull-3.0.1 → docpull-4.0.0}/LICENSE +0 -0
  35. {docpull-3.0.1 → docpull-4.0.0}/setup.cfg +0 -0
  36. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/__main__.py +0 -0
  37. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cache/__init__.py +0 -0
  38. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cli.py +0 -0
  39. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/__init__.py +0 -0
  40. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/chunking.py +0 -0
  41. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/extractor.py +0 -0
  42. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/protocols.py +0 -0
  43. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/special_cases.py +0 -0
  44. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
  45. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/core/__init__.py +0 -0
  46. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/core/fetcher.py +0 -0
  47. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/__init__.py +0 -0
  48. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/composite.py +0 -0
  49. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/filters.py +0 -0
  50. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
  51. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
  52. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/protocols.py +0 -0
  53. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/doctor.py +0 -0
  54. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/__init__.py +0 -0
  55. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/protocols.py +0 -0
  56. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/rate_limiter.py +0 -0
  57. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/__init__.py +0 -0
  58. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/server.py +0 -0
  59. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/sources.py +0 -0
  60. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/metadata_extractor.py +0 -0
  61. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/__init__.py +0 -0
  62. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/events.py +0 -0
  63. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/profiles.py +0 -0
  64. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/__init__.py +0 -0
  65. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
  66. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
  67. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/convert.py +0 -0
  68. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
  69. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
  70. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save.py +0 -0
  71. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
  72. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
  73. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
  74. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
  75. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/py.typed +0 -0
  76. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/security/__init__.py +0 -0
  77. {docpull-3.0.1 → docpull-4.0.0}/src/docpull/time_utils.py +0 -0
  78. {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
  79. {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
  80. {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/top_level.txt +0 -0
  81. {docpull-3.0.1 → docpull-4.0.0}/tests/test_chunking.py +0 -0
  82. {docpull-3.0.1 → docpull-4.0.0}/tests/test_convert_step_new.py +0 -0
  83. {docpull-3.0.1 → docpull-4.0.0}/tests/test_integration.py +0 -0
  84. {docpull-3.0.1 → docpull-4.0.0}/tests/test_link_extractors.py +0 -0
  85. {docpull-3.0.1 → docpull-4.0.0}/tests/test_naming.py +0 -0
  86. {docpull-3.0.1 → docpull-4.0.0}/tests/test_pipeline.py +0 -0
  87. {docpull-3.0.1 → docpull-4.0.0}/tests/test_real_site_regressions.py +0 -0
  88. {docpull-3.0.1 → docpull-4.0.0}/tests/test_save_ndjson.py +0 -0
  89. {docpull-3.0.1 → docpull-4.0.0}/tests/test_special_cases.py +0 -0
  90. {docpull-3.0.1 → docpull-4.0.0}/tests/test_time_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: docpull
3
- Version: 3.0.1
3
+ Version: 4.0.0
4
4
  Summary: Pull documentation from the web and convert to clean markdown
5
5
  Author-email: Zachary Roth <support@raintree.technology>
6
6
  Maintainer-email: Raintree Technology <support@raintree.technology>
@@ -42,7 +42,7 @@ Requires-Dist: beautifulsoup4>=4.12.0
42
42
  Requires-Dist: html2text>=2020.1.16
43
43
  Requires-Dist: defusedxml>=0.7.1
44
44
  Requires-Dist: extruct>=0.15.0
45
- Requires-Dist: aiohttp>=3.9.0
45
+ Requires-Dist: aiohttp>=3.14.0
46
46
  Requires-Dist: idna>=3.15
47
47
  Requires-Dist: regex>=2024.11.6
48
48
  Requires-Dist: rich>=13.0.0
@@ -59,6 +59,7 @@ Provides-Extra: tokens
59
59
  Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
60
60
  Provides-Extra: mcp
61
61
  Requires-Dist: mcp>=1.0.0; extra == "mcp"
62
+ Requires-Dist: pyjwt>=2.13.0; extra == "mcp"
62
63
  Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
63
64
  Requires-Dist: starlette>=1.0.1; extra == "mcp"
64
65
  Provides-Extra: llm
@@ -69,6 +70,7 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
69
70
  Requires-Dist: trafilatura>=1.12.0; extra == "all"
70
71
  Requires-Dist: tiktoken>=0.7.0; extra == "all"
71
72
  Requires-Dist: mcp>=1.0.0; extra == "all"
73
+ Requires-Dist: pyjwt>=2.13.0; extra == "all"
72
74
  Requires-Dist: python-multipart>=0.0.27; extra == "all"
73
75
  Requires-Dist: starlette>=1.0.1; extra == "all"
74
76
  Provides-Extra: dev
@@ -228,12 +230,19 @@ pip install 'docpull[mcp]'
228
230
  docpull mcp # starts the stdio server
229
231
  ```
230
232
 
231
- Add to Claude Desktop or Claude Code manually:
233
+ Claude Code:
234
+
235
+ ```bash
236
+ claude mcp add --transport stdio docpull -- docpull mcp
237
+ ```
238
+
239
+ Cursor (`.cursor/mcp.json` in a project, or `~/.cursor/mcp.json` globally):
232
240
 
233
241
  ```json
234
242
  {
235
243
  "mcpServers": {
236
244
  "docpull": {
245
+ "type": "stdio",
237
246
  "command": "docpull",
238
247
  "args": ["mcp"]
239
248
  }
@@ -241,6 +250,9 @@ Add to Claude Desktop or Claude Code manually:
241
250
  }
242
251
  ```
243
252
 
253
+ Claude Desktop uses the same `mcpServers` shape in
254
+ `claude_desktop_config.json`.
255
+
244
256
  Or, if you use Claude Code, install the plugin instead — it bundles the MCP
245
257
  server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
246
258
  `/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
@@ -140,12 +140,19 @@ pip install 'docpull[mcp]'
140
140
  docpull mcp # starts the stdio server
141
141
  ```
142
142
 
143
- Add to Claude Desktop or Claude Code manually:
143
+ Claude Code:
144
+
145
+ ```bash
146
+ claude mcp add --transport stdio docpull -- docpull mcp
147
+ ```
148
+
149
+ Cursor (`.cursor/mcp.json` in a project, or `~/.cursor/mcp.json` globally):
144
150
 
145
151
  ```json
146
152
  {
147
153
  "mcpServers": {
148
154
  "docpull": {
155
+ "type": "stdio",
149
156
  "command": "docpull",
150
157
  "args": ["mcp"]
151
158
  }
@@ -153,6 +160,9 @@ Add to Claude Desktop or Claude Code manually:
153
160
  }
154
161
  ```
155
162
 
163
+ Claude Desktop uses the same `mcpServers` shape in
164
+ `claude_desktop_config.json`.
165
+
156
166
  Or, if you use Claude Code, install the plugin instead — it bundles the MCP
157
167
  server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
158
168
  `/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "docpull"
7
- version = "3.0.1"
7
+ version = "4.0.0"
8
8
  dynamic = []
9
9
  description = "Pull documentation from the web and convert to clean markdown"
10
10
  readme = {file = "README.md", content-type = "text/markdown"}
@@ -66,7 +66,7 @@ dependencies = [
66
66
  "html2text>=2020.1.16",
67
67
  "defusedxml>=0.7.1",
68
68
  "extruct>=0.15.0",
69
- "aiohttp>=3.9.0",
69
+ "aiohttp>=3.14.0", # 3.14.0 fixes CVE-2026-34993 and CVE-2026-47265
70
70
  "idna>=3.15",
71
71
  "regex>=2024.11.6",
72
72
  "rich>=13.0.0",
@@ -90,6 +90,7 @@ tokens = [
90
90
  ]
91
91
  mcp = [
92
92
  "mcp>=1.0.0",
93
+ "pyjwt>=2.13.0",
93
94
  "python-multipart>=0.0.27",
94
95
  "starlette>=1.0.1",
95
96
  ]
@@ -102,6 +103,7 @@ all = [
102
103
  "trafilatura>=1.12.0",
103
104
  "tiktoken>=0.7.0",
104
105
  "mcp>=1.0.0",
106
+ "pyjwt>=2.13.0",
105
107
  "python-multipart>=0.0.27",
106
108
  "starlette>=1.0.1",
107
109
  ]
@@ -14,7 +14,7 @@ Usage:
14
14
  print(event)
15
15
  """
16
16
 
17
- __version__ = "3.0.0"
17
+ __version__ = "4.0.0"
18
18
 
19
19
  from .cache import CacheManager, StreamingDeduplicator
20
20
  from .conversion.chunking import Chunk, TokenCounter, chunk_markdown
@@ -197,45 +197,6 @@ class CacheManager:
197
197
  content = content.encode("utf-8")
198
198
  return hashlib.sha256(content).hexdigest()
199
199
 
200
- def has_changed(
201
- self,
202
- url: str,
203
- content: str | None = None,
204
- etag: str | None = None,
205
- last_modified: str | None = None,
206
- ) -> bool:
207
- """Check if content has changed since last fetch.
208
-
209
- Args:
210
- url: URL to check
211
- content: Current content (for checksum comparison)
212
- etag: HTTP ETag header
213
- last_modified: HTTP Last-Modified header
214
-
215
- Returns:
216
- True if content has changed or is new
217
- """
218
- if url not in self.manifest:
219
- return True # New URL
220
-
221
- cached = self.manifest[url]
222
-
223
- # Check ETag first (most reliable)
224
- if etag and "etag" in cached:
225
- return bool(etag != cached["etag"])
226
-
227
- # Check Last-Modified
228
- if last_modified and "last_modified" in cached:
229
- return bool(last_modified != cached["last_modified"])
230
-
231
- # Check content checksum
232
- if content and "checksum" in cached:
233
- current_checksum = self.compute_checksum(content)
234
- return bool(current_checksum != cached["checksum"])
235
-
236
- # Can't determine, assume changed
237
- return True
238
-
239
200
  def update_cache(
240
201
  self,
241
202
  url: str,
@@ -302,14 +263,6 @@ class CacheManager:
302
263
  """
303
264
  return self._state.fetched_urls.copy()
304
265
 
305
- def get_failed_urls(self) -> set[str]:
306
- """Get set of URLs that failed to fetch.
307
-
308
- Returns:
309
- Set of failed URLs (copy to prevent mutation)
310
- """
311
- return self._state.failed_urls.copy()
312
-
313
266
  def start_session(self) -> None:
314
267
  """Start a new fetch session.
315
268
 
@@ -319,30 +272,6 @@ class CacheManager:
319
272
  self._state.last_run = utc_now_iso()
320
273
  self._state_dirty = True
321
274
 
322
- def clear_state(self) -> None:
323
- """Clear incremental state (for fresh start).
324
-
325
- Note:
326
- This immediately flushes to disk.
327
- """
328
- self._state = _InternalState()
329
- self._state_dirty = True
330
- self.flush()
331
- logger.info("Cleared incremental state")
332
-
333
- def get_cache_stats(self) -> dict[str, str | int | None]:
334
- """Get cache statistics.
335
-
336
- Returns:
337
- Dict with cache stats
338
- """
339
- return {
340
- "cached_urls": len(self.manifest),
341
- "fetched_urls": len(self._state.fetched_urls),
342
- "failed_urls": len(self._state.failed_urls),
343
- "last_run": self._state.last_run,
344
- }
345
-
346
275
  def evict_expired(self, ttl_days: int | None = None) -> int:
347
276
  """Remove cache entries older than TTL.
348
277
 
@@ -378,28 +307,6 @@ class CacheManager:
378
307
 
379
308
  return len(to_remove)
380
309
 
381
- def is_fetched(self, url: str) -> bool:
382
- """Check if URL has been fetched (O(1) lookup).
383
-
384
- Args:
385
- url: URL to check
386
-
387
- Returns:
388
- True if URL was successfully fetched
389
- """
390
- return url in self._state.fetched_urls
391
-
392
- def is_failed(self, url: str) -> bool:
393
- """Check if URL has failed (O(1) lookup).
394
-
395
- Args:
396
- url: URL to check
397
-
398
- Returns:
399
- True if URL failed to fetch
400
- """
401
- return url in self._state.failed_urls
402
-
403
310
  # Resume capability methods
404
311
 
405
312
  def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
@@ -483,22 +390,3 @@ class CacheManager:
483
390
  logger.info("Cleared discovered URLs file")
484
391
  except Exception as e:
485
392
  logger.warning(f"Could not clear discovered URLs file: {e}")
486
-
487
- def has_resume_data(self, start_url: str) -> bool:
488
- """Check if there is resume data available for the given URL.
489
-
490
- Args:
491
- start_url: The starting URL to check
492
-
493
- Returns:
494
- True if resume data exists and matches the start URL
495
- """
496
- if not self.discovered_urls_file.exists():
497
- return False
498
-
499
- try:
500
- with open(self.discovered_urls_file, encoding="utf-8") as f:
501
- data: DiscoveredUrlsState = json.load(f)
502
- return data.get("start_url") == start_url
503
- except Exception:
504
- return False
@@ -90,23 +90,6 @@ class StreamingDeduplicator:
90
90
  self._seen[content_hash] = url
91
91
  return (True, None)
92
92
 
93
- async def is_duplicate(self, content: str | bytes) -> bool:
94
- """
95
- Check if content has been seen before (read-only).
96
-
97
- Unlike check_and_register, this doesn't register the content.
98
- Useful for checking without committing to save.
99
-
100
- Args:
101
- content: The content to check (str or bytes)
102
-
103
- Returns:
104
- True if content has been seen before
105
- """
106
- content_hash = self.compute_hash(content)
107
- async with self._lock:
108
- return content_hash in self._seen
109
-
110
93
  def get_stats(self) -> dict:
111
94
  """
112
95
  Get deduplication statistics.
@@ -215,6 +215,17 @@ class FrontmatterBuilder:
215
215
  )
216
216
  """
217
217
 
218
+ @staticmethod
219
+ def _inline(value: Any) -> str:
220
+ """Collapse CR/LF/NUL so an interpolated value stays on its own YAML line.
221
+
222
+ Page-supplied metadata (JSON-LD ``keywords``, OpenGraph ``article:tag``,
223
+ etc.) flows into frontmatter. Without this, a newline in a tag/keyword
224
+ would break out of the list item and inject attacker-chosen top-level
225
+ keys (e.g. ``draft: true``) into the document frontmatter.
226
+ """
227
+ return str(value).replace("\r", " ").replace("\n", " ").replace("\x00", " ")
228
+
218
229
  def build(
219
230
  self,
220
231
  title: str | None = None,
@@ -238,28 +249,32 @@ class FrontmatterBuilder:
238
249
 
239
250
  if title:
240
251
  # Escape quotes in title
241
- safe_title = title.replace('"', '\\"')
252
+ safe_title = self._inline(title).replace('"', '\\"')
242
253
  lines.append(f'title: "{safe_title}"')
243
254
 
244
255
  if url:
245
- lines.append(f"source: {url}")
256
+ lines.append(f"source: {self._inline(url)}")
246
257
 
247
258
  if description:
248
259
  # Escape quotes and truncate long descriptions
249
- safe_desc = description[:500].replace('"', '\\"')
260
+ safe_desc = self._inline(description[:500]).replace('"', '\\"')
250
261
  lines.append(f'description: "{safe_desc}"')
251
262
 
252
263
  for key, value in extra_fields.items():
253
264
  if value is not None:
254
265
  if isinstance(value, str):
255
- safe_value = value.replace('"', '\\"')
266
+ safe_value = self._inline(value).replace('"', '\\"')
256
267
  lines.append(f'{key}: "{safe_value}"')
257
268
  elif isinstance(value, (list, tuple)):
258
269
  lines.append(f"{key}:")
259
270
  for item in value:
260
- lines.append(f" - {item}")
271
+ # Quote + escape each item so a hostile tag/keyword (from
272
+ # page JSON-LD / OpenGraph) stays a single YAML string and
273
+ # cannot inject new keys or produce malformed frontmatter.
274
+ safe_item = self._inline(item).replace('"', '\\"')
275
+ lines.append(f' - "{safe_item}"')
261
276
  else:
262
- lines.append(f"{key}: {value}")
277
+ lines.append(f"{key}: {self._inline(value)}")
263
278
 
264
279
  lines.append("---")
265
280
  return "\n".join(lines) + "\n\n"
@@ -0,0 +1,33 @@
1
+ """Shared HTML fetch helper for link-discovery components."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+
7
+ from ..http.protocols import HttpClient
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ async def fetch_html(client: HttpClient, url: str) -> bytes | None:
13
+ """Fetch ``url`` and return its body iff it is a successful HTML response.
14
+
15
+ Returns ``None`` on network error, non-200 status, or a non-HTML content
16
+ type. Shared by the crawler and the static/enhanced link extractors so the
17
+ fetch and content-type gate stay identical across all three.
18
+ """
19
+ try:
20
+ response = await client.get(url, timeout=30.0)
21
+
22
+ if response.status_code != 200:
23
+ return None
24
+
25
+ content_type = response.content_type.lower()
26
+ if "text/html" not in content_type and "application/xhtml" not in content_type:
27
+ return None
28
+
29
+ return response.content
30
+
31
+ except Exception as e:
32
+ logger.debug(f"Failed to fetch {url}: {e}")
33
+ return None
@@ -13,6 +13,7 @@ from bs4 import BeautifulSoup
13
13
  from ..http.protocols import HttpClient
14
14
  from ..security.robots import RobotsChecker
15
15
  from ..security.url_validator import UrlValidator
16
+ from ._fetch import fetch_html
16
17
  from .filters import DomainFilter, PatternFilter, SeenUrlTracker
17
18
 
18
19
  if TYPE_CHECKING:
@@ -114,33 +115,6 @@ class LinkCrawler:
114
115
 
115
116
  return links
116
117
 
117
- async def _fetch_page(self, url: str) -> bytes | None:
118
- """
119
- Fetch a page for link extraction.
120
-
121
- Args:
122
- url: URL to fetch
123
-
124
- Returns:
125
- HTML content as bytes, or None if fetch failed
126
- """
127
- try:
128
- response = await self._client.get(url, timeout=30.0)
129
-
130
- if response.status_code != 200:
131
- return None
132
-
133
- # Only process HTML content
134
- content_type = response.content_type.lower()
135
- if "text/html" not in content_type and "application/xhtml" not in content_type:
136
- return None
137
-
138
- return response.content
139
-
140
- except Exception as e:
141
- logger.debug(f"Failed to fetch {url}: {e}")
142
- return None
143
-
144
118
  def _should_crawl(self, url: str) -> bool:
145
119
  """
146
120
  Check if a URL should be crawled.
@@ -225,7 +199,7 @@ class LinkCrawler:
225
199
  links = await self._link_extractor.extract_links(current_url)
226
200
  else:
227
201
  # Built-in extraction with separate fetch
228
- html = await self._fetch_page(current_url)
202
+ html = await fetch_html(self._client, current_url)
229
203
  if html is None:
230
204
  continue
231
205
  links = self._extract_links(html, current_url)
@@ -10,6 +10,7 @@ from urllib.parse import urljoin, urlparse
10
10
  from bs4 import BeautifulSoup
11
11
 
12
12
  from ...http.protocols import HttpClient
13
+ from .._fetch import fetch_html
13
14
 
14
15
  logger = logging.getLogger(__name__)
15
16
 
@@ -104,7 +105,7 @@ class EnhancedLinkExtractor:
104
105
  List of absolute URLs found on the page
105
106
  """
106
107
  if content is None:
107
- content = await self._fetch_content(url)
108
+ content = await fetch_html(self._client, url)
108
109
  if content is None:
109
110
  return []
110
111
 
@@ -137,32 +138,6 @@ class EnhancedLinkExtractor:
137
138
 
138
139
  return list(links)
139
140
 
140
- async def _fetch_content(self, url: str) -> bytes | None:
141
- """
142
- Fetch page content for link extraction.
143
-
144
- Args:
145
- url: URL to fetch
146
-
147
- Returns:
148
- HTML content as bytes, or None if fetch failed
149
- """
150
- try:
151
- response = await self._client.get(url, timeout=30.0)
152
-
153
- if response.status_code != 200:
154
- return None
155
-
156
- content_type = response.content_type.lower()
157
- if "text/html" not in content_type and "application/xhtml" not in content_type:
158
- return None
159
-
160
- return response.content
161
-
162
- except Exception as e:
163
- logger.debug(f"Failed to fetch {url}: {e}")
164
- return None
165
-
166
141
  def _extract_standard_links(self, soup: BeautifulSoup, base_url: str) -> list[str]:
167
142
  """Extract links from standard <a href> tags."""
168
143
  links = []
@@ -8,6 +8,7 @@ from urllib.parse import urljoin, urlparse
8
8
  from bs4 import BeautifulSoup
9
9
 
10
10
  from ...http.protocols import HttpClient
11
+ from .._fetch import fetch_html
11
12
 
12
13
  logger = logging.getLogger(__name__)
13
14
 
@@ -56,39 +57,12 @@ class StaticLinkExtractor:
56
57
  List of absolute URLs found on the page
57
58
  """
58
59
  if content is None:
59
- content = await self._fetch_content(url)
60
+ content = await fetch_html(self._client, url)
60
61
  if content is None:
61
62
  return []
62
63
 
63
64
  return self._parse_links(content, url)
64
65
 
65
- async def _fetch_content(self, url: str) -> bytes | None:
66
- """
67
- Fetch page content for link extraction.
68
-
69
- Args:
70
- url: URL to fetch
71
-
72
- Returns:
73
- HTML content as bytes, or None if fetch failed
74
- """
75
- try:
76
- response = await self._client.get(url, timeout=30.0)
77
-
78
- if response.status_code != 200:
79
- return None
80
-
81
- # Only process HTML content
82
- content_type = response.content_type.lower()
83
- if "text/html" not in content_type and "application/xhtml" not in content_type:
84
- return None
85
-
86
- return response.content
87
-
88
- except Exception as e:
89
- logger.debug(f"Failed to fetch {url}: {e}")
90
- return None
91
-
92
66
  def _parse_links(self, html: bytes, base_url: str) -> list[str]:
93
67
  """
94
68
  Parse links from HTML content.
@@ -7,6 +7,7 @@ from collections.abc import AsyncIterator
7
7
  from urllib.parse import urlparse
8
8
 
9
9
  from defusedxml import ElementTree
10
+ from defusedxml.common import DefusedXmlException
10
11
 
11
12
  from ..http.protocols import HttpClient
12
13
  from ..security.robots import RobotsChecker
@@ -160,7 +161,7 @@ class SitemapDiscoverer:
160
161
 
161
162
  try:
162
163
  root = ElementTree.fromstring(content)
163
- except ElementTree.ParseError as e:
164
+ except (ElementTree.ParseError, DefusedXmlException) as e:
164
165
  logger.warning(f"Failed to parse sitemap XML: {e}")
165
166
  return page_urls, sitemap_urls
166
167
 
@@ -234,6 +234,38 @@ class AsyncHttpClient:
234
234
 
235
235
  return {key: value for key, value in headers.items() if key.lower() not in self.SENSITIVE_HEADERS}
236
236
 
237
+ def _next_redirect(
238
+ self,
239
+ response: aiohttp.ClientResponse,
240
+ current_url: str,
241
+ current_headers: dict[str, str],
242
+ redirect_count: int,
243
+ original_url: str,
244
+ ) -> tuple[str, dict[str, str], int] | None:
245
+ """Re-validate and follow one redirect hop, shared by GET and HEAD.
246
+
247
+ Returns the updated ``(url, headers, redirect_count)`` when ``response``
248
+ is a redirect, or ``None`` when it is not. Raises ``ValueError`` once
249
+ ``MAX_REDIRECTS`` is exceeded. Centralising this keeps GET and HEAD on
250
+ identical redirect/SSRF re-validation.
251
+ """
252
+ location = response.headers.get("Location")
253
+ if response.status in self.REDIRECT_STATUS_CODES and location:
254
+ if redirect_count >= self.MAX_REDIRECTS:
255
+ raise ValueError(f"Too many redirects while fetching {original_url}")
256
+
257
+ redirect_url = self._resolve_redirect_url(current_url, location)
258
+ new_headers = self._headers_for_url(
259
+ self._headers_for_redirect(
260
+ current_headers,
261
+ current_url,
262
+ redirect_url,
263
+ ),
264
+ redirect_url,
265
+ )
266
+ return redirect_url, new_headers, redirect_count + 1
267
+ return None
268
+
237
269
  async def __aenter__(self) -> AsyncHttpClient:
238
270
  """Enter async context and create session."""
239
271
  resolver: AbstractResolver | None = None
@@ -382,22 +414,11 @@ class AsyncHttpClient:
382
414
  allow_redirects=False,
383
415
  ) as response,
384
416
  ):
385
- location = response.headers.get("Location")
386
- if response.status in self.REDIRECT_STATUS_CODES and location:
387
- if redirect_count >= self.MAX_REDIRECTS:
388
- raise ValueError(f"Too many redirects while fetching {url}")
389
-
390
- redirect_url = self._resolve_redirect_url(current_url, location)
391
- current_headers = self._headers_for_url(
392
- self._headers_for_redirect(
393
- current_headers,
394
- current_url,
395
- redirect_url,
396
- ),
397
- redirect_url,
398
- )
399
- current_url = redirect_url
400
- redirect_count += 1
417
+ redirect = self._next_redirect(
418
+ response, current_url, current_headers, redirect_count, url
419
+ )
420
+ if redirect is not None:
421
+ current_url, current_headers, redirect_count = redirect
401
422
  continue
402
423
 
403
424
  if response.status in self.RETRYABLE_STATUS_CODES:
@@ -504,22 +525,9 @@ class AsyncHttpClient:
504
525
  allow_redirects=False,
505
526
  ) as response,
506
527
  ):
507
- location = response.headers.get("Location")
508
- if response.status in self.REDIRECT_STATUS_CODES and location:
509
- if redirect_count >= self.MAX_REDIRECTS:
510
- raise ValueError(f"Too many redirects while fetching {url}")
511
-
512
- redirect_url = self._resolve_redirect_url(current_url, location)
513
- current_headers = self._headers_for_url(
514
- self._headers_for_redirect(
515
- current_headers,
516
- current_url,
517
- redirect_url,
518
- ),
519
- redirect_url,
520
- )
521
- current_url = redirect_url
522
- redirect_count += 1
528
+ redirect = self._next_redirect(response, current_url, current_headers, redirect_count, url)
529
+ if redirect is not None:
530
+ current_url, current_headers, redirect_count = redirect
523
531
  continue
524
532
 
525
533
  return HttpResponse(
@@ -529,17 +537,3 @@ class AsyncHttpClient:
529
537
  headers=dict(response.headers),
530
538
  url=str(response.url),
531
539
  )
532
-
533
- def decode_content(self, response: HttpResponse) -> str:
534
- """
535
- Decode response content to string.
536
-
537
- Convenience method that uses intelligent encoding detection.
538
-
539
- Args:
540
- response: HttpResponse to decode
541
-
542
- Returns:
543
- Decoded string content
544
- """
545
- return self._decode_content(response.content, response.content_type)