docpull 3.0.1__tar.gz → 4.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {docpull-3.0.1/src/docpull.egg-info → docpull-4.0.0}/PKG-INFO +15 -3
- {docpull-3.0.1 → docpull-4.0.0}/README.md +11 -1
- {docpull-3.0.1 → docpull-4.0.0}/pyproject.toml +4 -2
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/__init__.py +1 -1
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cache/manager.py +0 -112
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cache/streaming_dedup.py +0 -17
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/markdown.py +21 -6
- docpull-4.0.0/src/docpull/discovery/_fetch.py +33 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/crawler.py +2 -28
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/enhanced.py +2 -27
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/static.py +2 -28
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/sitemap.py +2 -1
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/client.py +40 -46
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/tools.py +11 -1
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/config.py +0 -5
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/base.py +0 -13
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/fetch.py +16 -2
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/security/robots.py +7 -5
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/security/url_validator.py +87 -32
- {docpull-3.0.1 → docpull-4.0.0/src/docpull.egg-info}/PKG-INFO +15 -3
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/SOURCES.txt +3 -3
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/requires.txt +3 -1
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_cache_conditional_get.py +28 -0
- docpull-4.0.0/tests/test_ci_policy.py +38 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_cli.py +7 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_conversion.py +34 -6
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_discovery.py +33 -0
- docpull-4.0.0/tests/test_mcp_server.py +51 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_mcp_tools.py +20 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_security_hardening.py +92 -0
- docpull-3.0.1/src/docpull/concurrency/__init__.py +0 -7
- docpull-3.0.1/src/docpull/concurrency/manager.py +0 -123
- docpull-3.0.1/src/docpull/logging_config.py +0 -53
- {docpull-3.0.1 → docpull-4.0.0}/LICENSE +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/setup.cfg +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/__main__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cache/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/cli.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/chunking.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/extractor.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/protocols.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/special_cases.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/conversion/trafilatura_extractor.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/core/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/core/fetcher.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/composite.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/filters.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/link_extractors/protocols.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/discovery/protocols.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/doctor.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/protocols.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/http/rate_limiter.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/server.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/mcp/sources.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/metadata_extractor.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/events.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/models/profiles.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/chunk.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/convert.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/dedup.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/metadata.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save_json.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save_ndjson.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/save_sqlite.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/pipeline/steps/validate.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/py.typed +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/security/__init__.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull/time_utils.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/dependency_links.txt +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/entry_points.txt +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/src/docpull.egg-info/top_level.txt +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_chunking.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_convert_step_new.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_integration.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_link_extractors.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_naming.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_pipeline.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_real_site_regressions.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_save_ndjson.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_special_cases.py +0 -0
- {docpull-3.0.1 → docpull-4.0.0}/tests/test_time_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: docpull
|
|
3
|
-
Version: 3.0.1
|
|
3
|
+
Version: 4.0.0
|
|
4
4
|
Summary: Pull documentation from the web and convert to clean markdown
|
|
5
5
|
Author-email: Zachary Roth <support@raintree.technology>
|
|
6
6
|
Maintainer-email: Raintree Technology <support@raintree.technology>
|
|
@@ -42,7 +42,7 @@ Requires-Dist: beautifulsoup4>=4.12.0
|
|
|
42
42
|
Requires-Dist: html2text>=2020.1.16
|
|
43
43
|
Requires-Dist: defusedxml>=0.7.1
|
|
44
44
|
Requires-Dist: extruct>=0.15.0
|
|
45
|
-
Requires-Dist: aiohttp>=3.
|
|
45
|
+
Requires-Dist: aiohttp>=3.14.0
|
|
46
46
|
Requires-Dist: idna>=3.15
|
|
47
47
|
Requires-Dist: regex>=2024.11.6
|
|
48
48
|
Requires-Dist: rich>=13.0.0
|
|
@@ -59,6 +59,7 @@ Provides-Extra: tokens
|
|
|
59
59
|
Requires-Dist: tiktoken>=0.7.0; extra == "tokens"
|
|
60
60
|
Provides-Extra: mcp
|
|
61
61
|
Requires-Dist: mcp>=1.0.0; extra == "mcp"
|
|
62
|
+
Requires-Dist: pyjwt>=2.13.0; extra == "mcp"
|
|
62
63
|
Requires-Dist: python-multipart>=0.0.27; extra == "mcp"
|
|
63
64
|
Requires-Dist: starlette>=1.0.1; extra == "mcp"
|
|
64
65
|
Provides-Extra: llm
|
|
@@ -69,6 +70,7 @@ Requires-Dist: url-normalize>=1.4.0; extra == "all"
|
|
|
69
70
|
Requires-Dist: trafilatura>=1.12.0; extra == "all"
|
|
70
71
|
Requires-Dist: tiktoken>=0.7.0; extra == "all"
|
|
71
72
|
Requires-Dist: mcp>=1.0.0; extra == "all"
|
|
73
|
+
Requires-Dist: pyjwt>=2.13.0; extra == "all"
|
|
72
74
|
Requires-Dist: python-multipart>=0.0.27; extra == "all"
|
|
73
75
|
Requires-Dist: starlette>=1.0.1; extra == "all"
|
|
74
76
|
Provides-Extra: dev
|
|
@@ -228,12 +230,19 @@ pip install 'docpull[mcp]'
|
|
|
228
230
|
docpull mcp # starts the stdio server
|
|
229
231
|
```
|
|
230
232
|
|
|
231
|
-
|
|
233
|
+
Claude Code:
|
|
234
|
+
|
|
235
|
+
```bash
|
|
236
|
+
claude mcp add --transport stdio docpull -- docpull mcp
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Cursor (`.cursor/mcp.json` in a project, or `~/.cursor/mcp.json` globally):
|
|
232
240
|
|
|
233
241
|
```json
|
|
234
242
|
{
|
|
235
243
|
"mcpServers": {
|
|
236
244
|
"docpull": {
|
|
245
|
+
"type": "stdio",
|
|
237
246
|
"command": "docpull",
|
|
238
247
|
"args": ["mcp"]
|
|
239
248
|
}
|
|
@@ -241,6 +250,9 @@ Add to Claude Desktop or Claude Code manually:
|
|
|
241
250
|
}
|
|
242
251
|
```
|
|
243
252
|
|
|
253
|
+
Claude Desktop uses the same `mcpServers` shape in
|
|
254
|
+
`claude_desktop_config.json`.
|
|
255
|
+
|
|
244
256
|
Or, if you use Claude Code, install the plugin instead — it bundles the MCP
|
|
245
257
|
server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
|
|
246
258
|
`/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
|
|
@@ -140,12 +140,19 @@ pip install 'docpull[mcp]'
|
|
|
140
140
|
docpull mcp # starts the stdio server
|
|
141
141
|
```
|
|
142
142
|
|
|
143
|
-
|
|
143
|
+
Claude Code:
|
|
144
|
+
|
|
145
|
+
```bash
|
|
146
|
+
claude mcp add --transport stdio docpull -- docpull mcp
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
Cursor (`.cursor/mcp.json` in a project, or `~/.cursor/mcp.json` globally):
|
|
144
150
|
|
|
145
151
|
```json
|
|
146
152
|
{
|
|
147
153
|
"mcpServers": {
|
|
148
154
|
"docpull": {
|
|
155
|
+
"type": "stdio",
|
|
149
156
|
"command": "docpull",
|
|
150
157
|
"args": ["mcp"]
|
|
151
158
|
}
|
|
@@ -153,6 +160,9 @@ Add to Claude Desktop or Claude Code manually:
|
|
|
153
160
|
}
|
|
154
161
|
```
|
|
155
162
|
|
|
163
|
+
Claude Desktop uses the same `mcpServers` shape in
|
|
164
|
+
`claude_desktop_config.json`.
|
|
165
|
+
|
|
156
166
|
Or, if you use Claude Code, install the plugin instead — it bundles the MCP
|
|
157
167
|
server, five slash commands (`/docs-add`, `/docs-search`, `/docs-list`,
|
|
158
168
|
`/docs-refresh`, `/docs-remove`), and a meta-skill that teaches Claude
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "docpull"
|
|
7
|
-
version = "3.0.1"
|
|
7
|
+
version = "4.0.0"
|
|
8
8
|
dynamic = []
|
|
9
9
|
description = "Pull documentation from the web and convert to clean markdown"
|
|
10
10
|
readme = {file = "README.md", content-type = "text/markdown"}
|
|
@@ -66,7 +66,7 @@ dependencies = [
|
|
|
66
66
|
"html2text>=2020.1.16",
|
|
67
67
|
"defusedxml>=0.7.1",
|
|
68
68
|
"extruct>=0.15.0",
|
|
69
|
-
"aiohttp>=3.
|
|
69
|
+
"aiohttp>=3.14.0", # 3.14.0 fixes CVE-2026-34993 and CVE-2026-47265
|
|
70
70
|
"idna>=3.15",
|
|
71
71
|
"regex>=2024.11.6",
|
|
72
72
|
"rich>=13.0.0",
|
|
@@ -90,6 +90,7 @@ tokens = [
|
|
|
90
90
|
]
|
|
91
91
|
mcp = [
|
|
92
92
|
"mcp>=1.0.0",
|
|
93
|
+
"pyjwt>=2.13.0",
|
|
93
94
|
"python-multipart>=0.0.27",
|
|
94
95
|
"starlette>=1.0.1",
|
|
95
96
|
]
|
|
@@ -102,6 +103,7 @@ all = [
|
|
|
102
103
|
"trafilatura>=1.12.0",
|
|
103
104
|
"tiktoken>=0.7.0",
|
|
104
105
|
"mcp>=1.0.0",
|
|
106
|
+
"pyjwt>=2.13.0",
|
|
105
107
|
"python-multipart>=0.0.27",
|
|
106
108
|
"starlette>=1.0.1",
|
|
107
109
|
]
|
|
@@ -197,45 +197,6 @@ class CacheManager:
|
|
|
197
197
|
content = content.encode("utf-8")
|
|
198
198
|
return hashlib.sha256(content).hexdigest()
|
|
199
199
|
|
|
200
|
-
def has_changed(
|
|
201
|
-
self,
|
|
202
|
-
url: str,
|
|
203
|
-
content: str | None = None,
|
|
204
|
-
etag: str | None = None,
|
|
205
|
-
last_modified: str | None = None,
|
|
206
|
-
) -> bool:
|
|
207
|
-
"""Check if content has changed since last fetch.
|
|
208
|
-
|
|
209
|
-
Args:
|
|
210
|
-
url: URL to check
|
|
211
|
-
content: Current content (for checksum comparison)
|
|
212
|
-
etag: HTTP ETag header
|
|
213
|
-
last_modified: HTTP Last-Modified header
|
|
214
|
-
|
|
215
|
-
Returns:
|
|
216
|
-
True if content has changed or is new
|
|
217
|
-
"""
|
|
218
|
-
if url not in self.manifest:
|
|
219
|
-
return True # New URL
|
|
220
|
-
|
|
221
|
-
cached = self.manifest[url]
|
|
222
|
-
|
|
223
|
-
# Check ETag first (most reliable)
|
|
224
|
-
if etag and "etag" in cached:
|
|
225
|
-
return bool(etag != cached["etag"])
|
|
226
|
-
|
|
227
|
-
# Check Last-Modified
|
|
228
|
-
if last_modified and "last_modified" in cached:
|
|
229
|
-
return bool(last_modified != cached["last_modified"])
|
|
230
|
-
|
|
231
|
-
# Check content checksum
|
|
232
|
-
if content and "checksum" in cached:
|
|
233
|
-
current_checksum = self.compute_checksum(content)
|
|
234
|
-
return bool(current_checksum != cached["checksum"])
|
|
235
|
-
|
|
236
|
-
# Can't determine, assume changed
|
|
237
|
-
return True
|
|
238
|
-
|
|
239
200
|
def update_cache(
|
|
240
201
|
self,
|
|
241
202
|
url: str,
|
|
@@ -302,14 +263,6 @@ class CacheManager:
|
|
|
302
263
|
"""
|
|
303
264
|
return self._state.fetched_urls.copy()
|
|
304
265
|
|
|
305
|
-
def get_failed_urls(self) -> set[str]:
|
|
306
|
-
"""Get set of URLs that failed to fetch.
|
|
307
|
-
|
|
308
|
-
Returns:
|
|
309
|
-
Set of failed URLs (copy to prevent mutation)
|
|
310
|
-
"""
|
|
311
|
-
return self._state.failed_urls.copy()
|
|
312
|
-
|
|
313
266
|
def start_session(self) -> None:
|
|
314
267
|
"""Start a new fetch session.
|
|
315
268
|
|
|
@@ -319,30 +272,6 @@ class CacheManager:
|
|
|
319
272
|
self._state.last_run = utc_now_iso()
|
|
320
273
|
self._state_dirty = True
|
|
321
274
|
|
|
322
|
-
def clear_state(self) -> None:
|
|
323
|
-
"""Clear incremental state (for fresh start).
|
|
324
|
-
|
|
325
|
-
Note:
|
|
326
|
-
This immediately flushes to disk.
|
|
327
|
-
"""
|
|
328
|
-
self._state = _InternalState()
|
|
329
|
-
self._state_dirty = True
|
|
330
|
-
self.flush()
|
|
331
|
-
logger.info("Cleared incremental state")
|
|
332
|
-
|
|
333
|
-
def get_cache_stats(self) -> dict[str, str | int | None]:
|
|
334
|
-
"""Get cache statistics.
|
|
335
|
-
|
|
336
|
-
Returns:
|
|
337
|
-
Dict with cache stats
|
|
338
|
-
"""
|
|
339
|
-
return {
|
|
340
|
-
"cached_urls": len(self.manifest),
|
|
341
|
-
"fetched_urls": len(self._state.fetched_urls),
|
|
342
|
-
"failed_urls": len(self._state.failed_urls),
|
|
343
|
-
"last_run": self._state.last_run,
|
|
344
|
-
}
|
|
345
|
-
|
|
346
275
|
def evict_expired(self, ttl_days: int | None = None) -> int:
|
|
347
276
|
"""Remove cache entries older than TTL.
|
|
348
277
|
|
|
@@ -378,28 +307,6 @@ class CacheManager:
|
|
|
378
307
|
|
|
379
308
|
return len(to_remove)
|
|
380
309
|
|
|
381
|
-
def is_fetched(self, url: str) -> bool:
|
|
382
|
-
"""Check if URL has been fetched (O(1) lookup).
|
|
383
|
-
|
|
384
|
-
Args:
|
|
385
|
-
url: URL to check
|
|
386
|
-
|
|
387
|
-
Returns:
|
|
388
|
-
True if URL was successfully fetched
|
|
389
|
-
"""
|
|
390
|
-
return url in self._state.fetched_urls
|
|
391
|
-
|
|
392
|
-
def is_failed(self, url: str) -> bool:
|
|
393
|
-
"""Check if URL has failed (O(1) lookup).
|
|
394
|
-
|
|
395
|
-
Args:
|
|
396
|
-
url: URL to check
|
|
397
|
-
|
|
398
|
-
Returns:
|
|
399
|
-
True if URL failed to fetch
|
|
400
|
-
"""
|
|
401
|
-
return url in self._state.failed_urls
|
|
402
|
-
|
|
403
310
|
# Resume capability methods
|
|
404
311
|
|
|
405
312
|
def save_discovered_urls(self, urls: list[str], start_url: str) -> None:
|
|
@@ -483,22 +390,3 @@ class CacheManager:
|
|
|
483
390
|
logger.info("Cleared discovered URLs file")
|
|
484
391
|
except Exception as e:
|
|
485
392
|
logger.warning(f"Could not clear discovered URLs file: {e}")
|
|
486
|
-
|
|
487
|
-
def has_resume_data(self, start_url: str) -> bool:
|
|
488
|
-
"""Check if there is resume data available for the given URL.
|
|
489
|
-
|
|
490
|
-
Args:
|
|
491
|
-
start_url: The starting URL to check
|
|
492
|
-
|
|
493
|
-
Returns:
|
|
494
|
-
True if resume data exists and matches the start URL
|
|
495
|
-
"""
|
|
496
|
-
if not self.discovered_urls_file.exists():
|
|
497
|
-
return False
|
|
498
|
-
|
|
499
|
-
try:
|
|
500
|
-
with open(self.discovered_urls_file, encoding="utf-8") as f:
|
|
501
|
-
data: DiscoveredUrlsState = json.load(f)
|
|
502
|
-
return data.get("start_url") == start_url
|
|
503
|
-
except Exception:
|
|
504
|
-
return False
|
|
@@ -90,23 +90,6 @@ class StreamingDeduplicator:
|
|
|
90
90
|
self._seen[content_hash] = url
|
|
91
91
|
return (True, None)
|
|
92
92
|
|
|
93
|
-
async def is_duplicate(self, content: str | bytes) -> bool:
|
|
94
|
-
"""
|
|
95
|
-
Check if content has been seen before (read-only).
|
|
96
|
-
|
|
97
|
-
Unlike check_and_register, this doesn't register the content.
|
|
98
|
-
Useful for checking without committing to save.
|
|
99
|
-
|
|
100
|
-
Args:
|
|
101
|
-
content: The content to check (str or bytes)
|
|
102
|
-
|
|
103
|
-
Returns:
|
|
104
|
-
True if content has been seen before
|
|
105
|
-
"""
|
|
106
|
-
content_hash = self.compute_hash(content)
|
|
107
|
-
async with self._lock:
|
|
108
|
-
return content_hash in self._seen
|
|
109
|
-
|
|
110
93
|
def get_stats(self) -> dict:
|
|
111
94
|
"""
|
|
112
95
|
Get deduplication statistics.
|
|
@@ -215,6 +215,17 @@ class FrontmatterBuilder:
|
|
|
215
215
|
)
|
|
216
216
|
"""
|
|
217
217
|
|
|
218
|
+
@staticmethod
|
|
219
|
+
def _inline(value: Any) -> str:
|
|
220
|
+
"""Collapse CR/LF/NUL so an interpolated value stays on its own YAML line.
|
|
221
|
+
|
|
222
|
+
Page-supplied metadata (JSON-LD ``keywords``, OpenGraph ``article:tag``,
|
|
223
|
+
etc.) flows into frontmatter. Without this, a newline in a tag/keyword
|
|
224
|
+
would break out of the list item and inject attacker-chosen top-level
|
|
225
|
+
keys (e.g. ``draft: true``) into the document frontmatter.
|
|
226
|
+
"""
|
|
227
|
+
return str(value).replace("\r", " ").replace("\n", " ").replace("\x00", " ")
|
|
228
|
+
|
|
218
229
|
def build(
|
|
219
230
|
self,
|
|
220
231
|
title: str | None = None,
|
|
@@ -238,28 +249,32 @@ class FrontmatterBuilder:
|
|
|
238
249
|
|
|
239
250
|
if title:
|
|
240
251
|
# Escape quotes in title
|
|
241
|
-
safe_title = title.replace('"', '\\"')
|
|
252
|
+
safe_title = self._inline(title).replace('"', '\\"')
|
|
242
253
|
lines.append(f'title: "{safe_title}"')
|
|
243
254
|
|
|
244
255
|
if url:
|
|
245
|
-
lines.append(f"source: {url}")
|
|
256
|
+
lines.append(f"source: {self._inline(url)}")
|
|
246
257
|
|
|
247
258
|
if description:
|
|
248
259
|
# Escape quotes and truncate long descriptions
|
|
249
|
-
safe_desc = description[:500].replace('"', '\\"')
|
|
260
|
+
safe_desc = self._inline(description[:500]).replace('"', '\\"')
|
|
250
261
|
lines.append(f'description: "{safe_desc}"')
|
|
251
262
|
|
|
252
263
|
for key, value in extra_fields.items():
|
|
253
264
|
if value is not None:
|
|
254
265
|
if isinstance(value, str):
|
|
255
|
-
safe_value = value.replace('"', '\\"')
|
|
266
|
+
safe_value = self._inline(value).replace('"', '\\"')
|
|
256
267
|
lines.append(f'{key}: "{safe_value}"')
|
|
257
268
|
elif isinstance(value, (list, tuple)):
|
|
258
269
|
lines.append(f"{key}:")
|
|
259
270
|
for item in value:
|
|
260
|
-
|
|
271
|
+
# Quote + escape each item so a hostile tag/keyword (from
|
|
272
|
+
# page JSON-LD / OpenGraph) stays a single YAML string and
|
|
273
|
+
# cannot inject new keys or produce malformed frontmatter.
|
|
274
|
+
safe_item = self._inline(item).replace('"', '\\"')
|
|
275
|
+
lines.append(f' - "{safe_item}"')
|
|
261
276
|
else:
|
|
262
|
-
lines.append(f"{key}: {value}")
|
|
277
|
+
lines.append(f"{key}: {self._inline(value)}")
|
|
263
278
|
|
|
264
279
|
lines.append("---")
|
|
265
280
|
return "\n".join(lines) + "\n\n"
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Shared HTML fetch helper for link-discovery components."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
from ..http.protocols import HttpClient
|
|
8
|
+
|
|
9
|
+
logger = logging.getLogger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
async def fetch_html(client: HttpClient, url: str) -> bytes | None:
|
|
13
|
+
"""Fetch ``url`` and return its body iff it is a successful HTML response.
|
|
14
|
+
|
|
15
|
+
Returns ``None`` on network error, non-200 status, or a non-HTML content
|
|
16
|
+
type. Shared by the crawler and the static/enhanced link extractors so the
|
|
17
|
+
fetch and content-type gate stay identical across all three.
|
|
18
|
+
"""
|
|
19
|
+
try:
|
|
20
|
+
response = await client.get(url, timeout=30.0)
|
|
21
|
+
|
|
22
|
+
if response.status_code != 200:
|
|
23
|
+
return None
|
|
24
|
+
|
|
25
|
+
content_type = response.content_type.lower()
|
|
26
|
+
if "text/html" not in content_type and "application/xhtml" not in content_type:
|
|
27
|
+
return None
|
|
28
|
+
|
|
29
|
+
return response.content
|
|
30
|
+
|
|
31
|
+
except Exception as e:
|
|
32
|
+
logger.debug(f"Failed to fetch {url}: {e}")
|
|
33
|
+
return None
|
|
@@ -13,6 +13,7 @@ from bs4 import BeautifulSoup
|
|
|
13
13
|
from ..http.protocols import HttpClient
|
|
14
14
|
from ..security.robots import RobotsChecker
|
|
15
15
|
from ..security.url_validator import UrlValidator
|
|
16
|
+
from ._fetch import fetch_html
|
|
16
17
|
from .filters import DomainFilter, PatternFilter, SeenUrlTracker
|
|
17
18
|
|
|
18
19
|
if TYPE_CHECKING:
|
|
@@ -114,33 +115,6 @@ class LinkCrawler:
|
|
|
114
115
|
|
|
115
116
|
return links
|
|
116
117
|
|
|
117
|
-
async def _fetch_page(self, url: str) -> bytes | None:
|
|
118
|
-
"""
|
|
119
|
-
Fetch a page for link extraction.
|
|
120
|
-
|
|
121
|
-
Args:
|
|
122
|
-
url: URL to fetch
|
|
123
|
-
|
|
124
|
-
Returns:
|
|
125
|
-
HTML content as bytes, or None if fetch failed
|
|
126
|
-
"""
|
|
127
|
-
try:
|
|
128
|
-
response = await self._client.get(url, timeout=30.0)
|
|
129
|
-
|
|
130
|
-
if response.status_code != 200:
|
|
131
|
-
return None
|
|
132
|
-
|
|
133
|
-
# Only process HTML content
|
|
134
|
-
content_type = response.content_type.lower()
|
|
135
|
-
if "text/html" not in content_type and "application/xhtml" not in content_type:
|
|
136
|
-
return None
|
|
137
|
-
|
|
138
|
-
return response.content
|
|
139
|
-
|
|
140
|
-
except Exception as e:
|
|
141
|
-
logger.debug(f"Failed to fetch {url}: {e}")
|
|
142
|
-
return None
|
|
143
|
-
|
|
144
118
|
def _should_crawl(self, url: str) -> bool:
|
|
145
119
|
"""
|
|
146
120
|
Check if a URL should be crawled.
|
|
@@ -225,7 +199,7 @@ class LinkCrawler:
|
|
|
225
199
|
links = await self._link_extractor.extract_links(current_url)
|
|
226
200
|
else:
|
|
227
201
|
# Built-in extraction with separate fetch
|
|
228
|
-
html = await self._fetch_page(current_url)
|
|
202
|
+
html = await fetch_html(self._client, current_url)
|
|
229
203
|
if html is None:
|
|
230
204
|
continue
|
|
231
205
|
links = self._extract_links(html, current_url)
|
|
@@ -10,6 +10,7 @@ from urllib.parse import urljoin, urlparse
|
|
|
10
10
|
from bs4 import BeautifulSoup
|
|
11
11
|
|
|
12
12
|
from ...http.protocols import HttpClient
|
|
13
|
+
from .._fetch import fetch_html
|
|
13
14
|
|
|
14
15
|
logger = logging.getLogger(__name__)
|
|
15
16
|
|
|
@@ -104,7 +105,7 @@ class EnhancedLinkExtractor:
|
|
|
104
105
|
List of absolute URLs found on the page
|
|
105
106
|
"""
|
|
106
107
|
if content is None:
|
|
107
|
-
content = await self._fetch_content(url)
|
|
108
|
+
content = await fetch_html(self._client, url)
|
|
108
109
|
if content is None:
|
|
109
110
|
return []
|
|
110
111
|
|
|
@@ -137,32 +138,6 @@ class EnhancedLinkExtractor:
|
|
|
137
138
|
|
|
138
139
|
return list(links)
|
|
139
140
|
|
|
140
|
-
async def _fetch_content(self, url: str) -> bytes | None:
|
|
141
|
-
"""
|
|
142
|
-
Fetch page content for link extraction.
|
|
143
|
-
|
|
144
|
-
Args:
|
|
145
|
-
url: URL to fetch
|
|
146
|
-
|
|
147
|
-
Returns:
|
|
148
|
-
HTML content as bytes, or None if fetch failed
|
|
149
|
-
"""
|
|
150
|
-
try:
|
|
151
|
-
response = await self._client.get(url, timeout=30.0)
|
|
152
|
-
|
|
153
|
-
if response.status_code != 200:
|
|
154
|
-
return None
|
|
155
|
-
|
|
156
|
-
content_type = response.content_type.lower()
|
|
157
|
-
if "text/html" not in content_type and "application/xhtml" not in content_type:
|
|
158
|
-
return None
|
|
159
|
-
|
|
160
|
-
return response.content
|
|
161
|
-
|
|
162
|
-
except Exception as e:
|
|
163
|
-
logger.debug(f"Failed to fetch {url}: {e}")
|
|
164
|
-
return None
|
|
165
|
-
|
|
166
141
|
def _extract_standard_links(self, soup: BeautifulSoup, base_url: str) -> list[str]:
|
|
167
142
|
"""Extract links from standard <a href> tags."""
|
|
168
143
|
links = []
|
|
@@ -8,6 +8,7 @@ from urllib.parse import urljoin, urlparse
|
|
|
8
8
|
from bs4 import BeautifulSoup
|
|
9
9
|
|
|
10
10
|
from ...http.protocols import HttpClient
|
|
11
|
+
from .._fetch import fetch_html
|
|
11
12
|
|
|
12
13
|
logger = logging.getLogger(__name__)
|
|
13
14
|
|
|
@@ -56,39 +57,12 @@ class StaticLinkExtractor:
|
|
|
56
57
|
List of absolute URLs found on the page
|
|
57
58
|
"""
|
|
58
59
|
if content is None:
|
|
59
|
-
content = await self._fetch_content(url)
|
|
60
|
+
content = await fetch_html(self._client, url)
|
|
60
61
|
if content is None:
|
|
61
62
|
return []
|
|
62
63
|
|
|
63
64
|
return self._parse_links(content, url)
|
|
64
65
|
|
|
65
|
-
async def _fetch_content(self, url: str) -> bytes | None:
|
|
66
|
-
"""
|
|
67
|
-
Fetch page content for link extraction.
|
|
68
|
-
|
|
69
|
-
Args:
|
|
70
|
-
url: URL to fetch
|
|
71
|
-
|
|
72
|
-
Returns:
|
|
73
|
-
HTML content as bytes, or None if fetch failed
|
|
74
|
-
"""
|
|
75
|
-
try:
|
|
76
|
-
response = await self._client.get(url, timeout=30.0)
|
|
77
|
-
|
|
78
|
-
if response.status_code != 200:
|
|
79
|
-
return None
|
|
80
|
-
|
|
81
|
-
# Only process HTML content
|
|
82
|
-
content_type = response.content_type.lower()
|
|
83
|
-
if "text/html" not in content_type and "application/xhtml" not in content_type:
|
|
84
|
-
return None
|
|
85
|
-
|
|
86
|
-
return response.content
|
|
87
|
-
|
|
88
|
-
except Exception as e:
|
|
89
|
-
logger.debug(f"Failed to fetch {url}: {e}")
|
|
90
|
-
return None
|
|
91
|
-
|
|
92
66
|
def _parse_links(self, html: bytes, base_url: str) -> list[str]:
|
|
93
67
|
"""
|
|
94
68
|
Parse links from HTML content.
|
|
@@ -7,6 +7,7 @@ from collections.abc import AsyncIterator
|
|
|
7
7
|
from urllib.parse import urlparse
|
|
8
8
|
|
|
9
9
|
from defusedxml import ElementTree
|
|
10
|
+
from defusedxml.common import DefusedXmlException
|
|
10
11
|
|
|
11
12
|
from ..http.protocols import HttpClient
|
|
12
13
|
from ..security.robots import RobotsChecker
|
|
@@ -160,7 +161,7 @@ class SitemapDiscoverer:
|
|
|
160
161
|
|
|
161
162
|
try:
|
|
162
163
|
root = ElementTree.fromstring(content)
|
|
163
|
-
except ElementTree.ParseError as e:
|
|
164
|
+
except (ElementTree.ParseError, DefusedXmlException) as e:
|
|
164
165
|
logger.warning(f"Failed to parse sitemap XML: {e}")
|
|
165
166
|
return page_urls, sitemap_urls
|
|
166
167
|
|
|
@@ -234,6 +234,38 @@ class AsyncHttpClient:
|
|
|
234
234
|
|
|
235
235
|
return {key: value for key, value in headers.items() if key.lower() not in self.SENSITIVE_HEADERS}
|
|
236
236
|
|
|
237
|
+
def _next_redirect(
|
|
238
|
+
self,
|
|
239
|
+
response: aiohttp.ClientResponse,
|
|
240
|
+
current_url: str,
|
|
241
|
+
current_headers: dict[str, str],
|
|
242
|
+
redirect_count: int,
|
|
243
|
+
original_url: str,
|
|
244
|
+
) -> tuple[str, dict[str, str], int] | None:
|
|
245
|
+
"""Re-validate and follow one redirect hop, shared by GET and HEAD.
|
|
246
|
+
|
|
247
|
+
Returns the updated ``(url, headers, redirect_count)`` when ``response``
|
|
248
|
+
is a redirect, or ``None`` when it is not. Raises ``ValueError`` once
|
|
249
|
+
``MAX_REDIRECTS`` is exceeded. Centralising this keeps GET and HEAD on
|
|
250
|
+
identical redirect/SSRF re-validation.
|
|
251
|
+
"""
|
|
252
|
+
location = response.headers.get("Location")
|
|
253
|
+
if response.status in self.REDIRECT_STATUS_CODES and location:
|
|
254
|
+
if redirect_count >= self.MAX_REDIRECTS:
|
|
255
|
+
raise ValueError(f"Too many redirects while fetching {original_url}")
|
|
256
|
+
|
|
257
|
+
redirect_url = self._resolve_redirect_url(current_url, location)
|
|
258
|
+
new_headers = self._headers_for_url(
|
|
259
|
+
self._headers_for_redirect(
|
|
260
|
+
current_headers,
|
|
261
|
+
current_url,
|
|
262
|
+
redirect_url,
|
|
263
|
+
),
|
|
264
|
+
redirect_url,
|
|
265
|
+
)
|
|
266
|
+
return redirect_url, new_headers, redirect_count + 1
|
|
267
|
+
return None
|
|
268
|
+
|
|
237
269
|
async def __aenter__(self) -> AsyncHttpClient:
|
|
238
270
|
"""Enter async context and create session."""
|
|
239
271
|
resolver: AbstractResolver | None = None
|
|
@@ -382,22 +414,11 @@ class AsyncHttpClient:
|
|
|
382
414
|
allow_redirects=False,
|
|
383
415
|
) as response,
|
|
384
416
|
):
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
redirect_url = self._resolve_redirect_url(current_url, location)
|
|
391
|
-
current_headers = self._headers_for_url(
|
|
392
|
-
self._headers_for_redirect(
|
|
393
|
-
current_headers,
|
|
394
|
-
current_url,
|
|
395
|
-
redirect_url,
|
|
396
|
-
),
|
|
397
|
-
redirect_url,
|
|
398
|
-
)
|
|
399
|
-
current_url = redirect_url
|
|
400
|
-
redirect_count += 1
|
|
417
|
+
redirect = self._next_redirect(
|
|
418
|
+
response, current_url, current_headers, redirect_count, url
|
|
419
|
+
)
|
|
420
|
+
if redirect is not None:
|
|
421
|
+
current_url, current_headers, redirect_count = redirect
|
|
401
422
|
continue
|
|
402
423
|
|
|
403
424
|
if response.status in self.RETRYABLE_STATUS_CODES:
|
|
@@ -504,22 +525,9 @@ class AsyncHttpClient:
|
|
|
504
525
|
allow_redirects=False,
|
|
505
526
|
) as response,
|
|
506
527
|
):
|
|
507
|
-
|
|
508
|
-
if
|
|
509
|
-
|
|
510
|
-
raise ValueError(f"Too many redirects while fetching {url}")
|
|
511
|
-
|
|
512
|
-
redirect_url = self._resolve_redirect_url(current_url, location)
|
|
513
|
-
current_headers = self._headers_for_url(
|
|
514
|
-
self._headers_for_redirect(
|
|
515
|
-
current_headers,
|
|
516
|
-
current_url,
|
|
517
|
-
redirect_url,
|
|
518
|
-
),
|
|
519
|
-
redirect_url,
|
|
520
|
-
)
|
|
521
|
-
current_url = redirect_url
|
|
522
|
-
redirect_count += 1
|
|
528
|
+
redirect = self._next_redirect(response, current_url, current_headers, redirect_count, url)
|
|
529
|
+
if redirect is not None:
|
|
530
|
+
current_url, current_headers, redirect_count = redirect
|
|
523
531
|
continue
|
|
524
532
|
|
|
525
533
|
return HttpResponse(
|
|
@@ -529,17 +537,3 @@ class AsyncHttpClient:
|
|
|
529
537
|
headers=dict(response.headers),
|
|
530
538
|
url=str(response.url),
|
|
531
539
|
)
|
|
532
|
-
|
|
533
|
-
def decode_content(self, response: HttpResponse) -> str:
|
|
534
|
-
"""
|
|
535
|
-
Decode response content to string.
|
|
536
|
-
|
|
537
|
-
Convenience method that uses intelligent encoding detection.
|
|
538
|
-
|
|
539
|
-
Args:
|
|
540
|
-
response: HttpResponse to decode
|
|
541
|
-
|
|
542
|
-
Returns:
|
|
543
|
-
Decoded string content
|
|
544
|
-
"""
|
|
545
|
-
return self._decode_content(response.content, response.content_type)
|