aloop-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aloop might be problematic.
- agent/__init__.py +0 -0
- agent/agent.py +182 -0
- agent/base.py +406 -0
- agent/context.py +126 -0
- agent/todo.py +149 -0
- agent/tool_executor.py +54 -0
- agent/verification.py +135 -0
- aloop-0.1.0.dist-info/METADATA +246 -0
- aloop-0.1.0.dist-info/RECORD +62 -0
- aloop-0.1.0.dist-info/WHEEL +5 -0
- aloop-0.1.0.dist-info/entry_points.txt +2 -0
- aloop-0.1.0.dist-info/licenses/LICENSE +21 -0
- aloop-0.1.0.dist-info/top_level.txt +9 -0
- cli.py +19 -0
- config.py +146 -0
- interactive.py +865 -0
- llm/__init__.py +51 -0
- llm/base.py +26 -0
- llm/compat.py +226 -0
- llm/content_utils.py +309 -0
- llm/litellm_adapter.py +450 -0
- llm/message_types.py +245 -0
- llm/model_manager.py +265 -0
- llm/retry.py +95 -0
- main.py +246 -0
- memory/__init__.py +20 -0
- memory/compressor.py +554 -0
- memory/manager.py +538 -0
- memory/serialization.py +82 -0
- memory/short_term.py +88 -0
- memory/token_tracker.py +203 -0
- memory/types.py +51 -0
- tools/__init__.py +6 -0
- tools/advanced_file_ops.py +557 -0
- tools/base.py +51 -0
- tools/calculator.py +50 -0
- tools/code_navigator.py +975 -0
- tools/explore.py +254 -0
- tools/file_ops.py +150 -0
- tools/git_tools.py +791 -0
- tools/notify.py +69 -0
- tools/parallel_execute.py +420 -0
- tools/session_manager.py +205 -0
- tools/shell.py +147 -0
- tools/shell_background.py +470 -0
- tools/smart_edit.py +491 -0
- tools/todo.py +130 -0
- tools/web_fetch.py +673 -0
- tools/web_search.py +61 -0
- utils/__init__.py +15 -0
- utils/logger.py +105 -0
- utils/model_pricing.py +49 -0
- utils/runtime.py +75 -0
- utils/terminal_ui.py +422 -0
- utils/tui/__init__.py +39 -0
- utils/tui/command_registry.py +49 -0
- utils/tui/components.py +306 -0
- utils/tui/input_handler.py +393 -0
- utils/tui/model_ui.py +204 -0
- utils/tui/progress.py +292 -0
- utils/tui/status_bar.py +178 -0
- utils/tui/theme.py +165 -0
tools/web_fetch.py
ADDED
@@ -0,0 +1,673 @@

```python
"""Web fetch tool for retrieving content from URLs."""

from __future__ import annotations

import asyncio
import hashlib
import ipaddress
import json
import os
import socket
import time
from dataclasses import dataclass
from email.message import Message
from typing import Any, TypedDict
from urllib.parse import urljoin, urlparse

import aiofiles
import aiofiles.os
import httpx
import trafilatura
from lxml import html as lxml_html

from .base import BaseTool

MAX_RESPONSE_BYTES = 5 * 1024 * 1024
DEFAULT_TIMEOUT_SECONDS = 30
MAX_TIMEOUT_SECONDS = 120
MAX_REDIRECTS = 5
ALLOWED_PORTS = {80, 443}
BLOCKED_HOSTS = {"localhost"}
BLOCKED_SUFFIXES = (".local",)
TEXT_CONTENT_TYPES = {"", "text/plain", "text/markdown"}
HTML_STRIP_XPATH = "//script|//style|//noscript|//iframe|//object|//embed"
ACCEPT_HEADERS = {
    "markdown": "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1",
    "text": "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1",
    "html": "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1",
}

# Cache configuration
CACHE_TTL_SECONDS = 300  # 5 minutes TTL
CACHE_MAX_ENTRIES = 100


def _get_encoding_from_headers(headers: httpx.Headers) -> str | None:
    content_type = headers.get("content-type")
    if not content_type:
        return None
    message = Message()
    message["content-type"] = content_type
    charset = message.get_param("charset")
    if not charset:
        return None
    if isinstance(charset, tuple):
        charset = charset[0]
    if not isinstance(charset, str):
        return None
    return charset.strip("'\"")
```
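The helper above leans on the stdlib email parser to pull the charset parameter out of a Content-Type header. A minimal illustration of that parsing step (sketch only, not part of the package):

```python
# Illustrative sketch of the mechanism _get_encoding_from_headers relies on.
from email.message import Message

message = Message()
message["content-type"] = 'text/html; charset="ISO-8859-1"'
print(message.get_param("charset"))  # ISO-8859-1 (get_param unquotes the value)
```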
```python
class ExtractedLink(TypedDict):
    """Structured link extracted from HTML content."""

    href: str
    text: str
    type: str  # "internal", "external", "anchor", "mailto", "tel"


@dataclass
class CacheEntry:
    """Cache entry for URL fetch results."""

    result: dict[str, Any]
    timestamp: float
    ttl: float


class WebFetchCache:
    """Simple in-memory cache for web fetch results."""

    def __init__(self, max_entries: int = CACHE_MAX_ENTRIES):
        self._cache: dict[str, CacheEntry] = {}
        self._max_entries = max_entries

    def _make_key(self, url: str, format: str) -> str:
        """Create a cache key from URL and format."""
        return hashlib.md5(f"{url}:{format}".encode()).hexdigest()

    def get(self, url: str, format: str) -> dict[str, Any] | None:
        """Get cached result if valid."""
        key = self._make_key(url, format)
        entry = self._cache.get(key)
        if entry is None:
            return None
        # Check TTL
        if time.time() - entry.timestamp > entry.ttl:
            del self._cache[key]
            return None
        return entry.result

    def set(
        self, url: str, format: str, result: dict[str, Any], ttl: float = CACHE_TTL_SECONDS
    ) -> None:
        """Cache a result."""
        # Evict oldest entries if at capacity
        if len(self._cache) >= self._max_entries:
            oldest_key = min(self._cache, key=lambda k: self._cache[k].timestamp)
            del self._cache[oldest_key]

        key = self._make_key(url, format)
        self._cache[key] = CacheEntry(result=result, timestamp=time.time(), ttl=ttl)

    def clear(self) -> None:
        """Clear all cached entries."""
        self._cache.clear()


# Global cache instance
_url_cache = WebFetchCache()
```
|
|
123
|
+
"""Structured error for WebFetchTool."""
|
|
124
|
+
|
|
125
|
+
def __init__(self, code: str, message: str, metadata: dict[str, Any] | None = None):
|
|
126
|
+
super().__init__(message)
|
|
127
|
+
self.code = code
|
|
128
|
+
self.message = message
|
|
129
|
+
self.metadata = metadata or {}
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
class WebFetchTool(BaseTool):
|
|
133
|
+
"""Fetch content from URLs and convert to various formats."""
|
|
134
|
+
|
|
135
|
+
@property
|
|
136
|
+
def name(self) -> str:
|
|
137
|
+
return "web_fetch"
|
|
138
|
+
|
|
139
|
+
@property
|
|
140
|
+
def description(self) -> str:
|
|
141
|
+
return (
|
|
142
|
+
"Fetch content from a URL and convert to markdown, text, or HTML. "
|
|
143
|
+
"Returns JSON with ok/output/metadata or error_code/message. "
|
|
144
|
+
"Use save_to parameter to save content to a local file for later grep/search. "
|
|
145
|
+
"IMPORTANT: When using save_to, the response will NOT contain the actual content - "
|
|
146
|
+
"only a confirmation that the file was saved. You MUST use read_file or grep_content "
|
|
147
|
+
"to access the saved content before using it."
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
@property
|
|
151
|
+
def parameters(self) -> dict[str, Any]:
|
|
152
|
+
return {
|
|
153
|
+
"url": {
|
|
154
|
+
"type": "string",
|
|
155
|
+
"description": "The URL to fetch content from (must start with http:// or https://)",
|
|
156
|
+
},
|
|
157
|
+
"format": {
|
|
158
|
+
"type": "string",
|
|
159
|
+
"enum": ["markdown", "text", "html"],
|
|
160
|
+
"description": "Output format - markdown by default",
|
|
161
|
+
"default": "markdown",
|
|
162
|
+
},
|
|
163
|
+
"timeout": {
|
|
164
|
+
"type": "number",
|
|
165
|
+
"description": "Optional timeout in seconds (max 120)",
|
|
166
|
+
"default": DEFAULT_TIMEOUT_SECONDS,
|
|
167
|
+
},
|
|
168
|
+
"save_to": {
|
|
169
|
+
"type": "string",
|
|
170
|
+
"description": (
|
|
171
|
+
"Optional file path to save the fetched content. "
|
|
172
|
+
"Parent directories will be created if needed. "
|
|
173
|
+
"WARNING: When this parameter is used, the response will only contain "
|
|
174
|
+
"a save confirmation, NOT the actual content. You MUST call read_file "
|
|
175
|
+
"or grep_content afterwards to access the content."
|
|
176
|
+
),
|
|
177
|
+
},
|
|
178
|
+
"use_cache": {
|
|
179
|
+
"type": "boolean",
|
|
180
|
+
"description": (
|
|
181
|
+
"Whether to use cached results if available (default: true). "
|
|
182
|
+
"Cache TTL is 5 minutes. Set to false to force a fresh fetch."
|
|
183
|
+
),
|
|
184
|
+
"default": True,
|
|
185
|
+
},
|
|
186
|
+
}
|
|
187
|
+
|
|
188
|
+
async def execute(self, **kwargs) -> str:
|
|
189
|
+
"""Execute web fetch with format conversion."""
|
|
190
|
+
url = kwargs.get("url")
|
|
191
|
+
format_value = kwargs.get("format", "markdown")
|
|
192
|
+
timeout = kwargs.get("timeout")
|
|
193
|
+
save_to = kwargs.get("save_to")
|
|
194
|
+
use_cache = kwargs.get("use_cache", True)
|
|
195
|
+
start = time.time()
|
|
196
|
+
|
|
197
|
+
try:
|
|
198
|
+
if not url:
|
|
199
|
+
raise WebFetchError("invalid_url", "URL is required", {"requested_url": url})
|
|
200
|
+
|
|
201
|
+
# Check cache first (only if use_cache is True and save_to is not specified)
|
|
202
|
+
if use_cache and not save_to:
|
|
203
|
+
cached_result = _url_cache.get(url, format_value)
|
|
204
|
+
if cached_result is not None:
|
|
205
|
+
# Update metadata to indicate cache hit
|
|
206
|
+
result = cached_result.copy()
|
|
207
|
+
result["metadata"] = result.get("metadata", {}).copy()
|
|
208
|
+
result["metadata"]["cache_hit"] = True
|
|
209
|
+
result["metadata"]["duration_ms"] = int((time.time() - start) * 1000)
|
|
210
|
+
return json.dumps(result, ensure_ascii=False)
|
|
211
|
+
|
|
212
|
+
result = await self._execute(
|
|
213
|
+
url=url, format=format_value, timeout=timeout, start_time=start, save_to=save_to
|
|
214
|
+
)
|
|
215
|
+
|
|
216
|
+
# Cache successful results (only if save_to is not specified)
|
|
217
|
+
if result.get("ok") and not save_to:
|
|
218
|
+
_url_cache.set(url, format_value, result)
|
|
219
|
+
result["metadata"]["cache_hit"] = False
|
|
220
|
+
|
|
221
|
+
return json.dumps(result, ensure_ascii=False)
|
|
222
|
+
except WebFetchError as exc:
|
|
223
|
+
error_result = {
|
|
224
|
+
"ok": False,
|
|
225
|
+
"error_code": exc.code,
|
|
226
|
+
"message": exc.message,
|
|
227
|
+
"metadata": exc.metadata,
|
|
228
|
+
}
|
|
229
|
+
return json.dumps(error_result, ensure_ascii=False)
|
|
230
|
+
except Exception as exc:
|
|
231
|
+
error_result = {
|
|
232
|
+
"ok": False,
|
|
233
|
+
"error_code": "unexpected_error",
|
|
234
|
+
"message": str(exc),
|
|
235
|
+
"metadata": {"requested_url": url},
|
|
236
|
+
}
|
|
237
|
+
return json.dumps(error_result, ensure_ascii=False)
|
|
238
|
+
|
|
239
|
+
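How a caller might drive `execute` and unpack the JSON envelope it returns. This is a hypothetical sketch: constructing `WebFetchTool()` with no arguments is an assumption, since `BaseTool` lives elsewhere in the package.

```python
# Hypothetical caller-side sketch; WebFetchTool() construction is an assumption.
import asyncio
import json

async def demo() -> None:
    tool = WebFetchTool()
    raw = await tool.execute(url="https://example.com", format="markdown")
    result = json.loads(raw)
    if result["ok"]:
        print(result["title"])
        print(result["metadata"]["status_code"], result["metadata"]["cache_hit"])
    else:
        print(result["error_code"], result["message"])

# asyncio.run(demo())  # requires network access
```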
```python
    async def _execute(
        self,
        url: str,
        format: str,
        timeout: float | None,
        start_time: float,
        save_to: str | None = None,
    ) -> dict[str, Any]:
        if format not in {"markdown", "text", "html"}:
            raise WebFetchError(
                "invalid_format",
                "Format must be one of markdown, text, or html",
                {"requested_format": format},
            )

        parsed_url = await self._validate_url(url)
        timeout_seconds = (
            DEFAULT_TIMEOUT_SECONDS
            if timeout is None
            else max(1.0, min(float(timeout), MAX_TIMEOUT_SECONDS))
        )

        response, content_bytes, redirects = await self._fetch_with_redirects(
            parsed_url.geturl(), format, timeout_seconds
        )

        content_type_header = response.headers.get("content-type", "")
        content_type = content_type_header.partition(";")[0].strip().lower()
        encoding = (
            _get_encoding_from_headers(response.headers)
            or getattr(response, "encoding", None)
            or getattr(response, "apparent_encoding", None)
            or "utf-8"
        )
        content = content_bytes.decode(encoding, errors="replace")

        output, title = self._convert_content(content, content_type, format, url)

        # Extract structured links from HTML content
        links: list[ExtractedLink] = []
        if "html" in content_type or content_type in {"application/xhtml+xml"}:
            links = self._extract_links(content, str(response.url))

        # Save content to file if save_to is specified
        saved_path = None
        if save_to:
            saved_path = await self._save_content(output, save_to)

        metadata: dict[str, Any] = {
            "requested_url": url,
            "final_url": str(response.url),
            "status_code": response.status_code,
            "content_type": content_type_header,
            "charset": encoding,
            "fetched_bytes": len(content_bytes),
            "output_chars": len(output),
            "redirects": redirects,
            "truncated": len(content_bytes) >= MAX_RESPONSE_BYTES,
            "duration_ms": int((time.time() - start_time) * 1000),
        }

        # Add extracted links to metadata if any were found
        if links:
            metadata["links"] = links

        if saved_path:
            metadata["saved_to"] = saved_path
            # When saved to file, don't include full content in result to save tokens
            # User can access content via read_file or grep_content
            return {
                "ok": True,
                "title": title,
                "output": f"Content saved to: {saved_path}\nUse read_file or grep_content to access the content.",
                "metadata": metadata,
            }

        # Check content size before returning
        estimated_tokens = len(output) // self.CHARS_PER_TOKEN
        if estimated_tokens > self.MAX_TOKENS:
            raise WebFetchError(
                "content_too_large",
                f"Page content (~{estimated_tokens} tokens) exceeds maximum allowed ({self.MAX_TOKENS}). "
                f"Use save_to parameter to save content to a file, then use grep_content to search it.",
                {"estimated_tokens": estimated_tokens, "max_tokens": self.MAX_TOKENS},
            )

        return {
            "ok": True,
            "title": title,
            "output": output,
            "metadata": metadata,
        }

    async def _save_content(self, content: str, save_to: str) -> str:
        """Save content to a file, creating parent directories if needed.

        Args:
            content: Content to save
            save_to: File path to save to

        Returns:
            Absolute path where content was saved
        """
        # Get absolute path
        abs_path = os.path.abspath(save_to)

        # Create parent directories if needed
        parent_dir = os.path.dirname(abs_path)
        if parent_dir:
            await aiofiles.os.makedirs(parent_dir, exist_ok=True)

        # Write content to file
        async with aiofiles.open(abs_path, "w", encoding="utf-8") as f:
            await f.write(content)

        return abs_path
```
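The `save_to` branch above returns only a confirmation string plus a `saved_to` entry in the metadata; the page text itself has to be read back separately. A hypothetical sketch of that flow (the no-argument construction and the file path are assumptions):

```python
# Hypothetical save_to flow; not part of the package.
import asyncio
import json

async def save_demo() -> None:
    tool = WebFetchTool()
    raw = await tool.execute(url="https://example.com", save_to="/tmp/example.md")
    result = json.loads(raw)
    print(result["output"])                    # confirmation text, not the page content
    print(result["metadata"].get("saved_to"))  # absolute path the converted content was written to

# asyncio.run(save_demo())  # requires network access and write access to /tmp
```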
```python
    async def _validate_url(self, url: str):
        if not url.startswith(("http://", "https://")):
            raise WebFetchError(
                "invalid_url",
                "URL must start with http:// or https://",
                {"requested_url": url},
            )

        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc or not parsed.hostname:
            raise WebFetchError(
                "invalid_url",
                "URL is missing a valid hostname",
                {"requested_url": url},
            )

        if parsed.username or parsed.password:
            raise WebFetchError(
                "invalid_url",
                "URLs with embedded credentials are not allowed",
                {"requested_url": url},
            )

        host = parsed.hostname.lower()
        if host in BLOCKED_HOSTS or host.endswith(BLOCKED_SUFFIXES):
            raise WebFetchError(
                "blocked_host",
                "Access to localhost or .local domains is not allowed",
                {"host": host},
            )

        port = parsed.port or (443 if parsed.scheme == "https" else 80)
        if port not in ALLOWED_PORTS:
            raise WebFetchError(
                "blocked_host",
                "Access to the requested port is not allowed",
                {"host": host, "port": port},
            )

        await self._ensure_host_safe(host, port)
        return parsed

    async def _ensure_host_safe(self, host: str, port: int) -> None:
        try:
            ip = ipaddress.ip_address(host)
        except ValueError:
            ip = None

        if ip:
            if not self._is_ip_allowed(ip):
                raise WebFetchError(
                    "blocked_ip",
                    "Access to the requested IP address is not allowed",
                    {"ip": str(ip)},
                )
            return

        resolved_ips = await self._resolve_host(host, port)
        if not resolved_ips:
            raise WebFetchError(
                "dns_error",
                "Failed to resolve hostname",
                {"host": host},
            )

        for resolved in resolved_ips:
            ip_value = ipaddress.ip_address(resolved)
            if not self._is_ip_allowed(ip_value):
                raise WebFetchError(
                    "blocked_ip",
                    "Resolved IP address is not allowed",
                    {"host": host, "ip": str(ip_value)},
                )

    async def _resolve_host(self, host: str, port: int) -> list[str]:
        try:
            loop = asyncio.get_running_loop()
            infos = await loop.getaddrinfo(host, port, type=socket.SOCK_STREAM)
        except socket.gaierror:
            return []
        addresses = []
        for info in infos:
            sockaddr = info[4]
            if sockaddr:
                addresses.append(str(sockaddr[0]))
        return list(dict.fromkeys(addresses))

    def _is_ip_allowed(self, ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
        if str(ip) == "169.254.169.254":
            return False
        return ip.is_global
```
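The SSRF guard above boils down to `ipaddress.ip_address(...).is_global` plus an explicit block on the cloud metadata address. A small standalone illustration (stdlib only, not part of the package):

```python
# Standalone illustration of the checks _is_ip_allowed performs.
import ipaddress

for addr in ("93.184.216.34", "127.0.0.1", "10.0.0.5", "169.254.169.254"):
    print(addr, ipaddress.ip_address(addr).is_global)
# 93.184.216.34   True   -> allowed (public address)
# 127.0.0.1       False  -> rejected (loopback)
# 10.0.0.5        False  -> rejected (private range)
# 169.254.169.254 False  -> rejected (link-local; also blocked explicitly above)
```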
```python
    async def _fetch_with_redirects(
        self, url: str, format: str, timeout: float
    ) -> tuple[httpx.Response, bytes, list[str]]:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; aloop/1.0)",
            "Accept": ACCEPT_HEADERS.get(format, "*/*"),
            "Accept-Language": "en-US,en;q=0.9",
        }

        redirects: list[str] = []
        current_url = url
        timeout_config = httpx.Timeout(timeout)
        async with httpx.AsyncClient(follow_redirects=False, timeout=timeout_config) as client:
            for _ in range(MAX_REDIRECTS + 1):
                parsed = await self._validate_url(current_url)
                try:
                    response, content_bytes = await self._request(
                        client, parsed.geturl(), headers, timeout
                    )
                except httpx.TimeoutException as exc:
                    raise WebFetchError(
                        "timeout",
                        "Request timed out",
                        {"requested_url": current_url},
                    ) from exc
                except httpx.RequestError as exc:
                    raise WebFetchError(
                        "request_error",
                        "Failed to fetch URL",
                        {"requested_url": current_url, "error": str(exc)},
                    ) from exc

                if response.status_code in {301, 302, 303, 307, 308}:
                    location = response.headers.get("location")
                    if not location:
                        raise WebFetchError(
                            "http_error",
                            "Redirect response missing Location header",
                            {"requested_url": current_url, "status_code": response.status_code},
                        )
                    next_url = urljoin(current_url, location)
                    try:
                        await self._validate_url(next_url)
                    except WebFetchError as exc:
                        raise WebFetchError(
                            "redirect_blocked",
                            "Redirect target is not allowed",
                            {
                                "requested_url": current_url,
                                "redirect_url": next_url,
                                "redirect_error": exc.code,
                            },
                        ) from exc
                    redirects.append(next_url)
                    current_url = next_url
                    continue

                if response.status_code >= 400:
                    raise WebFetchError(
                        "http_error",
                        f"Request failed with status code: {response.status_code}",
                        {"requested_url": current_url, "status_code": response.status_code},
                    )

                return response, content_bytes, redirects

        raise WebFetchError(
            "redirect_blocked",
            "Too many redirects",
            {"requested_url": url, "redirects": redirects},
        )

    async def _request(
        self, client: httpx.AsyncClient, url: str, headers: dict[str, str], timeout: float
    ) -> tuple[httpx.Response, bytes]:
        async with client.stream("GET", url, headers=headers, follow_redirects=False) as response:
            if response.status_code in {301, 302, 303, 307, 308}:
                return response, b""
            if response.status_code >= 400:
                return response, b""
            content_bytes = await self._read_response(response)
            return response, content_bytes

    async def _read_response(self, response: httpx.Response) -> bytes:
        content_length = response.headers.get("content-length")
        if content_length:
            try:
                if int(content_length) > MAX_RESPONSE_BYTES:
                    raise WebFetchError(
                        "too_large",
                        "Response too large (exceeds 5MB limit)",
                        {"content_length": int(content_length)},
                    )
            except ValueError:
                pass

        chunks: list[bytes] = []
        total = 0
        async for chunk in response.aiter_bytes():
            if not chunk:
                continue
            total += len(chunk)
            if total > MAX_RESPONSE_BYTES:
                raise WebFetchError(
                    "too_large",
                    "Response too large (exceeds 5MB limit)",
                    {"fetched_bytes": total},
                )
            chunks.append(chunk)
        return b"".join(chunks)
```
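The same size-capped streaming pattern used by `_request` and `_read_response`, reduced to one self-contained function for clarity. A sketch only, assuming nothing beyond httpx being installed:

```python
# Minimal sketch of size-capped streaming with httpx, mirroring _read_response.
import httpx

LIMIT_BYTES = 5 * 1024 * 1024

async def fetch_capped(url: str) -> bytes:
    async with httpx.AsyncClient(follow_redirects=False) as client:
        async with client.stream("GET", url) as response:
            chunks: list[bytes] = []
            total = 0
            async for chunk in response.aiter_bytes():
                total += len(chunk)
                if total > LIMIT_BYTES:
                    raise ValueError("response exceeds the 5MB cap")
                chunks.append(chunk)
            return b"".join(chunks)
```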
```python
    def _convert_content(
        self, content: str, content_type: str, format: str, url: str
    ) -> tuple[str, str]:
        if "html" in content_type or content_type in {"application/xhtml+xml"}:
            if format == "html":
                return content, f"{url} ({content_type})"
            return self._render_html(content, format, url)

        if content_type.startswith("text/") or content_type in TEXT_CONTENT_TYPES:
            return content, f"{url} ({content_type or 'text/plain'})"

        if content_type == "application/json":
            try:
                payload = json.loads(content)
                formatted = json.dumps(payload, ensure_ascii=False, indent=2)
            except json.JSONDecodeError:
                formatted = content
            return formatted, f"{url} ({content_type})"

        raise WebFetchError(
            "unsupported_content_type",
            "Unsupported content type for web fetch",
            {"content_type": content_type},
        )

    def _render_html(self, html: str, format: str, url: str) -> tuple[str, str]:
        # Extract title from HTML
        title = url
        try:
            tree = lxml_html.fromstring(html)
            title_elem = tree.find(".//title")
            if title_elem is not None and title_elem.text:
                title = title_elem.text.strip()
        except Exception:
            pass

        if format == "markdown":
            # Use trafilatura for markdown extraction
            result = trafilatura.extract(
                html,
                include_links=True,
                include_formatting=True,
                include_tables=True,
                output_format="markdown",
            )
            if result:
                return result.strip(), title
            # Fallback: extract as text if markdown fails
            result = trafilatura.extract(html)
            return result.strip() if result else "", title

        # For text format, use trafilatura without formatting
        result = trafilatura.extract(html, include_links=False, include_formatting=False)
        return result.strip() if result else "", title
```
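`_render_html` delegates the HTML-to-markdown step to trafilatura. A quick illustration of that call (sketch only, assuming trafilatura is installed); note that `trafilatura.extract` can return `None` for very short or boilerplate-only pages, which is why the method keeps a plain-text fallback:

```python
# Sketch of the trafilatura call _render_html makes; exact output wording varies by
# version, and extract() may return None for trivial inputs.
import trafilatura

page = """<html><head><title>Release notes</title></head><body><article>
<h1>Release notes</h1>
<p>Version 0.1.0 adds a <a href="/docs/web-fetch">web_fetch</a> tool with caching.</p>
</article></body></html>"""

markdown = trafilatura.extract(
    page,
    include_links=True,
    include_formatting=True,
    include_tables=True,
    output_format="markdown",
)
print(markdown)  # markdown body, or None if extraction found nothing usable
```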
```python
    def _extract_links(self, html: str, base_url: str, max_links: int = 50) -> list[ExtractedLink]:
        """Extract structured links from HTML content.

        Args:
            html: Raw HTML content
            base_url: Base URL for resolving relative links
            max_links: Maximum number of links to extract (default: 50)

        Returns:
            List of ExtractedLink dictionaries with href, text, and type
        """
        links: list[ExtractedLink] = []
        try:
            tree = lxml_html.fromstring(html)
            parsed_base = urlparse(base_url)
            base_domain = parsed_base.netloc.lower()

            for anchor in tree.iter("a"):
                href = anchor.get("href")
                if not href:
                    continue

                # Get link text (strip whitespace and normalize)
                text = anchor.text_content().strip() if anchor.text_content() else ""
                text = " ".join(text.split())  # Normalize whitespace
                if not text:
                    # Try alt text from child img if no text
                    img = anchor.find(".//img")
                    if img is not None:
                        text = img.get("alt", "").strip()

                # Determine link type
                link_type: str
                if href.startswith("#"):
                    link_type = "anchor"
                elif href.startswith("mailto:"):
                    link_type = "mailto"
                elif href.startswith("tel:"):
                    link_type = "tel"
                elif href.startswith(("javascript:", "data:")):
                    continue  # Skip javascript and data URLs
                else:
                    # Resolve relative URLs
                    resolved_href = urljoin(base_url, href)
                    parsed_href = urlparse(resolved_href)
                    href_domain = parsed_href.netloc.lower()

                    link_type = "internal" if href_domain == base_domain else "external"
                    href = resolved_href

                links.append(ExtractedLink(href=href, text=text[:200], type=link_type))

                if len(links) >= max_links:
                    break

        except Exception:
            # If parsing fails, return empty list
            pass

        return links
```
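To close out the file, a hypothetical check of the link classification above. Constructing `WebFetchTool()` without arguments is again an assumption about `BaseTool`; the snippet is illustrative only and not part of the package:

```python
# Hypothetical exercise of _extract_links.
html_doc = """<html><body>
<a href="/docs">Docs</a>
<a href="https://other.example.org/page">Elsewhere</a>
<a href="#top">Top</a>
<a href="mailto:team@example.com">Mail</a>
</body></html>"""

tool = WebFetchTool()
for link in tool._extract_links(html_doc, "https://example.com/index.html"):
    print(link["type"], link["href"], link["text"])
# internal https://example.com/docs Docs
# external https://other.example.org/page Elsewhere
# anchor   #top Top
# mailto   mailto:team@example.com Mail
```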