aloop-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of aloop might be problematic.

Files changed (62)
  1. agent/__init__.py +0 -0
  2. agent/agent.py +182 -0
  3. agent/base.py +406 -0
  4. agent/context.py +126 -0
  5. agent/todo.py +149 -0
  6. agent/tool_executor.py +54 -0
  7. agent/verification.py +135 -0
  8. aloop-0.1.0.dist-info/METADATA +246 -0
  9. aloop-0.1.0.dist-info/RECORD +62 -0
  10. aloop-0.1.0.dist-info/WHEEL +5 -0
  11. aloop-0.1.0.dist-info/entry_points.txt +2 -0
  12. aloop-0.1.0.dist-info/licenses/LICENSE +21 -0
  13. aloop-0.1.0.dist-info/top_level.txt +9 -0
  14. cli.py +19 -0
  15. config.py +146 -0
  16. interactive.py +865 -0
  17. llm/__init__.py +51 -0
  18. llm/base.py +26 -0
  19. llm/compat.py +226 -0
  20. llm/content_utils.py +309 -0
  21. llm/litellm_adapter.py +450 -0
  22. llm/message_types.py +245 -0
  23. llm/model_manager.py +265 -0
  24. llm/retry.py +95 -0
  25. main.py +246 -0
  26. memory/__init__.py +20 -0
  27. memory/compressor.py +554 -0
  28. memory/manager.py +538 -0
  29. memory/serialization.py +82 -0
  30. memory/short_term.py +88 -0
  31. memory/token_tracker.py +203 -0
  32. memory/types.py +51 -0
  33. tools/__init__.py +6 -0
  34. tools/advanced_file_ops.py +557 -0
  35. tools/base.py +51 -0
  36. tools/calculator.py +50 -0
  37. tools/code_navigator.py +975 -0
  38. tools/explore.py +254 -0
  39. tools/file_ops.py +150 -0
  40. tools/git_tools.py +791 -0
  41. tools/notify.py +69 -0
  42. tools/parallel_execute.py +420 -0
  43. tools/session_manager.py +205 -0
  44. tools/shell.py +147 -0
  45. tools/shell_background.py +470 -0
  46. tools/smart_edit.py +491 -0
  47. tools/todo.py +130 -0
  48. tools/web_fetch.py +673 -0
  49. tools/web_search.py +61 -0
  50. utils/__init__.py +15 -0
  51. utils/logger.py +105 -0
  52. utils/model_pricing.py +49 -0
  53. utils/runtime.py +75 -0
  54. utils/terminal_ui.py +422 -0
  55. utils/tui/__init__.py +39 -0
  56. utils/tui/command_registry.py +49 -0
  57. utils/tui/components.py +306 -0
  58. utils/tui/input_handler.py +393 -0
  59. utils/tui/model_ui.py +204 -0
  60. utils/tui/progress.py +292 -0
  61. utils/tui/status_bar.py +178 -0
  62. utils/tui/theme.py +165 -0
tools/web_fetch.py ADDED
@@ -0,0 +1,673 @@
"""Web fetch tool for retrieving content from URLs."""

from __future__ import annotations

import asyncio
import hashlib
import ipaddress
import json
import os
import socket
import time
from dataclasses import dataclass
from email.message import Message
from typing import Any, TypedDict
from urllib.parse import urljoin, urlparse

import aiofiles
import aiofiles.os
import httpx
import trafilatura
from lxml import html as lxml_html

from .base import BaseTool

MAX_RESPONSE_BYTES = 5 * 1024 * 1024
DEFAULT_TIMEOUT_SECONDS = 30
MAX_TIMEOUT_SECONDS = 120
MAX_REDIRECTS = 5
ALLOWED_PORTS = {80, 443}
BLOCKED_HOSTS = {"localhost"}
BLOCKED_SUFFIXES = (".local",)
TEXT_CONTENT_TYPES = {"", "text/plain", "text/markdown"}
HTML_STRIP_XPATH = "//script|//style|//noscript|//iframe|//object|//embed"
ACCEPT_HEADERS = {
    "markdown": "text/markdown;q=1.0, text/x-markdown;q=0.9, text/plain;q=0.8, text/html;q=0.7, */*;q=0.1",
    "text": "text/plain;q=1.0, text/markdown;q=0.9, text/html;q=0.8, */*;q=0.1",
    "html": "text/html;q=1.0, application/xhtml+xml;q=0.9, text/plain;q=0.8, text/markdown;q=0.7, */*;q=0.1",
}

# Cache configuration
CACHE_TTL_SECONDS = 300  # 5 minutes TTL
CACHE_MAX_ENTRIES = 100


def _get_encoding_from_headers(headers: httpx.Headers) -> str | None:
    content_type = headers.get("content-type")
    if not content_type:
        return None
    message = Message()
    message["content-type"] = content_type
    charset = message.get_param("charset")
    if not charset:
        return None
    if isinstance(charset, tuple):
        charset = charset[0]
    if not isinstance(charset, str):
        return None
    return charset.strip("'\"")


class ExtractedLink(TypedDict):
    """Structured link extracted from HTML content."""

    href: str
    text: str
    type: str  # "internal", "external", "anchor", "mailto", "tel"


@dataclass
class CacheEntry:
    """Cache entry for URL fetch results."""

    result: dict[str, Any]
    timestamp: float
    ttl: float


class WebFetchCache:
    """Simple in-memory cache for web fetch results."""

    def __init__(self, max_entries: int = CACHE_MAX_ENTRIES):
        self._cache: dict[str, CacheEntry] = {}
        self._max_entries = max_entries

    def _make_key(self, url: str, format: str) -> str:
        """Create a cache key from URL and format."""
        return hashlib.md5(f"{url}:{format}".encode()).hexdigest()

    def get(self, url: str, format: str) -> dict[str, Any] | None:
        """Get cached result if valid."""
        key = self._make_key(url, format)
        entry = self._cache.get(key)
        if entry is None:
            return None
        # Check TTL
        if time.time() - entry.timestamp > entry.ttl:
            del self._cache[key]
            return None
        return entry.result

    def set(
        self, url: str, format: str, result: dict[str, Any], ttl: float = CACHE_TTL_SECONDS
    ) -> None:
        """Cache a result."""
        # Evict oldest entries if at capacity
        if len(self._cache) >= self._max_entries:
            oldest_key = min(self._cache, key=lambda k: self._cache[k].timestamp)
            del self._cache[oldest_key]

        key = self._make_key(url, format)
        self._cache[key] = CacheEntry(result=result, timestamp=time.time(), ttl=ttl)

    def clear(self) -> None:
        """Clear all cached entries."""
        self._cache.clear()


# Global cache instance
_url_cache = WebFetchCache()


class WebFetchError(Exception):
    """Structured error for WebFetchTool."""

    def __init__(self, code: str, message: str, metadata: dict[str, Any] | None = None):
        super().__init__(message)
        self.code = code
        self.message = message
        self.metadata = metadata or {}


class WebFetchTool(BaseTool):
    """Fetch content from URLs and convert to various formats."""

    @property
    def name(self) -> str:
        return "web_fetch"

    @property
    def description(self) -> str:
        return (
            "Fetch content from a URL and convert to markdown, text, or HTML. "
            "Returns JSON with ok/output/metadata or error_code/message. "
            "Use save_to parameter to save content to a local file for later grep/search. "
            "IMPORTANT: When using save_to, the response will NOT contain the actual content - "
            "only a confirmation that the file was saved. You MUST use read_file or grep_content "
            "to access the saved content before using it."
        )

    @property
    def parameters(self) -> dict[str, Any]:
        return {
            "url": {
                "type": "string",
                "description": "The URL to fetch content from (must start with http:// or https://)",
            },
            "format": {
                "type": "string",
                "enum": ["markdown", "text", "html"],
                "description": "Output format - markdown by default",
                "default": "markdown",
            },
            "timeout": {
                "type": "number",
                "description": "Optional timeout in seconds (max 120)",
                "default": DEFAULT_TIMEOUT_SECONDS,
            },
            "save_to": {
                "type": "string",
                "description": (
                    "Optional file path to save the fetched content. "
                    "Parent directories will be created if needed. "
                    "WARNING: When this parameter is used, the response will only contain "
                    "a save confirmation, NOT the actual content. You MUST call read_file "
                    "or grep_content afterwards to access the content."
                ),
            },
            "use_cache": {
                "type": "boolean",
                "description": (
                    "Whether to use cached results if available (default: true). "
                    "Cache TTL is 5 minutes. Set to false to force a fresh fetch."
                ),
                "default": True,
            },
        }

    async def execute(self, **kwargs) -> str:
        """Execute web fetch with format conversion."""
        url = kwargs.get("url")
        format_value = kwargs.get("format", "markdown")
        timeout = kwargs.get("timeout")
        save_to = kwargs.get("save_to")
        use_cache = kwargs.get("use_cache", True)
        start = time.time()

        try:
            if not url:
                raise WebFetchError("invalid_url", "URL is required", {"requested_url": url})

            # Check cache first (only if use_cache is True and save_to is not specified)
            if use_cache and not save_to:
                cached_result = _url_cache.get(url, format_value)
                if cached_result is not None:
                    # Update metadata to indicate cache hit
                    result = cached_result.copy()
                    result["metadata"] = result.get("metadata", {}).copy()
                    result["metadata"]["cache_hit"] = True
                    result["metadata"]["duration_ms"] = int((time.time() - start) * 1000)
                    return json.dumps(result, ensure_ascii=False)

            result = await self._execute(
                url=url, format=format_value, timeout=timeout, start_time=start, save_to=save_to
            )

            # Cache successful results (only if save_to is not specified)
            if result.get("ok") and not save_to:
                _url_cache.set(url, format_value, result)
                result["metadata"]["cache_hit"] = False

            return json.dumps(result, ensure_ascii=False)
        except WebFetchError as exc:
            error_result = {
                "ok": False,
                "error_code": exc.code,
                "message": exc.message,
                "metadata": exc.metadata,
            }
            return json.dumps(error_result, ensure_ascii=False)
        except Exception as exc:
            error_result = {
                "ok": False,
                "error_code": "unexpected_error",
                "message": str(exc),
                "metadata": {"requested_url": url},
            }
            return json.dumps(error_result, ensure_ascii=False)

    async def _execute(
        self,
        url: str,
        format: str,
        timeout: float | None,
        start_time: float,
        save_to: str | None = None,
    ) -> dict[str, Any]:
        if format not in {"markdown", "text", "html"}:
            raise WebFetchError(
                "invalid_format",
                "Format must be one of markdown, text, or html",
                {"requested_format": format},
            )

        parsed_url = await self._validate_url(url)
        timeout_seconds = (
            DEFAULT_TIMEOUT_SECONDS
            if timeout is None
            else max(1.0, min(float(timeout), MAX_TIMEOUT_SECONDS))
        )

        response, content_bytes, redirects = await self._fetch_with_redirects(
            parsed_url.geturl(), format, timeout_seconds
        )

        content_type_header = response.headers.get("content-type", "")
        content_type = content_type_header.partition(";")[0].strip().lower()
        encoding = (
            _get_encoding_from_headers(response.headers)
            or getattr(response, "encoding", None)
            or getattr(response, "apparent_encoding", None)
            or "utf-8"
        )
        content = content_bytes.decode(encoding, errors="replace")

        output, title = self._convert_content(content, content_type, format, url)

        # Extract structured links from HTML content
        links: list[ExtractedLink] = []
        if "html" in content_type or content_type in {"application/xhtml+xml"}:
            links = self._extract_links(content, str(response.url))

        # Save content to file if save_to is specified
        saved_path = None
        if save_to:
            saved_path = await self._save_content(output, save_to)

        metadata: dict[str, Any] = {
            "requested_url": url,
            "final_url": str(response.url),
            "status_code": response.status_code,
            "content_type": content_type_header,
            "charset": encoding,
            "fetched_bytes": len(content_bytes),
            "output_chars": len(output),
            "redirects": redirects,
            "truncated": len(content_bytes) >= MAX_RESPONSE_BYTES,
            "duration_ms": int((time.time() - start_time) * 1000),
        }

        # Add extracted links to metadata if any were found
        if links:
            metadata["links"] = links

        if saved_path:
            metadata["saved_to"] = saved_path
            # When saved to file, don't include full content in result to save tokens
            # User can access content via read_file or grep_content
            return {
                "ok": True,
                "title": title,
                "output": f"Content saved to: {saved_path}\nUse read_file or grep_content to access the content.",
                "metadata": metadata,
            }

        # Check content size before returning
        estimated_tokens = len(output) // self.CHARS_PER_TOKEN
        if estimated_tokens > self.MAX_TOKENS:
            raise WebFetchError(
                "content_too_large",
                f"Page content (~{estimated_tokens} tokens) exceeds maximum allowed ({self.MAX_TOKENS}). "
                f"Use save_to parameter to save content to a file, then use grep_content to search it.",
                {"estimated_tokens": estimated_tokens, "max_tokens": self.MAX_TOKENS},
            )

        return {
            "ok": True,
            "title": title,
            "output": output,
            "metadata": metadata,
        }

    async def _save_content(self, content: str, save_to: str) -> str:
        """Save content to a file, creating parent directories if needed.

        Args:
            content: Content to save
            save_to: File path to save to

        Returns:
            Absolute path where content was saved
        """
        # Get absolute path
        abs_path = os.path.abspath(save_to)

        # Create parent directories if needed
        parent_dir = os.path.dirname(abs_path)
        if parent_dir:
            await aiofiles.os.makedirs(parent_dir, exist_ok=True)

        # Write content to file
        async with aiofiles.open(abs_path, "w", encoding="utf-8") as f:
            await f.write(content)

        return abs_path

    async def _validate_url(self, url: str):
        if not url.startswith(("http://", "https://")):
            raise WebFetchError(
                "invalid_url",
                "URL must start with http:// or https://",
                {"requested_url": url},
            )

        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc or not parsed.hostname:
            raise WebFetchError(
                "invalid_url",
                "URL is missing a valid hostname",
                {"requested_url": url},
            )

        if parsed.username or parsed.password:
            raise WebFetchError(
                "invalid_url",
                "URLs with embedded credentials are not allowed",
                {"requested_url": url},
            )

        host = parsed.hostname.lower()
        if host in BLOCKED_HOSTS or host.endswith(BLOCKED_SUFFIXES):
            raise WebFetchError(
                "blocked_host",
                "Access to localhost or .local domains is not allowed",
                {"host": host},
            )

        port = parsed.port or (443 if parsed.scheme == "https" else 80)
        if port not in ALLOWED_PORTS:
            raise WebFetchError(
                "blocked_host",
                "Access to the requested port is not allowed",
                {"host": host, "port": port},
            )

        await self._ensure_host_safe(host, port)
        return parsed

    async def _ensure_host_safe(self, host: str, port: int) -> None:
        try:
            ip = ipaddress.ip_address(host)
        except ValueError:
            ip = None

        if ip:
            if not self._is_ip_allowed(ip):
                raise WebFetchError(
                    "blocked_ip",
                    "Access to the requested IP address is not allowed",
                    {"ip": str(ip)},
                )
            return

        resolved_ips = await self._resolve_host(host, port)
        if not resolved_ips:
            raise WebFetchError(
                "dns_error",
                "Failed to resolve hostname",
                {"host": host},
            )

        for resolved in resolved_ips:
            ip_value = ipaddress.ip_address(resolved)
            if not self._is_ip_allowed(ip_value):
                raise WebFetchError(
                    "blocked_ip",
                    "Resolved IP address is not allowed",
                    {"host": host, "ip": str(ip_value)},
                )

    async def _resolve_host(self, host: str, port: int) -> list[str]:
        try:
            loop = asyncio.get_running_loop()
            infos = await loop.getaddrinfo(host, port, type=socket.SOCK_STREAM)
        except socket.gaierror:
            return []
        addresses = []
        for info in infos:
            sockaddr = info[4]
            if sockaddr:
                addresses.append(str(sockaddr[0]))
        return list(dict.fromkeys(addresses))

    def _is_ip_allowed(self, ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
        if str(ip) == "169.254.169.254":
            return False
        return ip.is_global

    async def _fetch_with_redirects(
        self, url: str, format: str, timeout: float
    ) -> tuple[httpx.Response, bytes, list[str]]:
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; aloop/1.0)",
            "Accept": ACCEPT_HEADERS.get(format, "*/*"),
            "Accept-Language": "en-US,en;q=0.9",
        }

        redirects: list[str] = []
        current_url = url
        timeout_config = httpx.Timeout(timeout)
        async with httpx.AsyncClient(follow_redirects=False, timeout=timeout_config) as client:
            for _ in range(MAX_REDIRECTS + 1):
                parsed = await self._validate_url(current_url)
                try:
                    response, content_bytes = await self._request(
                        client, parsed.geturl(), headers, timeout
                    )
                except httpx.TimeoutException as exc:
                    raise WebFetchError(
                        "timeout",
                        "Request timed out",
                        {"requested_url": current_url},
                    ) from exc
                except httpx.RequestError as exc:
                    raise WebFetchError(
                        "request_error",
                        "Failed to fetch URL",
                        {"requested_url": current_url, "error": str(exc)},
                    ) from exc

                if response.status_code in {301, 302, 303, 307, 308}:
                    location = response.headers.get("location")
                    if not location:
                        raise WebFetchError(
                            "http_error",
                            "Redirect response missing Location header",
                            {"requested_url": current_url, "status_code": response.status_code},
                        )
                    next_url = urljoin(current_url, location)
                    try:
                        await self._validate_url(next_url)
                    except WebFetchError as exc:
                        raise WebFetchError(
                            "redirect_blocked",
                            "Redirect target is not allowed",
                            {
                                "requested_url": current_url,
                                "redirect_url": next_url,
                                "redirect_error": exc.code,
                            },
                        ) from exc
                    redirects.append(next_url)
                    current_url = next_url
                    continue

                if response.status_code >= 400:
                    raise WebFetchError(
                        "http_error",
                        f"Request failed with status code: {response.status_code}",
                        {"requested_url": current_url, "status_code": response.status_code},
                    )

                return response, content_bytes, redirects

        raise WebFetchError(
            "redirect_blocked",
            "Too many redirects",
            {"requested_url": url, "redirects": redirects},
        )

    async def _request(
        self, client: httpx.AsyncClient, url: str, headers: dict[str, str], timeout: float
    ) -> tuple[httpx.Response, bytes]:
        async with client.stream("GET", url, headers=headers, follow_redirects=False) as response:
            if response.status_code in {301, 302, 303, 307, 308}:
                return response, b""
            if response.status_code >= 400:
                return response, b""
            content_bytes = await self._read_response(response)
            return response, content_bytes

    async def _read_response(self, response: httpx.Response) -> bytes:
        content_length = response.headers.get("content-length")
        if content_length:
            try:
                if int(content_length) > MAX_RESPONSE_BYTES:
                    raise WebFetchError(
                        "too_large",
                        "Response too large (exceeds 5MB limit)",
                        {"content_length": int(content_length)},
                    )
            except ValueError:
                pass

        chunks: list[bytes] = []
        total = 0
        async for chunk in response.aiter_bytes():
            if not chunk:
                continue
            total += len(chunk)
            if total > MAX_RESPONSE_BYTES:
                raise WebFetchError(
                    "too_large",
                    "Response too large (exceeds 5MB limit)",
                    {"fetched_bytes": total},
                )
            chunks.append(chunk)
        return b"".join(chunks)

    def _convert_content(
        self, content: str, content_type: str, format: str, url: str
    ) -> tuple[str, str]:
        if "html" in content_type or content_type in {"application/xhtml+xml"}:
            if format == "html":
                return content, f"{url} ({content_type})"
            return self._render_html(content, format, url)

        if content_type.startswith("text/") or content_type in TEXT_CONTENT_TYPES:
            return content, f"{url} ({content_type or 'text/plain'})"

        if content_type == "application/json":
            try:
                payload = json.loads(content)
                formatted = json.dumps(payload, ensure_ascii=False, indent=2)
            except json.JSONDecodeError:
                formatted = content
            return formatted, f"{url} ({content_type})"

        raise WebFetchError(
            "unsupported_content_type",
            "Unsupported content type for web fetch",
            {"content_type": content_type},
        )

    def _render_html(self, html: str, format: str, url: str) -> tuple[str, str]:
        # Extract title from HTML
        title = url
        try:
            tree = lxml_html.fromstring(html)
            title_elem = tree.find(".//title")
            if title_elem is not None and title_elem.text:
                title = title_elem.text.strip()
        except Exception:
            pass

        if format == "markdown":
            # Use trafilatura for markdown extraction
            result = trafilatura.extract(
                html,
                include_links=True,
                include_formatting=True,
                include_tables=True,
                output_format="markdown",
            )
            if result:
                return result.strip(), title
            # Fallback: extract as text if markdown fails
            result = trafilatura.extract(html)
            return result.strip() if result else "", title

        # For text format, use trafilatura without formatting
        result = trafilatura.extract(html, include_links=False, include_formatting=False)
        return result.strip() if result else "", title

    def _extract_links(self, html: str, base_url: str, max_links: int = 50) -> list[ExtractedLink]:
        """Extract structured links from HTML content.

        Args:
            html: Raw HTML content
            base_url: Base URL for resolving relative links
            max_links: Maximum number of links to extract (default: 50)

        Returns:
            List of ExtractedLink dictionaries with href, text, and type
        """
        links: list[ExtractedLink] = []
        try:
            tree = lxml_html.fromstring(html)
            parsed_base = urlparse(base_url)
            base_domain = parsed_base.netloc.lower()

            for anchor in tree.iter("a"):
                href = anchor.get("href")
                if not href:
                    continue

                # Get link text (strip whitespace and normalize)
                text = anchor.text_content().strip() if anchor.text_content() else ""
                text = " ".join(text.split())  # Normalize whitespace
                if not text:
                    # Try alt text from child img if no text
                    img = anchor.find(".//img")
                    if img is not None:
                        text = img.get("alt", "").strip()

                # Determine link type
                link_type: str
                if href.startswith("#"):
                    link_type = "anchor"
                elif href.startswith("mailto:"):
                    link_type = "mailto"
                elif href.startswith("tel:"):
                    link_type = "tel"
                elif href.startswith(("javascript:", "data:")):
                    continue  # Skip javascript and data URLs
                else:
                    # Resolve relative URLs
                    resolved_href = urljoin(base_url, href)
                    parsed_href = urlparse(resolved_href)
                    href_domain = parsed_href.netloc.lower()

                    link_type = "internal" if href_domain == base_domain else "external"
                    href = resolved_href

                links.append(ExtractedLink(href=href, text=text[:200], type=link_type))

                if len(links) >= max_links:
                    break

        except Exception:
            # If parsing fails, return empty list
            pass

        return links
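
For orientation, a minimal usage sketch of the tool above. It assumes the module is importable as tools.web_fetch and that BaseTool can be instantiated with no constructor arguments; neither is shown in this diff, so treat both as assumptions. The read_file/grep_content follow-up required by save_to is only indicated in a comment.

# Illustrative sketch, not part of the package.
import asyncio
import json

from tools.web_fetch import WebFetchTool  # assumed import path


async def main() -> None:
    tool = WebFetchTool()  # assumes BaseTool needs no constructor arguments

    # Plain fetch: execute() returns a JSON string with ok/title/output/metadata
    # on success, or ok=False with error_code/message on failure.
    raw = await tool.execute(url="https://example.com", format="markdown")
    result = json.loads(raw)
    if result["ok"]:
        print(result["title"])
        print(result["metadata"]["final_url"], result["metadata"]["duration_ms"], "ms")
    else:
        print(result["error_code"], result["message"])

    # save_to fetch: the response only confirms the save; the content itself
    # must be read back from disk afterwards (e.g. via read_file or grep_content).
    raw = await tool.execute(url="https://example.com", save_to="/tmp/example.md")
    print(json.loads(raw)["metadata"].get("saved_to"))


asyncio.run(main())

Note that execute() never raises to the caller: both WebFetchError and unexpected exceptions are serialized into the returned JSON, so callers distinguish success from failure solely via the ok field.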