abstractcore 2.4.5__py3-none-any.whl → 2.4.7__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
@@ -11,17 +11,29 @@ import os
  import subprocess
  import requests
  from pathlib import Path
- from typing import Optional
+ from typing import Optional, Dict, Any, Union
  import logging
  import platform
  import re
  import time
+ import json
+ import base64
+ from datetime import datetime
+ from urllib.parse import urlparse, urljoin
+ import mimetypes

  try:
      from bs4 import BeautifulSoup
      BS4_AVAILABLE = True
+     # Try to use lxml parser for better performance
+     try:
+         import lxml
+         BS4_PARSER = 'lxml'
+     except ImportError:
+         BS4_PARSER = 'html.parser'
  except ImportError:
      BS4_AVAILABLE = False
+     BS4_PARSER = None

  try:
      import psutil
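Note on the parser selection above: lxml is optional, and when it is not installed the code falls back to the stdlib 'html.parser'. A minimal sketch of how the new BS4_PARSER constant is meant to be consumed (it mirrors the _parse_html_content change later in this diff; the sample markup is illustrative only):

    # Sketch only: BeautifulSoup(markup, features) accepts the parser name
    # chosen above ('lxml' when available, otherwise 'html.parser').
    if BS4_AVAILABLE:
        soup = BeautifulSoup("<html><head><title>Hi</title></head></html>", BS4_PARSER)
        print(soup.title.get_text())  # -> Hi
    else:
        print("bs4 not installed; the tool falls back to regex-based parsing")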
@@ -995,6 +1007,643 @@ def web_search(query: str, num_results: int = 5, safe_search: str = "moderate",
          return f"Error searching internet: {str(e)}"


+ @tool(
+     description="Fetch and intelligently parse content from URLs with automatic content type detection and metadata extraction",
+     tags=["web", "fetch", "url", "http", "content", "parse", "scraping"],
+     when_to_use="When you need to retrieve and analyze content from specific URLs, including web pages, APIs, documents, or media files",
+     examples=[
+         {
+             "description": "Fetch and parse HTML webpage",
+             "arguments": {
+                 "url": "https://example.com/article.html"
+             }
+         },
+         {
+             "description": "Fetch JSON API response",
+             "arguments": {
+                 "url": "https://api.github.com/repos/python/cpython",
+                 "headers": {"Accept": "application/json"}
+             }
+         },
+         {
+             "description": "POST data to API endpoint",
+             "arguments": {
+                 "url": "https://httpbin.org/post",
+                 "method": "POST",
+                 "data": {"key": "value", "test": "data"}
+             }
+         },
+         {
+             "description": "Fetch binary content with metadata",
+             "arguments": {
+                 "url": "https://example.com/document.pdf",
+                 "include_binary_preview": True
+             }
+         }
+     ]
+ )
+ def fetch_url(
+     url: str,
+     method: str = "GET",
+     headers: Optional[Dict[str, str]] = None,
+     data: Optional[Union[Dict[str, Any], str]] = None,
+     timeout: int = 30,
+     max_content_length: int = 10485760,  # 10MB default
+     follow_redirects: bool = True,
+     include_binary_preview: bool = False,
+     extract_links: bool = True,
+     user_agent: str = "AbstractCore-FetchTool/1.0"
+ ) -> str:
+     """
+     Fetch and intelligently parse content from URLs with comprehensive content type detection.
+
+     This tool automatically detects content types (HTML, JSON, XML, images, etc.) and provides
+     appropriate parsing with metadata extraction including timestamps and response headers.
+
+     Args:
+         url: The URL to fetch content from
+         method: HTTP method to use (default: "GET")
+         headers: Optional custom headers to send with the request
+         data: Optional data to send with POST/PUT requests (dict or string)
+         timeout: Request timeout in seconds (default: 30)
+         max_content_length: Maximum content length to fetch in bytes (default: 10MB)
+         follow_redirects: Whether to follow HTTP redirects (default: True)
+         include_binary_preview: Whether to include base64 preview for binary content (default: False)
+         extract_links: Whether to extract links from HTML content (default: True)
+         user_agent: User-Agent header to use (default: "AbstractCore-FetchTool/1.0")
+
+     Returns:
+         Formatted string with parsed content, metadata, and analysis or error message
+
+     Examples:
+         fetch_url("https://api.github.com/repos/python/cpython")  # Fetch and parse JSON API
+         fetch_url("https://example.com", headers={"Accept": "text/html"})  # Fetch HTML with custom headers
+         fetch_url("https://httpbin.org/post", method="POST", data={"test": "value"})  # POST request
+         fetch_url("https://example.com/image.jpg", include_binary_preview=True)  # Fetch image with preview
+     """
+     try:
+         # Validate URL
+         parsed_url = urlparse(url)
+         if not parsed_url.scheme or not parsed_url.netloc:
+             return f"❌ Invalid URL format: {url}"
+
+         if parsed_url.scheme not in ['http', 'https']:
+             return f"❌ Unsupported URL scheme: {parsed_url.scheme}. Only HTTP and HTTPS are supported."
+
+         # Prepare request headers
+         request_headers = {
+             'User-Agent': user_agent,
+             'Accept': '*/*',
+             'Accept-Encoding': 'gzip, deflate',
+             'Connection': 'keep-alive'
+         }
+
+         if headers:
+             request_headers.update(headers)
+
+         # Prepare request parameters
+         request_params = {
+             'url': url,
+             'method': method.upper(),
+             'headers': request_headers,
+             'timeout': timeout,
+             'allow_redirects': follow_redirects,
+             'stream': True  # Stream to check content length
+         }
+
+         # Add data for POST/PUT requests
+         if data and method.upper() in ['POST', 'PUT', 'PATCH']:
+             if isinstance(data, dict):
+                 # Try JSON first, fallback to form data
+                 if request_headers.get('Content-Type', '').startswith('application/json'):
+                     request_params['json'] = data
+                 else:
+                     request_params['data'] = data
+             else:
+                 request_params['data'] = data
+
+         # Record fetch timestamp
+         fetch_timestamp = datetime.now().isoformat()
+
+         # Make the request with session for connection reuse
+         with requests.Session() as session:
+             session.headers.update(request_headers)
+             response = session.request(
+                 method=method.upper(),
+                 url=url,
+                 timeout=timeout,
+                 allow_redirects=follow_redirects,
+                 stream=True,
+                 json=request_params.get('json'),
+                 data=request_params.get('data')
+             )
+
+             # Check response status
+             if not response.ok:
+                 return f"❌ HTTP Error {response.status_code}: {response.reason}\n" \
+                        f"URL: {url}\n" \
+                        f"Timestamp: {fetch_timestamp}\n" \
+                        f"Response headers: {dict(response.headers)}"
+
+             # Get content info
+             content_type = response.headers.get('content-type', '').lower()
+             content_length = response.headers.get('content-length')
+             if content_length:
+                 content_length = int(content_length)
+
+             # Check content length before downloading
+             if content_length and content_length > max_content_length:
+                 return f"⚠️ Content too large: {content_length:,} bytes (max: {max_content_length:,})\n" \
+                        f"URL: {url}\n" \
+                        f"Content-Type: {content_type}\n" \
+                        f"Timestamp: {fetch_timestamp}\n" \
+                        f"Use max_content_length parameter to increase limit if needed"
+
+             # Download content with optimized chunking
+             content_chunks = []
+             downloaded_size = 0
+
+             # Use larger chunks for better performance
+             chunk_size = 32768 if 'image/' in content_type or 'video/' in content_type else 16384
+
+             for chunk in response.iter_content(chunk_size=chunk_size):
+                 if chunk:
+                     downloaded_size += len(chunk)
+                     if downloaded_size > max_content_length:
+                         return f"⚠️ Content exceeded size limit during download: {downloaded_size:,} bytes (max: {max_content_length:,})\n" \
+                                f"URL: {url}\n" \
+                                f"Content-Type: {content_type}\n" \
+                                f"Timestamp: {fetch_timestamp}"
+                     content_chunks.append(chunk)
+
+             content_bytes = b''.join(content_chunks)
+             actual_size = len(content_bytes)
+
+             # Detect content type and parse accordingly
+             parsed_content = _parse_content_by_type(content_bytes, content_type, url, extract_links, include_binary_preview)
+
+             # Build comprehensive response
+             result_parts = []
+             result_parts.append(f"🌐 URL Fetch Results")
+             result_parts.append(f"📍 URL: {response.url}")  # Final URL after redirects
+             if response.url != url:
+                 result_parts.append(f"🔄 Original URL: {url}")
+             result_parts.append(f"⏰ Timestamp: {fetch_timestamp}")
+             result_parts.append(f"✅ Status: {response.status_code} {response.reason}")
+             result_parts.append(f"📊 Content-Type: {content_type}")
+             result_parts.append(f"📏 Size: {actual_size:,} bytes")
+
+             # Add important response headers
+             important_headers = ['server', 'last-modified', 'etag', 'cache-control', 'expires', 'location']
+             response_metadata = []
+             for header in important_headers:
+                 value = response.headers.get(header)
+                 if value:
+                     response_metadata.append(f" {header.title()}: {value}")
+
+             if response_metadata:
+                 result_parts.append(f"📋 Response Headers:")
+                 result_parts.extend(response_metadata)
+
+             # Add parsed content
+             result_parts.append(f"\n📄 Content Analysis:")
+             result_parts.append(parsed_content)
+
+             return "\n".join(result_parts)
+
+     except requests.exceptions.Timeout:
+         return f"⏰ Request timeout after {timeout} seconds\n" \
+                f"URL: {url}\n" \
+                f"Consider increasing timeout parameter"
+
+     except requests.exceptions.ConnectionError as e:
+         return f"🔌 Connection error: {str(e)}\n" \
+                f"URL: {url}\n" \
+                f"Check network connectivity and URL validity"
+
+     except requests.exceptions.TooManyRedirects:
+         return f"🔄 Too many redirects\n" \
+                f"URL: {url}\n" \
+                f"Try setting follow_redirects=False to see redirect chain"
+
+     except requests.exceptions.RequestException as e:
+         return f"❌ Request error: {str(e)}\n" \
+                f"URL: {url}"
+
+     except Exception as e:
+         return f"❌ Unexpected error fetching URL: {str(e)}\n" \
+                f"URL: {url}"
+
+
+ def _parse_content_by_type(content_bytes: bytes, content_type: str, url: str, extract_links: bool = True, include_binary_preview: bool = False) -> str:
+     """
+     Parse content based on detected content type with intelligent fallbacks.
+
+     This function provides robust content type detection and parsing for various formats
+     including HTML, JSON, XML, plain text, images, and other binary formats.
+     """
+     try:
+         # Normalize content type
+         main_type = content_type.split(';')[0].strip().lower()
+
+         # Try to decode as text first for text-based formats
+         text_content = None
+         encoding = 'utf-8'
+
+         # Detect encoding from content-type header
+         if 'charset=' in content_type:
+             try:
+                 encoding = content_type.split('charset=')[1].split(';')[0].strip()
+             except:
+                 encoding = 'utf-8'
+
+         # Attempt text decoding for text-based content types with better encoding detection
+         text_based_types = [
+             'text/', 'application/json', 'application/xml', 'application/javascript',
+             'application/rss+xml', 'application/atom+xml', 'application/xhtml+xml'
+         ]
+
+         is_text_based = any(main_type.startswith(t) for t in text_based_types)
+
+         if is_text_based:
+             # Try multiple encoding strategies
+             for enc in [encoding, 'utf-8', 'iso-8859-1', 'windows-1252']:
+                 try:
+                     text_content = content_bytes.decode(enc)
+                     break
+                 except (UnicodeDecodeError, LookupError):
+                     continue
+             else:
+                 # Final fallback with error replacement
+                 text_content = content_bytes.decode('utf-8', errors='replace')
+
+         # Parse based on content type
+         if main_type.startswith('text/html') or main_type.startswith('application/xhtml'):
+             return _parse_html_content(text_content, url, extract_links)
+
+         elif main_type == 'application/json':
+             return _parse_json_content(text_content)
+
+         elif main_type in ['application/xml', 'text/xml', 'application/rss+xml', 'application/atom+xml']:
+             return _parse_xml_content(text_content)
+
+         elif main_type.startswith('text/'):
+             return _parse_text_content(text_content, main_type)
+
+         elif main_type.startswith('image/'):
+             return _parse_image_content(content_bytes, main_type, include_binary_preview)
+
+         elif main_type == 'application/pdf':
+             return _parse_pdf_content(content_bytes, include_binary_preview)
+
+         else:
+             return _parse_binary_content(content_bytes, main_type, include_binary_preview)
+
+     except Exception as e:
+         return f"❌ Error parsing content: {str(e)}\n" \
+                f"Content-Type: {content_type}\n" \
+                f"Content size: {len(content_bytes):,} bytes"
+
+
+ def _parse_html_content(html_content: str, url: str, extract_links: bool = True) -> str:
+     """Parse HTML content and extract meaningful information."""
+     if not html_content:
+         return "❌ No HTML content to parse"
+
+     result_parts = []
+     result_parts.append("🌐 HTML Document Analysis")
+
+     # Use BeautifulSoup if available for better parsing
+     if BS4_AVAILABLE:
+         try:
+             soup = BeautifulSoup(html_content, BS4_PARSER)
+
+             # Extract title
+             title = soup.find('title')
+             if title:
+                 result_parts.append(f"📰 Title: {title.get_text().strip()}")
+
+             # Extract meta description
+             meta_desc = soup.find('meta', attrs={'name': 'description'})
+             if meta_desc and meta_desc.get('content'):
+                 result_parts.append(f"📝 Description: {meta_desc['content'][:200]}...")
+
+             # Extract headings
+             headings = []
+             for i in range(1, 7):
+                 h_tags = soup.find_all(f'h{i}')
+                 for h in h_tags[:5]:  # Limit to first 5 of each level
+                     headings.append(f"H{i}: {h.get_text().strip()[:100]}")
+
+             if headings:
+                 result_parts.append(f"📋 Headings (first 5 per level):")
+                 for heading in headings[:10]:  # Limit total headings
+                     result_parts.append(f" • {heading}")
+
+             # Extract links if requested
+             if extract_links:
+                 links = []
+                 for a in soup.find_all('a', href=True)[:20]:  # Limit to first 20 links
+                     href = a['href']
+                     text = a.get_text().strip()[:50]
+                     # Convert relative URLs to absolute
+                     if href.startswith('/'):
+                         href = urljoin(url, href)
+                     elif not href.startswith(('http://', 'https://')):
+                         href = urljoin(url, href)
+                     links.append(f"{text} → {href}")
+
+                 if links:
+                     result_parts.append(f"🔗 Links (first 20):")
+                     for link in links:
+                         result_parts.append(f" • {link}")
+
+             # Extract main text content with better cleaning
+             # Remove script, style, nav, footer, header elements for cleaner content
+             for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
+                 element.decompose()
+
+             # Try to find main content area first
+             main_content = soup.find(['main', 'article']) or soup.find('div', class_=lambda x: x and any(word in x.lower() for word in ['content', 'article', 'post', 'main']))
+             content_soup = main_content if main_content else soup
+
+             text = content_soup.get_text()
+             # Clean up text more efficiently
+             lines = (line.strip() for line in text.splitlines() if line.strip())
+             text = ' '.join(lines)
+             # Remove excessive whitespace
+             text = ' '.join(text.split())
+
+             if text:
+                 preview_length = 500
+                 text_preview = text[:preview_length]
+                 if len(text) > preview_length:
+                     text_preview += "..."
+                 result_parts.append(f"📄 Text Content Preview:")
+                 result_parts.append(f"{text_preview}")
+                 result_parts.append(f"📊 Total text length: {len(text):,} characters")
+
+         except Exception as e:
+             result_parts.append(f"⚠️ BeautifulSoup parsing error: {str(e)}")
+             result_parts.append(f"📄 Raw HTML Preview (first 1000 chars):")
+             result_parts.append(html_content[:1000] + ("..." if len(html_content) > 1000 else ""))
+
+     else:
+         # Fallback parsing without BeautifulSoup
+         result_parts.append("⚠️ BeautifulSoup not available - using basic parsing")
+
+         # Extract title with regex
+         import re
+         title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
+         if title_match:
+             result_parts.append(f"📰 Title: {title_match.group(1).strip()}")
+
+         # Show HTML preview
+         result_parts.append(f"📄 HTML Preview (first 1000 chars):")
+         result_parts.append(html_content[:1000] + ("..." if len(html_content) > 1000 else ""))
+
+     return "\n".join(result_parts)
+
+
+ def _parse_json_content(json_content: str) -> str:
+     """Parse JSON content and provide structured analysis."""
+     if not json_content:
+         return "❌ No JSON content to parse"
+
+     result_parts = []
+     result_parts.append("📊 JSON Data Analysis")
+
+     try:
+         data = json.loads(json_content)
+
+         # Analyze JSON structure
+         result_parts.append(f"📋 Structure: {type(data).__name__}")
+
+         if isinstance(data, dict):
+             result_parts.append(f"🔑 Keys ({len(data)}): {', '.join(list(data.keys())[:10])}")
+             if len(data) > 10:
+                 result_parts.append(f" ... and {len(data) - 10} more keys")
+         elif isinstance(data, list):
+             result_parts.append(f"📝 Array length: {len(data)}")
+             if data and isinstance(data[0], dict):
+                 result_parts.append(f"🔑 First item keys: {', '.join(list(data[0].keys())[:10])}")
+
+         # Pretty print JSON with smart truncation
+         json_str = json.dumps(data, indent=2, ensure_ascii=False, separators=(',', ': '))
+         preview_length = 1500  # Reduced for better readability
+         if len(json_str) > preview_length:
+             # Try to truncate at a logical point (end of object/array)
+             truncate_pos = json_str.rfind('\n', 0, preview_length)
+             if truncate_pos > preview_length - 200:  # If close to limit, use it
+                 json_preview = json_str[:truncate_pos] + "\n... (truncated)"
+             else:
+                 json_preview = json_str[:preview_length] + "\n... (truncated)"
+         else:
+             json_preview = json_str
+
+         result_parts.append(f"📄 JSON Content:")
+         result_parts.append(json_preview)
+         result_parts.append(f"📊 Total size: {len(json_content):,} characters")
+
+     except json.JSONDecodeError as e:
+         result_parts.append(f"❌ JSON parsing error: {str(e)}")
+         result_parts.append(f"📄 Raw content preview (first 1000 chars):")
+         result_parts.append(json_content[:1000] + ("..." if len(json_content) > 1000 else ""))
+
+     return "\n".join(result_parts)
+
+
+ def _parse_xml_content(xml_content: str) -> str:
+     """Parse XML content including RSS/Atom feeds."""
+     if not xml_content:
+         return "❌ No XML content to parse"
+
+     result_parts = []
+     result_parts.append("📄 XML/RSS/Atom Analysis")
+
+     try:
+         # Try to detect if it's RSS/Atom
+         if '<rss' in xml_content.lower() or '<feed' in xml_content.lower():
+             result_parts.append("📡 Detected: RSS/Atom Feed")
+
+         # Basic XML structure analysis
+         import re
+
+         # Find root element
+         root_match = re.search(r'<([^?\s/>]+)', xml_content)
+         if root_match:
+             result_parts.append(f"🏷️ Root element: <{root_match.group(1)}>")
+
+         # Count elements (basic)
+         elements = re.findall(r'<([^/\s>]+)', xml_content)
+         if elements:
+             from collections import Counter
+             element_counts = Counter(elements[:50])  # Limit analysis
+             result_parts.append(f"📊 Top elements: {dict(list(element_counts.most_common(10)))}")
+
+         # Show XML preview
+         preview_length = 1500
+         xml_preview = xml_content[:preview_length]
+         if len(xml_content) > preview_length:
+             xml_preview += "\n... (truncated)"
+
+         result_parts.append(f"📄 XML Content Preview:")
+         result_parts.append(xml_preview)
+         result_parts.append(f"📊 Total size: {len(xml_content):,} characters")
+
+     except Exception as e:
+         result_parts.append(f"❌ XML parsing error: {str(e)}")
+         result_parts.append(f"📄 Raw content preview (first 1000 chars):")
+         result_parts.append(xml_content[:1000] + ("..." if len(xml_content) > 1000 else ""))
+
+     return "\n".join(result_parts)
+
+
+ def _parse_text_content(text_content: str, content_type: str) -> str:
+     """Parse plain text content."""
+     if not text_content:
+         return "❌ No text content to parse"
+
+     result_parts = []
+     result_parts.append(f"📝 Text Content Analysis ({content_type})")
+
+     # Basic text statistics
+     lines = text_content.splitlines()
+     words = text_content.split()
+
+     result_parts.append(f"📊 Statistics:")
+     result_parts.append(f" • Lines: {len(lines):,}")
+     result_parts.append(f" • Words: {len(words):,}")
+     result_parts.append(f" • Characters: {len(text_content):,}")
+
+     # Show text preview
+     preview_length = 2000
+     text_preview = text_content[:preview_length]
+     if len(text_content) > preview_length:
+         text_preview += "\n... (truncated)"
+
+     result_parts.append(f"📄 Content Preview:")
+     result_parts.append(text_preview)
+
+     return "\n".join(result_parts)
+
+
+ def _parse_image_content(image_bytes: bytes, content_type: str, include_preview: bool = False) -> str:
+     """Parse image content and extract metadata."""
+     result_parts = []
+     result_parts.append(f"🖼️ Image Analysis ({content_type})")
+
+     result_parts.append(f"📊 Size: {len(image_bytes):,} bytes")
+
+     # Try to get image dimensions (basic approach)
+     try:
+         if content_type.startswith('image/jpeg') or content_type.startswith('image/jpg'):
+             # Basic JPEG header parsing for dimensions
+             if image_bytes.startswith(b'\xff\xd8\xff'):
+                 result_parts.append("✅ Valid JPEG format detected")
+         elif content_type.startswith('image/png'):
+             # Basic PNG header parsing
+             if image_bytes.startswith(b'\x89PNG\r\n\x1a\n'):
+                 result_parts.append("✅ Valid PNG format detected")
+         elif content_type.startswith('image/gif'):
+             if image_bytes.startswith(b'GIF87a') or image_bytes.startswith(b'GIF89a'):
+                 result_parts.append("✅ Valid GIF format detected")
+     except Exception:
+         pass
+
+     if include_preview:
+         # Provide base64 preview for small images
+         if len(image_bytes) <= 1048576:  # 1MB limit for preview
+             b64_preview = base64.b64encode(image_bytes[:1024]).decode('ascii')  # First 1KB
+             result_parts.append(f"🔍 Base64 Preview (first 1KB):")
+             result_parts.append(f"{b64_preview}...")
+         else:
+             result_parts.append("⚠️ Image too large for base64 preview")
+
+     result_parts.append("💡 Use image processing tools for detailed analysis")
+
+     return "\n".join(result_parts)
+
+
+ def _parse_pdf_content(pdf_bytes: bytes, include_preview: bool = False) -> str:
+     """Parse PDF content and extract basic metadata."""
+     result_parts = []
+     result_parts.append("📄 PDF Document Analysis")
+
+     result_parts.append(f"📊 Size: {len(pdf_bytes):,} bytes")
+
+     # Check PDF header
+     if pdf_bytes.startswith(b'%PDF-'):
+         try:
+             version_line = pdf_bytes[:20].decode('ascii', errors='ignore')
+             result_parts.append(f"✅ Valid PDF format: {version_line.strip()}")
+         except:
+             result_parts.append("✅ Valid PDF format detected")
+     else:
+         result_parts.append("⚠️ Invalid PDF format - missing PDF header")
+
+     if include_preview:
+         # Show hex preview of first few bytes
+         hex_preview = ' '.join(f'{b:02x}' for b in pdf_bytes[:64])
+         result_parts.append(f"🔍 Hex Preview (first 64 bytes):")
+         result_parts.append(hex_preview)
+
+     result_parts.append("💡 Use PDF processing tools for text extraction and detailed analysis")
+
+     return "\n".join(result_parts)
+
+
+ def _parse_binary_content(binary_bytes: bytes, content_type: str, include_preview: bool = False) -> str:
+     """Parse generic binary content."""
+     result_parts = []
+     result_parts.append(f"📦 Binary Content Analysis ({content_type})")
+
+     result_parts.append(f"📊 Size: {len(binary_bytes):,} bytes")
+
+     # Detect file type by magic bytes
+     magic_signatures = {
+         b'\x50\x4b\x03\x04': 'ZIP archive',
+         b'\x50\x4b\x05\x06': 'ZIP archive (empty)',
+         b'\x50\x4b\x07\x08': 'ZIP archive (spanned)',
+         b'\x1f\x8b\x08': 'GZIP compressed',
+         b'\x42\x5a\x68': 'BZIP2 compressed',
+         b'\x37\x7a\xbc\xaf\x27\x1c': '7-Zip archive',
+         b'\x52\x61\x72\x21\x1a\x07': 'RAR archive',
+         b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a': 'PNG image',
+         b'\xff\xd8\xff': 'JPEG image',
+         b'\x47\x49\x46\x38': 'GIF image',
+         b'\x25\x50\x44\x46': 'PDF document',
+         b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': 'Microsoft Office document',
+         b'\x4d\x5a': 'Windows executable'
+     }
+
+     detected_type = None
+     for signature, file_type in magic_signatures.items():
+         if binary_bytes.startswith(signature):
+             detected_type = file_type
+             break
+
+     if detected_type:
+         result_parts.append(f"🔍 Detected format: {detected_type}")
+
+     if include_preview:
+         # Show hex preview
+         hex_preview = ' '.join(f'{b:02x}' for b in binary_bytes[:64])
+         result_parts.append(f"🔍 Hex Preview (first 64 bytes):")
+         result_parts.append(hex_preview)
+
+         # Try to show any readable ASCII strings
+         try:
+             ascii_preview = ''.join(chr(b) if 32 <= b <= 126 else '.' for b in binary_bytes[:200])
+             if ascii_preview.strip():
+                 result_parts.append(f"📝 ASCII Preview (first 200 bytes):")
+                 result_parts.append(ascii_preview)
+         except:
+             pass
+
+     result_parts.append("💡 Use specialized tools for detailed binary analysis")
+
+     return "\n".join(result_parts)


  @tool(
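The hunk above is the bulk of this release: a new fetch_url tool plus private helpers for HTML, JSON, XML, text, image, PDF, and generic binary responses. A minimal usage sketch of the new tool (the import path below is an assumption; this diff does not show the module's location inside the package):

    # Hypothetical usage of the newly added tool; adjust the import to the
    # actual module path in your installation.
    from abstractcore.tools.common_tools import fetch_url  # assumed path

    # Plain GET of a JSON API; the tool returns a formatted report string.
    print(fetch_url("https://api.github.com/repos/python/cpython",
                    headers={"Accept": "application/json"}))

    # POST form-style data with a tighter download limit.
    print(fetch_url("https://httpbin.org/post", method="POST",
                    data={"test": "value"}, max_content_length=1_000_000))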
@@ -1524,5 +2173,6 @@ __all__ = [
      'write_file',
      'edit_file',
      'web_search',
+     'fetch_url',
      'execute_command'
  ]
@@ -11,4 +11,4 @@ including when the package is installed from PyPI where pyproject.toml is not av

  # Package version - update this when releasing new versions
  # This must be manually synchronized with the version in pyproject.toml
- __version__ = "2.4.5"
+ __version__ = "2.4.7"