abstractcore-2.4.5-py3-none-any.whl → abstractcore-2.4.7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstractcore/__init__.py +5 -1
- abstractcore/assets/session_schema.json +1 -1
- abstractcore/core/interface.py +7 -0
- abstractcore/core/session.py +28 -3
- abstractcore/core/types.py +25 -1
- abstractcore/providers/anthropic_provider.py +20 -2
- abstractcore/providers/base.py +24 -0
- abstractcore/providers/huggingface_provider.py +44 -18
- abstractcore/providers/lmstudio_provider.py +17 -4
- abstractcore/providers/mlx_provider.py +36 -14
- abstractcore/providers/mock_provider.py +17 -7
- abstractcore/providers/ollama_provider.py +16 -4
- abstractcore/providers/openai_provider.py +18 -5
- abstractcore/tools/common_tools.py +651 -1
- abstractcore/utils/version.py +1 -1
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/METADATA +108 -12
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/RECORD +21 -21
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/WHEEL +0 -0
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/entry_points.txt +0 -0
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/licenses/LICENSE +0 -0
- {abstractcore-2.4.5.dist-info → abstractcore-2.4.7.dist-info}/top_level.txt +0 -0
abstractcore/tools/common_tools.py
CHANGED

@@ -11,17 +11,29 @@ import os
 import subprocess
 import requests
 from pathlib import Path
-from typing import Optional
+from typing import Optional, Dict, Any, Union
 import logging
 import platform
 import re
 import time
+import json
+import base64
+from datetime import datetime
+from urllib.parse import urlparse, urljoin
+import mimetypes

 try:
     from bs4 import BeautifulSoup
     BS4_AVAILABLE = True
+    # Try to use lxml parser for better performance
+    try:
+        import lxml
+        BS4_PARSER = 'lxml'
+    except ImportError:
+        BS4_PARSER = 'html.parser'
 except ImportError:
     BS4_AVAILABLE = False
+    BS4_PARSER = None

 try:
     import psutil
@@ -995,6 +1007,643 @@ def web_search(query: str, num_results: int = 5, safe_search: str = "moderate",
         return f"Error searching internet: {str(e)}"


+@tool(
+    description="Fetch and intelligently parse content from URLs with automatic content type detection and metadata extraction",
+    tags=["web", "fetch", "url", "http", "content", "parse", "scraping"],
+    when_to_use="When you need to retrieve and analyze content from specific URLs, including web pages, APIs, documents, or media files",
+    examples=[
+        {
+            "description": "Fetch and parse HTML webpage",
+            "arguments": {
+                "url": "https://example.com/article.html"
+            }
+        },
+        {
+            "description": "Fetch JSON API response",
+            "arguments": {
+                "url": "https://api.github.com/repos/python/cpython",
+                "headers": {"Accept": "application/json"}
+            }
+        },
+        {
+            "description": "POST data to API endpoint",
+            "arguments": {
+                "url": "https://httpbin.org/post",
+                "method": "POST",
+                "data": {"key": "value", "test": "data"}
+            }
+        },
+        {
+            "description": "Fetch binary content with metadata",
+            "arguments": {
+                "url": "https://example.com/document.pdf",
+                "include_binary_preview": True
+            }
+        }
+    ]
+)
+def fetch_url(
+    url: str,
+    method: str = "GET",
+    headers: Optional[Dict[str, str]] = None,
+    data: Optional[Union[Dict[str, Any], str]] = None,
+    timeout: int = 30,
+    max_content_length: int = 10485760,  # 10MB default
+    follow_redirects: bool = True,
+    include_binary_preview: bool = False,
+    extract_links: bool = True,
+    user_agent: str = "AbstractCore-FetchTool/1.0"
+) -> str:
+    """
+    Fetch and intelligently parse content from URLs with comprehensive content type detection.
+
+    This tool automatically detects content types (HTML, JSON, XML, images, etc.) and provides
+    appropriate parsing with metadata extraction including timestamps and response headers.
+
+    Args:
+        url: The URL to fetch content from
+        method: HTTP method to use (default: "GET")
+        headers: Optional custom headers to send with the request
+        data: Optional data to send with POST/PUT requests (dict or string)
+        timeout: Request timeout in seconds (default: 30)
+        max_content_length: Maximum content length to fetch in bytes (default: 10MB)
+        follow_redirects: Whether to follow HTTP redirects (default: True)
+        include_binary_preview: Whether to include base64 preview for binary content (default: False)
+        extract_links: Whether to extract links from HTML content (default: True)
+        user_agent: User-Agent header to use (default: "AbstractCore-FetchTool/1.0")
+
+    Returns:
+        Formatted string with parsed content, metadata, and analysis or error message
+
+    Examples:
+        fetch_url("https://api.github.com/repos/python/cpython")  # Fetch and parse JSON API
+        fetch_url("https://example.com", headers={"Accept": "text/html"})  # Fetch HTML with custom headers
+        fetch_url("https://httpbin.org/post", method="POST", data={"test": "value"})  # POST request
+        fetch_url("https://example.com/image.jpg", include_binary_preview=True)  # Fetch image with preview
+    """
+    try:
+        # Validate URL
+        parsed_url = urlparse(url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            return f"❌ Invalid URL format: {url}"
+
+        if parsed_url.scheme not in ['http', 'https']:
+            return f"❌ Unsupported URL scheme: {parsed_url.scheme}. Only HTTP and HTTPS are supported."
+
+        # Prepare request headers
+        request_headers = {
+            'User-Agent': user_agent,
+            'Accept': '*/*',
+            'Accept-Encoding': 'gzip, deflate',
+            'Connection': 'keep-alive'
+        }
+
+        if headers:
+            request_headers.update(headers)
+
+        # Prepare request parameters
+        request_params = {
+            'url': url,
+            'method': method.upper(),
+            'headers': request_headers,
+            'timeout': timeout,
+            'allow_redirects': follow_redirects,
+            'stream': True  # Stream to check content length
+        }
+
+        # Add data for POST/PUT requests
+        if data and method.upper() in ['POST', 'PUT', 'PATCH']:
+            if isinstance(data, dict):
+                # Try JSON first, fallback to form data
+                if request_headers.get('Content-Type', '').startswith('application/json'):
+                    request_params['json'] = data
+                else:
+                    request_params['data'] = data
+            else:
+                request_params['data'] = data
+
+        # Record fetch timestamp
+        fetch_timestamp = datetime.now().isoformat()
+
+        # Make the request with session for connection reuse
+        with requests.Session() as session:
+            session.headers.update(request_headers)
+            response = session.request(
+                method=method.upper(),
+                url=url,
+                timeout=timeout,
+                allow_redirects=follow_redirects,
+                stream=True,
+                json=request_params.get('json'),
+                data=request_params.get('data')
+            )
+
+            # Check response status
+            if not response.ok:
+                return f"❌ HTTP Error {response.status_code}: {response.reason}\n" \
+                       f"URL: {url}\n" \
+                       f"Timestamp: {fetch_timestamp}\n" \
+                       f"Response headers: {dict(response.headers)}"
+
+            # Get content info
+            content_type = response.headers.get('content-type', '').lower()
+            content_length = response.headers.get('content-length')
+            if content_length:
+                content_length = int(content_length)
+
+            # Check content length before downloading
+            if content_length and content_length > max_content_length:
+                return f"⚠️ Content too large: {content_length:,} bytes (max: {max_content_length:,})\n" \
+                       f"URL: {url}\n" \
+                       f"Content-Type: {content_type}\n" \
+                       f"Timestamp: {fetch_timestamp}\n" \
+                       f"Use max_content_length parameter to increase limit if needed"
+
+            # Download content with optimized chunking
+            content_chunks = []
+            downloaded_size = 0
+
+            # Use larger chunks for better performance
+            chunk_size = 32768 if 'image/' in content_type or 'video/' in content_type else 16384
+
+            for chunk in response.iter_content(chunk_size=chunk_size):
+                if chunk:
+                    downloaded_size += len(chunk)
+                    if downloaded_size > max_content_length:
+                        return f"⚠️ Content exceeded size limit during download: {downloaded_size:,} bytes (max: {max_content_length:,})\n" \
+                               f"URL: {url}\n" \
+                               f"Content-Type: {content_type}\n" \
+                               f"Timestamp: {fetch_timestamp}"
+                    content_chunks.append(chunk)
+
+            content_bytes = b''.join(content_chunks)
+            actual_size = len(content_bytes)
+
+            # Detect content type and parse accordingly
+            parsed_content = _parse_content_by_type(content_bytes, content_type, url, extract_links, include_binary_preview)
+
+            # Build comprehensive response
+            result_parts = []
+            result_parts.append(f"🌐 URL Fetch Results")
+            result_parts.append(f"📍 URL: {response.url}")  # Final URL after redirects
+            if response.url != url:
+                result_parts.append(f"🔄 Original URL: {url}")
+            result_parts.append(f"⏰ Timestamp: {fetch_timestamp}")
+            result_parts.append(f"✅ Status: {response.status_code} {response.reason}")
+            result_parts.append(f"📊 Content-Type: {content_type}")
+            result_parts.append(f"📏 Size: {actual_size:,} bytes")
+
+            # Add important response headers
+            important_headers = ['server', 'last-modified', 'etag', 'cache-control', 'expires', 'location']
+            response_metadata = []
+            for header in important_headers:
+                value = response.headers.get(header)
+                if value:
+                    response_metadata.append(f"  {header.title()}: {value}")
+
+            if response_metadata:
+                result_parts.append(f"📋 Response Headers:")
+                result_parts.extend(response_metadata)
+
+            # Add parsed content
+            result_parts.append(f"\n📄 Content Analysis:")
+            result_parts.append(parsed_content)
+
+            return "\n".join(result_parts)
+
+    except requests.exceptions.Timeout:
+        return f"⏰ Request timeout after {timeout} seconds\n" \
+               f"URL: {url}\n" \
+               f"Consider increasing timeout parameter"
+
+    except requests.exceptions.ConnectionError as e:
+        return f"🔌 Connection error: {str(e)}\n" \
+               f"URL: {url}\n" \
+               f"Check network connectivity and URL validity"
+
+    except requests.exceptions.TooManyRedirects:
+        return f"🔄 Too many redirects\n" \
+               f"URL: {url}\n" \
+               f"Try setting follow_redirects=False to see redirect chain"
+
+    except requests.exceptions.RequestException as e:
+        return f"❌ Request error: {str(e)}\n" \
+               f"URL: {url}"
+
+    except Exception as e:
+        return f"❌ Unexpected error fetching URL: {str(e)}\n" \
+               f"URL: {url}"
+
+
+def _parse_content_by_type(content_bytes: bytes, content_type: str, url: str, extract_links: bool = True, include_binary_preview: bool = False) -> str:
+    """
+    Parse content based on detected content type with intelligent fallbacks.
+
+    This function provides robust content type detection and parsing for various formats
+    including HTML, JSON, XML, plain text, images, and other binary formats.
+    """
+    try:
+        # Normalize content type
+        main_type = content_type.split(';')[0].strip().lower()
+
+        # Try to decode as text first for text-based formats
+        text_content = None
+        encoding = 'utf-8'
+
+        # Detect encoding from content-type header
+        if 'charset=' in content_type:
+            try:
+                encoding = content_type.split('charset=')[1].split(';')[0].strip()
+            except:
+                encoding = 'utf-8'
+
+        # Attempt text decoding for text-based content types with better encoding detection
+        text_based_types = [
+            'text/', 'application/json', 'application/xml', 'application/javascript',
+            'application/rss+xml', 'application/atom+xml', 'application/xhtml+xml'
+        ]
+
+        is_text_based = any(main_type.startswith(t) for t in text_based_types)
+
+        if is_text_based:
+            # Try multiple encoding strategies
+            for enc in [encoding, 'utf-8', 'iso-8859-1', 'windows-1252']:
+                try:
+                    text_content = content_bytes.decode(enc)
+                    break
+                except (UnicodeDecodeError, LookupError):
+                    continue
+            else:
+                # Final fallback with error replacement
+                text_content = content_bytes.decode('utf-8', errors='replace')
+
+        # Parse based on content type
+        if main_type.startswith('text/html') or main_type.startswith('application/xhtml'):
+            return _parse_html_content(text_content, url, extract_links)
+
+        elif main_type == 'application/json':
+            return _parse_json_content(text_content)
+
+        elif main_type in ['application/xml', 'text/xml', 'application/rss+xml', 'application/atom+xml']:
+            return _parse_xml_content(text_content)
+
+        elif main_type.startswith('text/'):
+            return _parse_text_content(text_content, main_type)
+
+        elif main_type.startswith('image/'):
+            return _parse_image_content(content_bytes, main_type, include_binary_preview)
+
+        elif main_type == 'application/pdf':
+            return _parse_pdf_content(content_bytes, include_binary_preview)
+
+        else:
+            return _parse_binary_content(content_bytes, main_type, include_binary_preview)
+
+    except Exception as e:
+        return f"❌ Error parsing content: {str(e)}\n" \
+               f"Content-Type: {content_type}\n" \
+               f"Content size: {len(content_bytes):,} bytes"
+
+
+def _parse_html_content(html_content: str, url: str, extract_links: bool = True) -> str:
+    """Parse HTML content and extract meaningful information."""
+    if not html_content:
+        return "❌ No HTML content to parse"
+
+    result_parts = []
+    result_parts.append("🌐 HTML Document Analysis")
+
+    # Use BeautifulSoup if available for better parsing
+    if BS4_AVAILABLE:
+        try:
+            soup = BeautifulSoup(html_content, BS4_PARSER)
+
+            # Extract title
+            title = soup.find('title')
+            if title:
+                result_parts.append(f"📰 Title: {title.get_text().strip()}")
+
+            # Extract meta description
+            meta_desc = soup.find('meta', attrs={'name': 'description'})
+            if meta_desc and meta_desc.get('content'):
+                result_parts.append(f"📝 Description: {meta_desc['content'][:200]}...")
+
+            # Extract headings
+            headings = []
+            for i in range(1, 7):
+                h_tags = soup.find_all(f'h{i}')
+                for h in h_tags[:5]:  # Limit to first 5 of each level
+                    headings.append(f"H{i}: {h.get_text().strip()[:100]}")
+
+            if headings:
+                result_parts.append(f"📋 Headings (first 5 per level):")
+                for heading in headings[:10]:  # Limit total headings
+                    result_parts.append(f"  • {heading}")
+
+            # Extract links if requested
+            if extract_links:
+                links = []
+                for a in soup.find_all('a', href=True)[:20]:  # Limit to first 20 links
+                    href = a['href']
+                    text = a.get_text().strip()[:50]
+                    # Convert relative URLs to absolute
+                    if href.startswith('/'):
+                        href = urljoin(url, href)
+                    elif not href.startswith(('http://', 'https://')):
+                        href = urljoin(url, href)
+                    links.append(f"{text} → {href}")
+
+                if links:
+                    result_parts.append(f"🔗 Links (first 20):")
+                    for link in links:
+                        result_parts.append(f"  • {link}")
+
+            # Extract main text content with better cleaning
+            # Remove script, style, nav, footer, header elements for cleaner content
+            for element in soup(["script", "style", "nav", "footer", "header", "aside"]):
+                element.decompose()
+
+            # Try to find main content area first
+            main_content = soup.find(['main', 'article']) or soup.find('div', class_=lambda x: x and any(word in x.lower() for word in ['content', 'article', 'post', 'main']))
+            content_soup = main_content if main_content else soup
+
+            text = content_soup.get_text()
+            # Clean up text more efficiently
+            lines = (line.strip() for line in text.splitlines() if line.strip())
+            text = ' '.join(lines)
+            # Remove excessive whitespace
+            text = ' '.join(text.split())
+
+            if text:
+                preview_length = 500
+                text_preview = text[:preview_length]
+                if len(text) > preview_length:
+                    text_preview += "..."
+                result_parts.append(f"📄 Text Content Preview:")
+                result_parts.append(f"{text_preview}")
+                result_parts.append(f"📊 Total text length: {len(text):,} characters")
+
+        except Exception as e:
+            result_parts.append(f"⚠️ BeautifulSoup parsing error: {str(e)}")
+            result_parts.append(f"📄 Raw HTML Preview (first 1000 chars):")
+            result_parts.append(html_content[:1000] + ("..." if len(html_content) > 1000 else ""))
+
+    else:
+        # Fallback parsing without BeautifulSoup
+        result_parts.append("⚠️ BeautifulSoup not available - using basic parsing")
+
+        # Extract title with regex
+        import re
+        title_match = re.search(r'<title[^>]*>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
+        if title_match:
+            result_parts.append(f"📰 Title: {title_match.group(1).strip()}")
+
+        # Show HTML preview
+        result_parts.append(f"📄 HTML Preview (first 1000 chars):")
+        result_parts.append(html_content[:1000] + ("..." if len(html_content) > 1000 else ""))
+
+    return "\n".join(result_parts)
+
+
+def _parse_json_content(json_content: str) -> str:
+    """Parse JSON content and provide structured analysis."""
+    if not json_content:
+        return "❌ No JSON content to parse"
+
+    result_parts = []
+    result_parts.append("📊 JSON Data Analysis")
+
+    try:
+        data = json.loads(json_content)
+
+        # Analyze JSON structure
+        result_parts.append(f"📋 Structure: {type(data).__name__}")
+
+        if isinstance(data, dict):
+            result_parts.append(f"🔑 Keys ({len(data)}): {', '.join(list(data.keys())[:10])}")
+            if len(data) > 10:
+                result_parts.append(f"  ... and {len(data) - 10} more keys")
+        elif isinstance(data, list):
+            result_parts.append(f"📝 Array length: {len(data)}")
+            if data and isinstance(data[0], dict):
+                result_parts.append(f"🔑 First item keys: {', '.join(list(data[0].keys())[:10])}")
+
+        # Pretty print JSON with smart truncation
+        json_str = json.dumps(data, indent=2, ensure_ascii=False, separators=(',', ': '))
+        preview_length = 1500  # Reduced for better readability
+        if len(json_str) > preview_length:
+            # Try to truncate at a logical point (end of object/array)
+            truncate_pos = json_str.rfind('\n', 0, preview_length)
+            if truncate_pos > preview_length - 200:  # If close to limit, use it
+                json_preview = json_str[:truncate_pos] + "\n... (truncated)"
+            else:
+                json_preview = json_str[:preview_length] + "\n... (truncated)"
+        else:
+            json_preview = json_str
+
+        result_parts.append(f"📄 JSON Content:")
+        result_parts.append(json_preview)
+        result_parts.append(f"📊 Total size: {len(json_content):,} characters")
+
+    except json.JSONDecodeError as e:
+        result_parts.append(f"❌ JSON parsing error: {str(e)}")
+        result_parts.append(f"📄 Raw content preview (first 1000 chars):")
+        result_parts.append(json_content[:1000] + ("..." if len(json_content) > 1000 else ""))
+
+    return "\n".join(result_parts)
+
+
+def _parse_xml_content(xml_content: str) -> str:
+    """Parse XML content including RSS/Atom feeds."""
+    if not xml_content:
+        return "❌ No XML content to parse"
+
+    result_parts = []
+    result_parts.append("📄 XML/RSS/Atom Analysis")
+
+    try:
+        # Try to detect if it's RSS/Atom
+        if '<rss' in xml_content.lower() or '<feed' in xml_content.lower():
+            result_parts.append("📡 Detected: RSS/Atom Feed")
+
+        # Basic XML structure analysis
+        import re
+
+        # Find root element
+        root_match = re.search(r'<([^?\s/>]+)', xml_content)
+        if root_match:
+            result_parts.append(f"🏷️ Root element: <{root_match.group(1)}>")
+
+        # Count elements (basic)
+        elements = re.findall(r'<([^/\s>]+)', xml_content)
+        if elements:
+            from collections import Counter
+            element_counts = Counter(elements[:50])  # Limit analysis
+            result_parts.append(f"📊 Top elements: {dict(list(element_counts.most_common(10)))}")
+
+        # Show XML preview
+        preview_length = 1500
+        xml_preview = xml_content[:preview_length]
+        if len(xml_content) > preview_length:
+            xml_preview += "\n... (truncated)"
+
+        result_parts.append(f"📄 XML Content Preview:")
+        result_parts.append(xml_preview)
+        result_parts.append(f"📊 Total size: {len(xml_content):,} characters")
+
+    except Exception as e:
+        result_parts.append(f"❌ XML parsing error: {str(e)}")
+        result_parts.append(f"📄 Raw content preview (first 1000 chars):")
+        result_parts.append(xml_content[:1000] + ("..." if len(xml_content) > 1000 else ""))
+
+    return "\n".join(result_parts)
+
+
+def _parse_text_content(text_content: str, content_type: str) -> str:
+    """Parse plain text content."""
+    if not text_content:
+        return "❌ No text content to parse"
+
+    result_parts = []
+    result_parts.append(f"📝 Text Content Analysis ({content_type})")
+
+    # Basic text statistics
+    lines = text_content.splitlines()
+    words = text_content.split()
+
+    result_parts.append(f"📊 Statistics:")
+    result_parts.append(f"  • Lines: {len(lines):,}")
+    result_parts.append(f"  • Words: {len(words):,}")
+    result_parts.append(f"  • Characters: {len(text_content):,}")
+
+    # Show text preview
+    preview_length = 2000
+    text_preview = text_content[:preview_length]
+    if len(text_content) > preview_length:
+        text_preview += "\n... (truncated)"
+
+    result_parts.append(f"📄 Content Preview:")
+    result_parts.append(text_preview)
+
+    return "\n".join(result_parts)
+
+
+def _parse_image_content(image_bytes: bytes, content_type: str, include_preview: bool = False) -> str:
+    """Parse image content and extract metadata."""
+    result_parts = []
+    result_parts.append(f"🖼️ Image Analysis ({content_type})")
+
+    result_parts.append(f"📊 Size: {len(image_bytes):,} bytes")
+
+    # Try to get image dimensions (basic approach)
+    try:
+        if content_type.startswith('image/jpeg') or content_type.startswith('image/jpg'):
+            # Basic JPEG header parsing for dimensions
+            if image_bytes.startswith(b'\xff\xd8\xff'):
+                result_parts.append("✅ Valid JPEG format detected")
+        elif content_type.startswith('image/png'):
+            # Basic PNG header parsing
+            if image_bytes.startswith(b'\x89PNG\r\n\x1a\n'):
+                result_parts.append("✅ Valid PNG format detected")
+        elif content_type.startswith('image/gif'):
+            if image_bytes.startswith(b'GIF87a') or image_bytes.startswith(b'GIF89a'):
+                result_parts.append("✅ Valid GIF format detected")
+    except Exception:
+        pass
+
+    if include_preview:
+        # Provide base64 preview for small images
+        if len(image_bytes) <= 1048576:  # 1MB limit for preview
+            b64_preview = base64.b64encode(image_bytes[:1024]).decode('ascii')  # First 1KB
+            result_parts.append(f"🔍 Base64 Preview (first 1KB):")
+            result_parts.append(f"{b64_preview}...")
+        else:
+            result_parts.append("⚠️ Image too large for base64 preview")
+
+    result_parts.append("💡 Use image processing tools for detailed analysis")
+
+    return "\n".join(result_parts)
+
+
+def _parse_pdf_content(pdf_bytes: bytes, include_preview: bool = False) -> str:
+    """Parse PDF content and extract basic metadata."""
+    result_parts = []
+    result_parts.append("📄 PDF Document Analysis")
+
+    result_parts.append(f"📊 Size: {len(pdf_bytes):,} bytes")
+
+    # Check PDF header
+    if pdf_bytes.startswith(b'%PDF-'):
+        try:
+            version_line = pdf_bytes[:20].decode('ascii', errors='ignore')
+            result_parts.append(f"✅ Valid PDF format: {version_line.strip()}")
+        except:
+            result_parts.append("✅ Valid PDF format detected")
+    else:
+        result_parts.append("⚠️ Invalid PDF format - missing PDF header")
+
+    if include_preview:
+        # Show hex preview of first few bytes
+        hex_preview = ' '.join(f'{b:02x}' for b in pdf_bytes[:64])
+        result_parts.append(f"🔍 Hex Preview (first 64 bytes):")
+        result_parts.append(hex_preview)
+
+    result_parts.append("💡 Use PDF processing tools for text extraction and detailed analysis")
+
+    return "\n".join(result_parts)
+
+
+def _parse_binary_content(binary_bytes: bytes, content_type: str, include_preview: bool = False) -> str:
+    """Parse generic binary content."""
+    result_parts = []
+    result_parts.append(f"📦 Binary Content Analysis ({content_type})")
+
+    result_parts.append(f"📊 Size: {len(binary_bytes):,} bytes")
+
+    # Detect file type by magic bytes
+    magic_signatures = {
+        b'\x50\x4b\x03\x04': 'ZIP archive',
+        b'\x50\x4b\x05\x06': 'ZIP archive (empty)',
+        b'\x50\x4b\x07\x08': 'ZIP archive (spanned)',
+        b'\x1f\x8b\x08': 'GZIP compressed',
+        b'\x42\x5a\x68': 'BZIP2 compressed',
+        b'\x37\x7a\xbc\xaf\x27\x1c': '7-Zip archive',
+        b'\x52\x61\x72\x21\x1a\x07': 'RAR archive',
+        b'\x89\x50\x4e\x47\x0d\x0a\x1a\x0a': 'PNG image',
+        b'\xff\xd8\xff': 'JPEG image',
+        b'\x47\x49\x46\x38': 'GIF image',
+        b'\x25\x50\x44\x46': 'PDF document',
+        b'\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1': 'Microsoft Office document',
+        b'\x4d\x5a': 'Windows executable'
+    }
+
+    detected_type = None
+    for signature, file_type in magic_signatures.items():
+        if binary_bytes.startswith(signature):
+            detected_type = file_type
+            break
+
+    if detected_type:
+        result_parts.append(f"🔍 Detected format: {detected_type}")
+
+    if include_preview:
+        # Show hex preview
+        hex_preview = ' '.join(f'{b:02x}' for b in binary_bytes[:64])
+        result_parts.append(f"🔍 Hex Preview (first 64 bytes):")
+        result_parts.append(hex_preview)
+
+        # Try to show any readable ASCII strings
+        try:
+            ascii_preview = ''.join(chr(b) if 32 <= b <= 126 else '.' for b in binary_bytes[:200])
+            if ascii_preview.strip():
+                result_parts.append(f"📝 ASCII Preview (first 200 bytes):")
+                result_parts.append(ascii_preview)
+        except:
+            pass
+
+    result_parts.append("💡 Use specialized tools for detailed binary analysis")
+
+    return "\n".join(result_parts)


 @tool(
@@ -1524,5 +2173,6 @@ __all__ = [
     'write_file',
     'edit_file',
     'web_search',
+    'fetch_url',
     'execute_command'
 ]
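The `'fetch_url'` entry added to `__all__` above exports the new tool alongside the existing ones. As a minimal, hypothetical usage sketch (it assumes the tool can be imported directly from `abstractcore.tools.common_tools`, the module shown in this diff; the preferred public import path is not confirmed here), using only parameters visible in the signature above:

    from abstractcore.tools.common_tools import fetch_url  # import path is an assumption

    # GET a JSON API; the tool returns a formatted analysis string
    print(fetch_url("https://api.github.com/repos/python/cpython",
                    headers={"Accept": "application/json"}))

    # POST form data with a smaller size cap and no redirect following
    print(fetch_url("https://httpbin.org/post", method="POST",
                    data={"key": "value"},
                    max_content_length=1048576,
                    follow_redirects=False))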
abstractcore/utils/version.py
CHANGED

@@ -11,4 +11,4 @@ including when the package is installed from PyPI where pyproject.toml is not av

 # Package version - update this when releasing new versions
 # This must be manually synchronized with the version in pyproject.toml
-__version__ = "2.4.5"
+__version__ = "2.4.7"
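To confirm which release is installed, the version module bumped above can be read directly; a minimal sketch, assuming the module is importable at the path listed in the file summary:

    from abstractcore.utils.version import __version__  # module path taken from this diff

    print(__version__)  # expected to print "2.4.7" for this release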