thordata-mcp-server 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- thordata_mcp/__init__.py +1 -1
- thordata_mcp/browser_session.py +157 -12
- thordata_mcp/config.py +14 -3
- thordata_mcp/context.py +1 -1
- thordata_mcp/tools/data/browser.py +124 -18
- thordata_mcp/tools/debug.py +125 -0
- thordata_mcp/tools/params_utils.py +107 -0
- thordata_mcp/tools/product.py +83 -5
- thordata_mcp/tools/product_compact.py +2108 -962
- thordata_mcp/tools/utils.py +2 -0
- thordata_mcp/utils.py +393 -322
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/METADATA +29 -54
- thordata_mcp_server-0.5.0.dist-info/RECORD +26 -0
- thordata_mcp_server-0.4.4.dist-info/RECORD +0 -24
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/WHEEL +0 -0
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/entry_points.txt +0 -0
- {thordata_mcp_server-0.4.4.dist-info → thordata_mcp_server-0.5.0.dist-info}/top_level.txt +0 -0
thordata_mcp/utils.py
CHANGED
@@ -1,322 +1,393 @@
[322 removed lines not shown]
+"""Common utility helpers for Thordata MCP tools."""
+from __future__ import annotations
+
+import functools
+import html2text
+import json
+import logging
+import uuid
+from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
+from typing import Any, Callable, Optional
+
+from markdownify import markdownify as md
+from thordata import (
+    ThordataAPIError,
+    ThordataConfigError,
+    ThordataNetworkError,
+)
+
+logger = logging.getLogger("thordata_mcp")
+
+
+# ---------------------------------------------------------------------------
+# Enhanced error diagnostics
+# ---------------------------------------------------------------------------
+
+def get_error_suggestion(error_type: str, url: Optional[str] = None) -> str:
+    """
+    Provide helpful suggestions based on error type.
+
+    Args:
+        error_type: Type of error encountered
+        url: Optional URL that caused the error
+
+    Returns:
+        Helpful suggestion string
+    """
+    suggestions = {
+        "timeout": "The request timed out. Try enabling JS rendering or check if the site is accessible.",
+        "blocked": "The request was blocked (403/CAPTCHA). The site may have anti-bot protection.",
+        "parse_failed": "Failed to parse the response. The site structure may have changed.",
+        "not_found": "The requested resource was not found (404).",
+        "upstream_timeout": "The upstream service timed out (504). Try again later.",
+        "upstream_internal_error": "The upstream service encountered an error (500). Try again later.",
+        "network_error": "Network error occurred. Check your internet connection and Thordata service status.",
+        "config_error": "Configuration error. Check your API credentials in .env file.",
+        "auth_error": "Authentication failed. Verify THORDATA_PUBLIC_TOKEN/THORDATA_PUBLIC_KEY/THORDATA_SCRAPER_TOKEN in .env match your Dashboard credentials.",
+        "validation_error": "Parameter validation failed. Ensure 'params' is a dictionary object, not a string. Example: params={'url': 'https://example.com'}",
+        "json_error": "Invalid JSON in params. Use dictionary format: params={'url': 'https://example.com'} or valid JSON string: params='{\"url\":\"https://example.com\"}'",
+    }
+
+    suggestion = suggestions.get(error_type, "An unexpected error occurred.")
+
+    if url and error_type == "timeout":
+        suggestion += f" URL: {url}"
+
+    return suggestion
+
+
+def diagnose_scraping_error(error: Exception, url: Optional[str] = None) -> dict[str, Any]:
+    """
+    Diagnose a scraping error and provide detailed information.
+
+    Args:
+        error: The exception that occurred
+        url: Optional URL that was being scraped
+
+    Returns:
+        Dictionary with diagnostic information
+    """
+    error_info = {
+        "error_type": type(error).__name__,
+        "error_message": str(error),
+        "url": url,
+        "timestamp": logging.Formatter().formatTime(logging.LogRecord(
+            name="", level=0, pathname="", lineno=0,
+            msg="", args=(), exc_info=None
+        )),
+    }
+
+    # Add specific diagnostics based on error type
+    if isinstance(error, ThordataAPIError):
+        error_info["api_code"] = getattr(error, "code", None)
+        error_info["api_payload"] = getattr(error, "payload", None)
+        # Provide a best-effort suggestion based on backend error content.
+        payload = error_info.get("api_payload")
+        msg = str(getattr(error, "message", str(error)) or "")
+        payload_s = ""
+        if isinstance(payload, dict):
+            payload_s = " ".join(str(v) for v in payload.values())
+        combined = f"{msg} {payload_s}".lower()
+        if "sign authentication failed" in combined or "authentication failed" in combined or "invalid signature" in combined:
+            error_info["suggestion"] = get_error_suggestion("auth_error", url)
+        else:
+            error_info["suggestion"] = get_error_suggestion("upstream_internal_error", url)
+    elif isinstance(error, ThordataNetworkError):
+        error_info["suggestion"] = get_error_suggestion("network_error", url)
+    elif isinstance(error, ThordataConfigError):
+        error_info["suggestion"] = get_error_suggestion("config_error", url)
+    elif "timeout" in str(error).lower():
+        error_info["suggestion"] = get_error_suggestion("timeout", url)
+    elif isinstance(error, ValueError) and "params" in str(error).lower():
+        error_info["suggestion"] = get_error_suggestion("validation_error", url)
+    elif isinstance(error, json.JSONDecodeError) and "params" in str(error).lower():
+        error_info["suggestion"] = get_error_suggestion("json_error", url)
+    else:
+        error_info["suggestion"] = "An unexpected error occurred. Check logs for details."
+
+    return error_info
+
+
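A quick sketch of how these diagnostics compose (illustrative only; the `ThordataAPIError("...")` construction is an assumption, since the SDK's constructor signature is not shown in this diff):

# Illustrative sketch; exception construction is assumed.
err = ThordataAPIError("sign authentication failed")
info = diagnose_scraping_error(err, url="https://example.com")
# info["error_type"] == "ThordataAPIError"
# info["suggestion"] carries the "auth_error" hint from get_error_suggestion()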
+# ---------------------------------------------------------------------------
+# Safe Context helpers (for HTTP mode compatibility)
+# ---------------------------------------------------------------------------
+
+async def safe_ctx_info(ctx: Optional[Any], message: str) -> None:
+    """Safely call ctx.info() if context is available and valid.
+
+    In HTTP mode, ctx may exist but not be a valid MCP Context,
+    so we wrap the call in try-except to avoid errors.
+    """
+    if ctx is None:
+        return
+    try:
+        await ctx.info(message)
+    except (ValueError, AttributeError):
+        # Context not available (e.g., HTTP mode) - silently skip
+        pass
+
+
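Usage is fire-and-forget; a minimal sketch (the surrounding tool body is hypothetical):

# Inside an async tool body, where `ctx` may be a real MCP Context,
# None, or an invalid object (HTTP mode):
await safe_ctx_info(ctx, "Fetching https://example.com ...")
# Never raises: a missing or broken context is silently ignored.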
+# ---------------------------------------------------------------------------
+# Structured response helpers (LLM-friendly)
+# ---------------------------------------------------------------------------
+
+def ok_response(
+    *,
+    tool: str,
+    input: dict[str, Any],
+    output: Any,
+    request_id: str | None = None,
+) -> dict[str, Any]:
+    rid = request_id or uuid.uuid4().hex
+    return {"ok": True, "tool": tool, "request_id": rid, "input": input, "output": output}
+
+
+def error_response(
+    *,
+    tool: str,
+    input: dict[str, Any],
+    error_type: str,
+    message: str,
+    details: Any | None = None,
+    code: str = "E0000",
+    request_id: str | None = None,
+) -> dict[str, Any]:
+    """Return a standardized error dict with machine-readable code."""
+    rid = request_id or uuid.uuid4().hex
+    return {
+        "ok": False,
+        "tool": tool,
+        "request_id": rid,
+        "input": input,
+        "error": {"type": error_type, "code": code, "message": message, "details": details},
+    }
+
+
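For reference, the envelopes these helpers produce look like this (values illustrative; request_id is a random 32-character hex string):

ok = ok_response(tool="scrape", input={"url": "https://example.com"}, output={"title": "Example"})
# {"ok": True, "tool": "scrape", "request_id": "3f2a...", "input": {...}, "output": {...}}
err = error_response(tool="scrape", input={"url": "https://example.com"},
                     error_type="blocked", code="E2101", message="403 Forbidden")
# err["error"] == {"type": "blocked", "code": "E2101", "message": "403 Forbidden", "details": None}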
+# ---------------------------------------------------------------------------
+# Decorator to convert SDK exceptions to structured output
+# ---------------------------------------------------------------------------
+
+def handle_mcp_errors(func: Callable) -> Callable:  # noqa: D401
+    """Wrap a tool so it always returns dict instead of raising SDK errors."""
+
+    @functools.wraps(func)
+    async def wrapper(*args, **kwargs):  # type: ignore[return-value]
+        try:
+            return await func(*args, **kwargs)
+        except ThordataConfigError as e:
+            logger.error("Config error in %s: %s", func.__name__, e)
+            return error_response(
+                tool=func.__name__,
+                input={k: v for k, v in kwargs.items() if k != "ctx"},
+                error_type="config_error",
+                code="E1001",
+                message="Missing or invalid credentials.",
+                details=str(e),
+            )
+        except ThordataAPIError as e:
+            logger.error("API error in %s: %s", func.__name__, e)
+            msg = getattr(e, "message", str(e))
+            payload = getattr(e, "payload", None)
+            code = getattr(e, "code", None)
+            # Try to normalize common backend codes/messages for better UX
+            error_type = "api_error"
+            norm_code = "E2001"
+            msg_l = str(msg).lower()
+            if isinstance(payload, dict):
+                msg = payload.get("msg", msg)
+                # Some backend errors embed more detail in payload fields
+                if isinstance(payload.get("error"), str) and not msg:
+                    msg = payload["error"]
+                if isinstance(payload.get("message"), str) and not msg:
+                    msg = payload["message"]
+            # Heuristics for frequent categories
+            if "captcha" in msg_l or "403" in msg_l:
+                error_type = "blocked"
+                norm_code = "E2101"
+            elif "sign authentication failed" in msg_l or "authentication failed" in msg_l or "invalid signature" in msg_l:
+                error_type = "auth_error"
+                norm_code = "E1002"
+            elif "not collected" in msg_l or "failed to parse" in msg_l:
+                error_type = "parse_failed"
+                norm_code = "E2102"
+            elif "not exist" in msg_l or "404" in msg_l:
+                error_type = "not_found"
+                norm_code = "E2104"
+            elif "504" in msg_l or "gateway timeout" in msg_l:
+                error_type = "upstream_timeout"
+                norm_code = "E2105"
+            elif "500" in msg_l or "internal server error" in msg_l:
+                error_type = "upstream_internal_error"
+                norm_code = "E2106"
+            elif "subtitles_error" in msg_l or "unable to download api page" in msg_l:
+                error_type = "media_backend_error"
+                norm_code = "E2107"
+
+            # Attach richer diagnostics without breaking existing callers
+            url = None
+            if "url" in kwargs:
+                url = kwargs.get("url")
+            elif "params" in kwargs and isinstance(kwargs.get("params"), dict):
+                url = kwargs["params"].get("url")
+
+            diagnostic = diagnose_scraping_error(e, url=url)
+            return error_response(
+                tool=func.__name__,
+                input={k: v for k, v in kwargs.items() if k != "ctx"},
+                error_type=error_type,
+                code=norm_code,
+                message=msg,
+                details={"code": code, "payload": payload, "diagnostic": diagnostic},
+            )
+        except ThordataNetworkError as e:
+            err_str = str(e)
+            if "Task" in err_str and "failed" in err_str:
+                error_code = "E3001"
+                err_type = "task_failed"
+                msg = "Scraping task failed."
+            else:
+                error_code = "E2002"
+                err_type = "network_error"
+                msg = "Network error: could not reach Thordata services."
+
+            url = None
+            if "url" in kwargs:
+                url = kwargs.get("url")
+            elif "params" in kwargs and isinstance(kwargs.get("params"), dict):
+                url = kwargs["params"].get("url")
+
+            diagnostic = diagnose_scraping_error(e, url=url)
+            return error_response(
+                tool=func.__name__,
+                input={k: v for k, v in kwargs.items() if k != "ctx"},
+                error_type=err_type,
+                code=error_code,
+                message=msg,
+                details={"raw_error": err_str, "diagnostic": diagnostic},
+            )
+        except Exception as e:  # pragma: no cover
+            # Use logger.error instead of logger.exception to avoid rich traceback issues
+            logger.error("Unexpected error in %s: %s", func.__name__, str(e), exc_info=False)
+            return error_response(
+                tool=func.__name__,
+                input={k: v for k, v in kwargs.items() if k != "ctx"},
+                error_type="unexpected_error",
+                code="E9000",
+                message=str(e),
+            )
+
+    return wrapper
+
+
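A sketch of how a tool picks this up; the `fetch_page` tool below is hypothetical and the exception construction is assumed, not taken from this diff:

# Hypothetical tool for illustration; not part of the package.
@handle_mcp_errors
async def fetch_page(url: str, ctx=None) -> dict:
    raise ThordataNetworkError("Task 123 failed")  # simulate an SDK failure

# Inside an async context:
result = await fetch_page(url="https://example.com")
# No exception escapes; the wrapper returns a structured dict instead:
# {"ok": False, "tool": "fetch_page", "error": {"type": "task_failed", "code": "E3001", ...}, ...}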
+# ---------------------------------------------------------------------------
+# Helpers for HTML → Markdown & truncation
+# ---------------------------------------------------------------------------
+
+def _strip_large_data_urls(html: str, *, max_keep_chars: int = 256) -> str:
+    """Remove large inlined data: URLs (base64 fonts/images) to reduce token bloat.
+
+    Keeps small data URLs (<= max_keep_chars) to avoid breaking tiny icons.
+    """
+    import re
+
+    def _repl(m: re.Match[str]) -> str:
+        s = m.group(0)
+        if len(s) <= max_keep_chars:
+            return s
+        return 'data:...'
+
+    # Replace any data:... sequences inside quotes.
+    return re.sub(r"data:[^\s\"']+", _repl, html)
+
+
+def _extract_readable_html(html: str) -> str:
+    """Best-effort extraction of main readable content.
+
+    Prefer <main> or <article>. Fall back to full document if not found.
+    """
+    import re
+
+    # Very lightweight heuristics (no extra deps): keep the largest <main>/<article> block.
+    candidates: list[str] = []
+    for tag in ("main", "article"):
+        pattern = re.compile(rf"<{tag}[^>]*>([\s\S]*?)</{tag}>", re.IGNORECASE)
+        for m in pattern.finditer(html):
+            block = m.group(0)
+            if block:
+                candidates.append(block)
+    if not candidates:
+        return html
+    candidates.sort(key=len, reverse=True)
+    return candidates[0]
+
+
+def html_to_markdown_clean(html: str) -> str:
+    try:
+        html = _strip_large_data_urls(html)
+        html = _extract_readable_html(html)
+        text = md(html, heading_style="ATX", strip=["script", "style", "noscript", "nav", "footer", "iframe", "svg"])
+        lines = [line.rstrip() for line in text.splitlines()]
+        return "\n".join(line for line in lines if line)
+    except Exception:
+        h = html2text.HTML2Text()
+        h.ignore_links = False
+        return h.handle(html)
+
+
+def truncate_content(content: str, max_length: int = 20_000) -> str:
+    if len(content) <= max_length:
+        return content
+    return content[:max_length] + f"\n\n... [Content Truncated, original length: {len(content)} chars]"
+
+
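The two public helpers are typically chained; a small sketch:

html = "<html><body><main><h1>Title</h1><p>Body text</p></main></body></html>"
markdown = html_to_markdown_clean(html)  # roughly "# Title\nBody text" (blank lines dropped)
snippet = truncate_content(markdown)     # unchanged here: well under the 20_000-char default limit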
+# ---------------------------------------------------------------------------
+# Download URL helpers
+# ---------------------------------------------------------------------------
+
+def enrich_download_url(download_url: str, *, task_id: str | None = None, file_type: str | None = None) -> str:
+    """Ensure returned download URLs are directly usable in a browser.
+
+    Some SDK / backend paths may return a URL missing required query params such as
+    `api_key` and `plat`, leading to {"error":"Missing necessary parameters."}.
+    """
+    try:
+        from .config import settings
+    except Exception:  # pragma: no cover
+        settings = None  # type: ignore[assignment]
+
+    token = getattr(settings, "THORDATA_SCRAPER_TOKEN", None) if settings else None
+    plat = getattr(settings, "THORDATA_DOWNLOAD_PLAT", "1") if settings else "1"
+    base = getattr(settings, "THORDATA_DOWNLOAD_BASE_URL", "https://scraperapi.thordata.com/download") if settings else "https://scraperapi.thordata.com/download"
+
+    # If we can't enrich (no token), return as-is.
+    if not token:
+        return download_url
+
+    parsed = urlparse(download_url)
+    qs = dict(parse_qsl(parsed.query, keep_blank_values=True))
+
+    # Backfill known parameters
+    if "api_key" not in qs:
+        qs["api_key"] = token
+    if "plat" not in qs and plat:
+        qs["plat"] = plat
+    if "task_id" not in qs and task_id:
+        qs["task_id"] = task_id
+    if "type" not in qs and file_type:
+        qs["type"] = file_type
+
+    # If SDK returned a relative/alternate host, normalize to configured base
+    if not parsed.scheme or not parsed.netloc:
+        parsed = urlparse(base)
+    elif parsed.path.rstrip("/") != urlparse(base).path.rstrip("/"):
+        # Keep original host, only fix query; unless path looks non-download
+        pass
+
+    new_query = urlencode(qs, doseq=True)
+    new_parsed = parsed._replace(query=new_query)
+    # If original URL had a different host/path but is a valid absolute URL, preserve them.
+    if parsed.scheme and parsed.netloc and urlparse(download_url).scheme and urlparse(download_url).netloc:
+        orig = urlparse(download_url)
+        new_parsed = orig._replace(query=new_query)
+
+    return urlunparse(new_parsed)
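Assuming THORDATA_SCRAPER_TOKEN is set in the configuration, the enrichment backfills the missing query parameters while preserving an absolute URL's host and path (token value illustrative):

url = enrich_download_url(
    "https://scraperapi.thordata.com/download?task_id=abc123",
    file_type="json",
)
# -> "https://scraperapi.thordata.com/download?task_id=abc123&api_key=<token>&plat=1&type=json"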