thordata-mcp-server 0.4.4__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
thordata_mcp/utils.py CHANGED
@@ -1,322 +1,393 @@
1
- """Common utility helpers for Thordata MCP tools."""
2
- from __future__ import annotations
3
-
4
- import functools
5
- import html2text
6
- import logging
7
- from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
8
- from typing import Any, Callable, Optional
9
-
10
- from markdownify import markdownify as md
11
- from thordata import (
12
- ThordataAPIError,
13
- ThordataConfigError,
14
- ThordataNetworkError,
15
- )
16
-
17
- logger = logging.getLogger("thordata_mcp")
18
-
19
-
20
- # ---------------------------------------------------------------------------
21
- # Enhanced error diagnostics
22
- # ---------------------------------------------------------------------------
23
-
24
def get_error_suggestion(error_type: str, url: Optional[str] = None) -> str:
    """Return a human-readable hint for a known error category.

    Args:
        error_type: Category key such as ``"timeout"`` or ``"blocked"``.
        url: URL involved in the failure. It is appended to the hint only
            for the ``"timeout"`` category.

    Returns:
        The hint registered for ``error_type``, or a generic message when
        the category is not recognised.
    """
    hints = {
        "timeout": "The request timed out. Try enabling JS rendering or check if the site is accessible.",
        "blocked": "The request was blocked (403/CAPTCHA). The site may have anti-bot protection.",
        "parse_failed": "Failed to parse the response. The site structure may have changed.",
        "not_found": "The requested resource was not found (404).",
        "upstream_timeout": "The upstream service timed out (504). Try again later.",
        "upstream_internal_error": "The upstream service encountered an error (500). Try again later.",
        "network_error": "Network error occurred. Check your internet connection and Thordata service status.",
        "config_error": "Configuration error. Check your API credentials in .env file.",
    }

    if error_type not in hints:
        return "An unexpected error occurred."

    hint = hints[error_type]
    # Only timeouts get the URL echoed back to the caller.
    if error_type == "timeout" and url:
        hint = f"{hint} URL: {url}"
    return hint
52
-
53
-
54
def diagnose_scraping_error(error: Exception, url: Optional[str] = None) -> dict[str, Any]:
    """Build a diagnostic record describing a scraping failure.

    Args:
        error: The exception that occurred.
        url: URL that was being scraped, if known.

    Returns:
        A dict with ``error_type`` (exception class name), ``error_message``,
        ``url``, ``timestamp`` and ``suggestion``. For ``ThordataAPIError``
        it also carries ``api_code`` and ``api_payload``.
    """
    # A throwaway LogRecord exists only so Formatter.formatTime can render a
    # timestamp in the logging module's default format.
    stamp_record = logging.LogRecord(
        name="", level=0, pathname="", lineno=0,
        msg="", args=(), exc_info=None
    )
    report: dict[str, Any] = {
        "error_type": type(error).__name__,
        "error_message": str(error),
        "url": url,
        "timestamp": logging.Formatter().formatTime(stamp_record),
    }

    if isinstance(error, ThordataAPIError):
        report["api_code"] = getattr(error, "code", None)
        report["api_payload"] = getattr(error, "payload", None)
        # API errors always get the generic upstream hint here.
        hint = get_error_suggestion("upstream_internal_error", url)
    elif isinstance(error, ThordataNetworkError):
        hint = get_error_suggestion("network_error", url)
    elif isinstance(error, ThordataConfigError):
        hint = get_error_suggestion("config_error", url)
    elif "timeout" in str(error).lower():
        hint = get_error_suggestion("timeout", url)
    else:
        hint = "An unexpected error occurred. Check logs for details."

    report["suggestion"] = hint
    return report
91
-
92
-
93
- # ---------------------------------------------------------------------------
94
- # Safe Context helpers (for HTTP mode compatibility)
95
- # ---------------------------------------------------------------------------
96
-
97
async def safe_ctx_info(ctx: Optional[Any], message: str) -> None:
    """Forward ``message`` to ``ctx.info()`` when a usable context exists.

    In HTTP mode ``ctx`` may be present without being a valid MCP Context,
    so ``ValueError`` and ``AttributeError`` from the call are swallowed.
    Any other exception still propagates.
    """
    if ctx is not None:
        try:
            await ctx.info(message)
        except (ValueError, AttributeError):
            # Context not usable (e.g. HTTP mode): skip the notification.
            return
110
-
111
-
112
- # ---------------------------------------------------------------------------
113
- # Structured response helpers (LLM-friendly)
114
- # ---------------------------------------------------------------------------
115
-
116
def ok_response(*, tool: str, input: dict[str, Any], output: Any) -> dict[str, Any]:
    """Return the standard success envelope for a tool call."""
    envelope: dict[str, Any] = {"ok": True}
    envelope["tool"] = tool
    envelope["input"] = input
    envelope["output"] = output
    return envelope
118
-
119
-
120
- def error_response(
121
- *,
122
- tool: str,
123
- input: dict[str, Any],
124
- error_type: str,
125
- message: str,
126
- details: Any | None = None,
127
- code: str = "E0000",
128
- ) -> dict[str, Any]:
129
- """Return a standardized error dict with machine-readable code."""
130
- return {
131
- "ok": False,
132
- "tool": tool,
133
- "input": input,
134
- "error": {"type": error_type, "code": code, "message": message, "details": details},
135
- }
136
-
137
-
138
- # ---------------------------------------------------------------------------
139
- # Decorator to convert SDK exceptions to structured output
140
- # ---------------------------------------------------------------------------
141
-
142
def handle_mcp_errors(func: Callable) -> Callable:  # noqa: D401
    """Wrap an async tool so failures come back as structured dicts.

    Thordata config, API and network errors, and any other ``Exception``,
    are converted into ``error_response(...)`` results instead of being
    raised. The ``ctx`` keyword argument is never echoed in ``input``;
    positional arguments are not echoed either.
    """
    # Ordered heuristics applied to the lower-cased exception message;
    # the first rule with a matching needle wins.
    api_rules = (
        (("captcha", "403"), "blocked", "E2101"),
        (("not collected", "failed to parse"), "parse_failed", "E2102"),
        (("not exist", "404"), "not_found", "E2104"),
        (("504", "gateway timeout"), "upstream_timeout", "E2105"),
        (("500", "internal server error"), "upstream_internal_error", "E2106"),
        (("subtitles_error", "unable to download api page"), "media_backend_error", "E2107"),
    )

    def _echo_input(call_kwargs):
        # Everything the caller passed by keyword, minus the MCP context.
        return {k: v for k, v in call_kwargs.items() if k != "ctx"}

    def _requested_url(call_kwargs):
        # A top-level "url" wins even when its value is None.
        if "url" in call_kwargs:
            return call_kwargs.get("url")
        if "params" in call_kwargs and isinstance(call_kwargs.get("params"), dict):
            return call_kwargs["params"].get("url")
        return None

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):  # type: ignore[return-value]
        try:
            return await func(*args, **kwargs)
        except ThordataConfigError as exc:
            logger.error("Config error in %s: %s", func.__name__, exc)
            return error_response(
                tool=func.__name__,
                input=_echo_input(kwargs),
                error_type="config_error",
                code="E1001",
                message="Missing or invalid credentials.",
                details=str(exc),
            )
        except ThordataAPIError as exc:
            logger.error("API error in %s: %s", func.__name__, exc)
            shown = getattr(exc, "message", str(exc))
            payload = getattr(exc, "payload", None)
            backend_code = getattr(exc, "code", None)

            # Classification uses the exception message as it was before
            # any payload field replaces the displayed message.
            lowered = str(shown).lower()
            category, std_code = "api_error", "E2001"
            for needles, rule_type, rule_code in api_rules:
                if any(needle in lowered for needle in needles):
                    category, std_code = rule_type, rule_code
                    break

            if isinstance(payload, dict):
                shown = payload.get("msg", shown)
                # Fall back to other payload fields only when "msg" is empty.
                if isinstance(payload.get("error"), str) and not shown:
                    shown = payload["error"]
                if isinstance(payload.get("message"), str) and not shown:
                    shown = payload["message"]

            diagnostic = diagnose_scraping_error(exc, url=_requested_url(kwargs))
            return error_response(
                tool=func.__name__,
                input=_echo_input(kwargs),
                error_type=category,
                code=std_code,
                message=shown,
                details={"code": backend_code, "payload": payload, "diagnostic": diagnostic},
            )
        except ThordataNetworkError as exc:
            raw = str(exc)
            if "Task" in raw and "failed" in raw:
                std_code, category = "E3001", "task_failed"
                shown = "Scraping task failed."
            else:
                std_code, category = "E2002", "network_error"
                shown = "Network error: could not reach Thordata services."

            diagnostic = diagnose_scraping_error(exc, url=_requested_url(kwargs))
            return error_response(
                tool=func.__name__,
                input=_echo_input(kwargs),
                error_type=category,
                code=std_code,
                message=shown,
                details={"raw_error": raw, "diagnostic": diagnostic},
            )
        except Exception as exc:  # pragma: no cover
            # logger.error rather than logger.exception: avoids rich traceback issues.
            logger.error("Unexpected error in %s: %s", func.__name__, str(exc), exc_info=False)
            return error_response(
                tool=func.__name__,
                input=_echo_input(kwargs),
                error_type="unexpected_error",
                code="E9000",
                message=str(exc),
            )

    return wrapper
249
-
250
-
251
- # ---------------------------------------------------------------------------
252
- # Helpers for HTML → Markdown & truncation
253
- # ---------------------------------------------------------------------------
254
-
255
def html_to_markdown_clean(html: str) -> str:
    """Convert HTML to Markdown with blank lines removed.

    ``markdownify`` is tried first with ATX headings and script/style/nav/
    footer/iframe tags stripped. If that raises, ``html2text`` (links kept)
    is used on the same input and its output is returned as-is.
    """
    try:
        markdown = md(html, heading_style="ATX", strip=["script", "style", "nav", "footer", "iframe"])
        # Trim trailing whitespace, then drop lines that end up empty.
        kept = filter(None, (raw.rstrip() for raw in markdown.splitlines()))
        return "\n".join(kept)
    except Exception:
        fallback = html2text.HTML2Text()
        fallback.ignore_links = False
        return fallback.handle(html)
264
-
265
-
266
def truncate_content(content: str, max_length: int = 20_000) -> str:
    """Cut ``content`` to ``max_length`` characters and append a notice.

    Content no longer than ``max_length`` is returned unchanged. The notice
    reports the original length and is added on top of ``max_length``.
    """
    total = len(content)
    if total > max_length:
        notice = f"\n\n... [Content Truncated, original length: {total} chars]"
        return content[:max_length] + notice
    return content
270
-
271
-
272
- # ---------------------------------------------------------------------------
273
- # Download URL helpers
274
- # ---------------------------------------------------------------------------
275
-
276
- def enrich_download_url(download_url: str, *, task_id: str | None = None, file_type: str | None = None) -> str:
277
- """Ensure returned download URLs are directly usable in a browser.
278
-
279
- Some SDK / backend paths may return a URL missing required query params such as
280
- `api_key` and `plat`, leading to {"error":"Missing necessary parameters."}.
281
- """
282
- try:
283
- from .config import settings
284
- except Exception: # pragma: no cover
285
- settings = None # type: ignore[assignment]
286
-
287
- token = getattr(settings, "THORDATA_SCRAPER_TOKEN", None) if settings else None
288
- plat = getattr(settings, "THORDATA_DOWNLOAD_PLAT", "1") if settings else "1"
289
- base = getattr(settings, "THORDATA_DOWNLOAD_BASE_URL", "https://scraperapi.thordata.com/download") if settings else "https://scraperapi.thordata.com/download"
290
-
291
- # If we can't enrich (no token), return as-is.
292
- if not token:
293
- return download_url
294
-
295
- parsed = urlparse(download_url)
296
- qs = dict(parse_qsl(parsed.query, keep_blank_values=True))
297
-
298
- # Backfill known parameters
299
- if "api_key" not in qs:
300
- qs["api_key"] = token
301
- if "plat" not in qs and plat:
302
- qs["plat"] = plat
303
- if "task_id" not in qs and task_id:
304
- qs["task_id"] = task_id
305
- if "type" not in qs and file_type:
306
- qs["type"] = file_type
307
-
308
- # If SDK returned a relative/alternate host, normalize to configured base
309
- if not parsed.scheme or not parsed.netloc:
310
- parsed = urlparse(base)
311
- elif parsed.path.rstrip("/") != urlparse(base).path.rstrip("/"):
312
- # Keep original host, only fix query; unless path looks non-download
313
- pass
314
-
315
- new_query = urlencode(qs, doseq=True)
316
- new_parsed = parsed._replace(query=new_query)
317
- # If original URL had a different host/path but is a valid absolute URL, preserve them.
318
- if parsed.scheme and parsed.netloc and urlparse(download_url).scheme and urlparse(download_url).netloc:
319
- orig = urlparse(download_url)
320
- new_parsed = orig._replace(query=new_query)
321
-
322
- return urlunparse(new_parsed)
1
+ """Common utility helpers for Thordata MCP tools."""
2
+ from __future__ import annotations
3
+
4
+ import functools
5
+ import html2text
6
+ import json
7
+ import logging
8
+ import uuid
9
+ from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
10
+ from typing import Any, Callable, Optional
11
+
12
+ from markdownify import markdownify as md
13
+ from thordata import (
14
+ ThordataAPIError,
15
+ ThordataConfigError,
16
+ ThordataNetworkError,
17
+ )
18
+
19
+ logger = logging.getLogger("thordata_mcp")
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Enhanced error diagnostics
24
+ # ---------------------------------------------------------------------------
25
+
26
def get_error_suggestion(error_type: str, url: Optional[str] = None) -> str:
    """Return a human-readable hint for a known error category.

    Args:
        error_type: Category key such as ``"timeout"``, ``"auth_error"`` or
            ``"validation_error"``.
        url: URL involved in the failure. It is appended to the hint only
            for the ``"timeout"`` category.

    Returns:
        The hint registered for ``error_type``, or a generic message when
        the category is not recognised.
    """
    hints = {
        "timeout": "The request timed out. Try enabling JS rendering or check if the site is accessible.",
        "blocked": "The request was blocked (403/CAPTCHA). The site may have anti-bot protection.",
        "parse_failed": "Failed to parse the response. The site structure may have changed.",
        "not_found": "The requested resource was not found (404).",
        "upstream_timeout": "The upstream service timed out (504). Try again later.",
        "upstream_internal_error": "The upstream service encountered an error (500). Try again later.",
        "network_error": "Network error occurred. Check your internet connection and Thordata service status.",
        "config_error": "Configuration error. Check your API credentials in .env file.",
        "auth_error": "Authentication failed. Verify THORDATA_PUBLIC_TOKEN/THORDATA_PUBLIC_KEY/THORDATA_SCRAPER_TOKEN in .env match your Dashboard credentials.",
        "validation_error": "Parameter validation failed. Ensure 'params' is a dictionary object, not a string. Example: params={'url': 'https://example.com'}",
        "json_error": "Invalid JSON in params. Use dictionary format: params={'url': 'https://example.com'} or valid JSON string: params='{\"url\":\"https://example.com\"}'",
    }

    if error_type not in hints:
        return "An unexpected error occurred."

    hint = hints[error_type]
    # Only timeouts get the URL echoed back to the caller.
    if error_type == "timeout" and url:
        hint = f"{hint} URL: {url}"
    return hint
57
+
58
+
59
def diagnose_scraping_error(error: Exception, url: Optional[str] = None) -> dict[str, Any]:
    """Build a diagnostic record describing a scraping failure.

    Args:
        error: The exception that occurred.
        url: URL that was being scraped, if known.

    Returns:
        A dict with ``error_type`` (exception class name), ``error_message``,
        ``url``, ``timestamp`` and ``suggestion``. For ``ThordataAPIError``
        it also carries ``api_code`` and ``api_payload``.
    """
    # A throwaway LogRecord exists only so Formatter.formatTime can render a
    # timestamp in the logging module's default format.
    stamp_record = logging.LogRecord(
        name="", level=0, pathname="", lineno=0,
        msg="", args=(), exc_info=None
    )
    error_info: dict[str, Any] = {
        "error_type": type(error).__name__,
        "error_message": str(error),
        "url": url,
        "timestamp": logging.Formatter().formatTime(stamp_record),
    }
    error_text = str(error).lower()

    if isinstance(error, ThordataAPIError):
        payload = getattr(error, "payload", None)
        error_info["api_code"] = getattr(error, "code", None)
        error_info["api_payload"] = payload

        # Best-effort hint: look for auth failures in the exception message
        # and in every payload value.
        msg = str(getattr(error, "message", str(error)) or "")
        payload_s = ""
        if isinstance(payload, dict):
            payload_s = " ".join(str(v) for v in payload.values())
        combined = f"{msg} {payload_s}".lower()
        auth_markers = ("sign authentication failed", "authentication failed", "invalid signature")
        if any(marker in combined for marker in auth_markers):
            suggestion = get_error_suggestion("auth_error", url)
        else:
            suggestion = get_error_suggestion("upstream_internal_error", url)
    elif isinstance(error, ThordataNetworkError):
        suggestion = get_error_suggestion("network_error", url)
    elif isinstance(error, ThordataConfigError):
        suggestion = get_error_suggestion("config_error", url)
    elif "timeout" in error_text:
        suggestion = get_error_suggestion("timeout", url)
    elif isinstance(error, json.JSONDecodeError) and "params" in error_text:
        # Must come before the ValueError check: JSONDecodeError is a
        # ValueError subclass and would otherwise never reach this branch.
        suggestion = get_error_suggestion("json_error", url)
    elif isinstance(error, ValueError) and "params" in error_text:
        suggestion = get_error_suggestion("validation_error", url)
    else:
        suggestion = "An unexpected error occurred. Check logs for details."

    error_info["suggestion"] = suggestion
    return error_info
109
+
110
+
111
+ # ---------------------------------------------------------------------------
112
+ # Safe Context helpers (for HTTP mode compatibility)
113
+ # ---------------------------------------------------------------------------
114
+
115
async def safe_ctx_info(ctx: Optional[Any], message: str) -> None:
    """Forward ``message`` to ``ctx.info()`` when a usable context exists.

    In HTTP mode ``ctx`` may be present without being a valid MCP Context,
    so ``ValueError`` and ``AttributeError`` from the call are swallowed.
    Any other exception still propagates.
    """
    if ctx is not None:
        try:
            await ctx.info(message)
        except (ValueError, AttributeError):
            # Context not usable (e.g. HTTP mode): skip the notification.
            return
128
+
129
+
130
+ # ---------------------------------------------------------------------------
131
+ # Structured response helpers (LLM-friendly)
132
+ # ---------------------------------------------------------------------------
133
+
134
+ def ok_response(
135
+ *,
136
+ tool: str,
137
+ input: dict[str, Any],
138
+ output: Any,
139
+ request_id: str | None = None,
140
+ ) -> dict[str, Any]:
141
+ rid = request_id or uuid.uuid4().hex
142
+ return {"ok": True, "tool": tool, "request_id": rid, "input": input, "output": output}
143
+
144
+
145
+ def error_response(
146
+ *,
147
+ tool: str,
148
+ input: dict[str, Any],
149
+ error_type: str,
150
+ message: str,
151
+ details: Any | None = None,
152
+ code: str = "E0000",
153
+ request_id: str | None = None,
154
+ ) -> dict[str, Any]:
155
+ """Return a standardized error dict with machine-readable code."""
156
+ rid = request_id or uuid.uuid4().hex
157
+ return {
158
+ "ok": False,
159
+ "tool": tool,
160
+ "request_id": rid,
161
+ "input": input,
162
+ "error": {"type": error_type, "code": code, "message": message, "details": details},
163
+ }
164
+
165
+
166
+ # ---------------------------------------------------------------------------
167
+ # Decorator to convert SDK exceptions to structured output
168
+ # ---------------------------------------------------------------------------
169
+
170
# Ordered (needles, error_type, code) heuristics for backend API errors.
# The first rule with a needle found in the lower-cased text wins.
_API_ERROR_RULES = (
    (("captcha", "403"), "blocked", "E2101"),
    (("sign authentication failed", "authentication failed", "invalid signature"), "auth_error", "E1002"),
    (("not collected", "failed to parse"), "parse_failed", "E2102"),
    (("not exist", "404"), "not_found", "E2104"),
    (("504", "gateway timeout"), "upstream_timeout", "E2105"),
    (("500", "internal server error"), "upstream_internal_error", "E2106"),
    (("subtitles_error", "unable to download api page"), "media_backend_error", "E2107"),
)


def _classify_api_error(text: str) -> tuple[str, str]:
    """Map backend error text to an ``(error_type, code)`` pair."""
    lowered = text.lower()
    for needles, error_type, code in _API_ERROR_RULES:
        if any(needle in lowered for needle in needles):
            return error_type, code
    return "api_error", "E2001"


def _tool_input(call_kwargs: dict[str, Any]) -> dict[str, Any]:
    """Return the keyword arguments to echo back, without the MCP ``ctx``."""
    return {k: v for k, v in call_kwargs.items() if k != "ctx"}


def _url_from_kwargs(call_kwargs: dict[str, Any]) -> Any:
    """Return the target URL from ``url=`` or ``params={'url': ...}``, if any."""
    # A top-level "url" wins even when its value is None.
    if "url" in call_kwargs:
        return call_kwargs.get("url")
    params = call_kwargs.get("params")
    if isinstance(params, dict):
        return params.get("url")
    return None


def handle_mcp_errors(func: Callable) -> Callable:  # noqa: D401
    """Wrap an async tool so failures come back as structured dicts.

    Thordata config, API and network errors, and any other ``Exception``,
    are converted into ``error_response(...)`` results instead of being
    raised. The ``ctx`` keyword argument is never echoed in ``input``;
    positional arguments are not echoed either.
    """

    @functools.wraps(func)
    async def wrapper(*args, **kwargs):  # type: ignore[return-value]
        try:
            return await func(*args, **kwargs)
        except ThordataConfigError as e:
            logger.error("Config error in %s: %s", func.__name__, e)
            return error_response(
                tool=func.__name__,
                input=_tool_input(kwargs),
                error_type="config_error",
                code="E1001",
                message="Missing or invalid credentials.",
                details=str(e),
            )
        except ThordataAPIError as e:
            logger.error("API error in %s: %s", func.__name__, e)
            exc_msg = getattr(e, "message", str(e))
            payload = getattr(e, "payload", None)
            code = getattr(e, "code", None)

            msg = exc_msg
            if isinstance(payload, dict):
                msg = payload.get("msg", msg)
                # Fall back to other payload fields only when "msg" is empty.
                if isinstance(payload.get("error"), str) and not msg:
                    msg = payload["error"]
                if isinstance(payload.get("message"), str) and not msg:
                    msg = payload["message"]

            # Classify on the exception text AND the message actually shown
            # to the caller, so the type/code cannot contradict the message
            # taken from the payload. The newline separator keeps a needle
            # from matching across the boundary between the two texts.
            error_type, norm_code = _classify_api_error(f"{exc_msg}\n{msg}")

            diagnostic = diagnose_scraping_error(e, url=_url_from_kwargs(kwargs))
            return error_response(
                tool=func.__name__,
                input=_tool_input(kwargs),
                error_type=error_type,
                code=norm_code,
                message=msg,
                details={"code": code, "payload": payload, "diagnostic": diagnostic},
            )
        except ThordataNetworkError as e:
            # Logged like the other branches so network failures are visible.
            logger.error("Network error in %s: %s", func.__name__, e)
            err_str = str(e)
            if "Task" in err_str and "failed" in err_str:
                error_code, err_type = "E3001", "task_failed"
                msg = "Scraping task failed."
            else:
                error_code, err_type = "E2002", "network_error"
                msg = "Network error: could not reach Thordata services."

            diagnostic = diagnose_scraping_error(e, url=_url_from_kwargs(kwargs))
            return error_response(
                tool=func.__name__,
                input=_tool_input(kwargs),
                error_type=err_type,
                code=error_code,
                message=msg,
                details={"raw_error": err_str, "diagnostic": diagnostic},
            )
        except Exception as e:  # pragma: no cover
            # logger.error rather than logger.exception: avoids rich traceback issues.
            logger.error("Unexpected error in %s: %s", func.__name__, str(e), exc_info=False)
            return error_response(
                tool=func.__name__,
                input=_tool_input(kwargs),
                error_type="unexpected_error",
                code="E9000",
                message=str(e),
            )

    return wrapper
280
+
281
+
282
+ # ---------------------------------------------------------------------------
283
+ # Helpers for HTML → Markdown & truncation
284
+ # ---------------------------------------------------------------------------
285
+
286
+ def _strip_large_data_urls(html: str, *, max_keep_chars: int = 256) -> str:
287
+ """Remove large inlined data: URLs (base64 fonts/images) to reduce token bloat.
288
+
289
+ Keeps small data URLs (<= max_keep_chars) to avoid breaking tiny icons.
290
+ """
291
+ import re
292
+
293
+ def _repl(m: re.Match[str]) -> str:
294
+ s = m.group(0)
295
+ if len(s) <= max_keep_chars:
296
+ return s
297
+ return 'data:...'
298
+
299
+ # Replace any data:... sequences inside quotes.
300
+ return re.sub(r"data:[^\s\"']+", _repl, html)
301
+
302
+
303
+ def _extract_readable_html(html: str) -> str:
304
+ """Best-effort extraction of main readable content.
305
+
306
+ Prefer <main> or <article>. Fall back to full document if not found.
307
+ """
308
+ import re
309
+
310
+ # Very lightweight heuristics (no extra deps): keep the largest <main>/<article> block.
311
+ candidates: list[str] = []
312
+ for tag in ("main", "article"):
313
+ pattern = re.compile(rf"<{tag}[^>]*>([\\s\\S]*?)</{tag}>", re.IGNORECASE)
314
+ for m in pattern.finditer(html):
315
+ block = m.group(0)
316
+ if block:
317
+ candidates.append(block)
318
+ if not candidates:
319
+ return html
320
+ candidates.sort(key=len, reverse=True)
321
+ return candidates[0]
322
+
323
+
324
def html_to_markdown_clean(html: str) -> str:
    """Convert HTML to Markdown with blank lines removed.

    Large ``data:`` URLs are collapsed and the main readable block is
    extracted before ``markdownify`` runs (ATX headings; script, style,
    noscript, nav, footer, iframe and svg tags stripped). If any step
    raises, ``html2text`` (links kept) converts whatever stage of the HTML
    was reached and its output is returned as-is.
    """
    try:
        # Rebinding `html` step by step lets the fallback below reuse the
        # most reduced document produced before the failure.
        html = _strip_large_data_urls(html)
        html = _extract_readable_html(html)
        markdown = md(html, heading_style="ATX", strip=["script", "style", "noscript", "nav", "footer", "iframe", "svg"])
        # Trim trailing whitespace, then drop lines that end up empty.
        kept = filter(None, (raw.rstrip() for raw in markdown.splitlines()))
        return "\n".join(kept)
    except Exception:
        fallback = html2text.HTML2Text()
        fallback.ignore_links = False
        return fallback.handle(html)
335
+
336
+
337
def truncate_content(content: str, max_length: int = 20_000) -> str:
    """Cut ``content`` to ``max_length`` characters and append a notice.

    Content no longer than ``max_length`` is returned unchanged. The notice
    reports the original length and is added on top of ``max_length``.
    """
    total = len(content)
    if total > max_length:
        notice = f"\n\n... [Content Truncated, original length: {total} chars]"
        return content[:max_length] + notice
    return content
341
+
342
+
343
+ # ---------------------------------------------------------------------------
344
+ # Download URL helpers
345
+ # ---------------------------------------------------------------------------
346
+
347
+ def enrich_download_url(download_url: str, *, task_id: str | None = None, file_type: str | None = None) -> str:
348
+ """Ensure returned download URLs are directly usable in a browser.
349
+
350
+ Some SDK / backend paths may return a URL missing required query params such as
351
+ `api_key` and `plat`, leading to {"error":"Missing necessary parameters."}.
352
+ """
353
+ try:
354
+ from .config import settings
355
+ except Exception: # pragma: no cover
356
+ settings = None # type: ignore[assignment]
357
+
358
+ token = getattr(settings, "THORDATA_SCRAPER_TOKEN", None) if settings else None
359
+ plat = getattr(settings, "THORDATA_DOWNLOAD_PLAT", "1") if settings else "1"
360
+ base = getattr(settings, "THORDATA_DOWNLOAD_BASE_URL", "https://scraperapi.thordata.com/download") if settings else "https://scraperapi.thordata.com/download"
361
+
362
+ # If we can't enrich (no token), return as-is.
363
+ if not token:
364
+ return download_url
365
+
366
+ parsed = urlparse(download_url)
367
+ qs = dict(parse_qsl(parsed.query, keep_blank_values=True))
368
+
369
+ # Backfill known parameters
370
+ if "api_key" not in qs:
371
+ qs["api_key"] = token
372
+ if "plat" not in qs and plat:
373
+ qs["plat"] = plat
374
+ if "task_id" not in qs and task_id:
375
+ qs["task_id"] = task_id
376
+ if "type" not in qs and file_type:
377
+ qs["type"] = file_type
378
+
379
+ # If SDK returned a relative/alternate host, normalize to configured base
380
+ if not parsed.scheme or not parsed.netloc:
381
+ parsed = urlparse(base)
382
+ elif parsed.path.rstrip("/") != urlparse(base).path.rstrip("/"):
383
+ # Keep original host, only fix query; unless path looks non-download
384
+ pass
385
+
386
+ new_query = urlencode(qs, doseq=True)
387
+ new_parsed = parsed._replace(query=new_query)
388
+ # If original URL had a different host/path but is a valid absolute URL, preserve them.
389
+ if parsed.scheme and parsed.netloc and urlparse(download_url).scheme and urlparse(download_url).netloc:
390
+ orig = urlparse(download_url)
391
+ new_parsed = orig._replace(query=new_query)
392
+
393
+ return urlunparse(new_parsed)