perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perplexity_webui_scraper/__init__.py +2 -13
- perplexity_webui_scraper/config.py +27 -2
- perplexity_webui_scraper/core.py +166 -9
- perplexity_webui_scraper/enums.py +34 -4
- perplexity_webui_scraper/exceptions.py +74 -0
- perplexity_webui_scraper/http.py +368 -37
- perplexity_webui_scraper/logging.py +256 -0
- perplexity_webui_scraper/mcp/__init__.py +18 -0
- perplexity_webui_scraper/mcp/__main__.py +9 -0
- perplexity_webui_scraper/mcp/server.py +181 -0
- perplexity_webui_scraper/resilience.py +179 -0
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/METADATA +98 -8
- perplexity_webui_scraper-0.3.5.dist-info/RECORD +21 -0
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/entry_points.txt +1 -0
- perplexity_webui_scraper-0.3.4.dist-info/RECORD +0 -16
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,256 @@
|
|
|
1
|
+
"""Logging configuration using loguru.
|
|
2
|
+
|
|
3
|
+
Provides detailed, structured logging for all library operations.
|
|
4
|
+
Logging is disabled by default and can be enabled via ClientConfig.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import sys
|
|
11
|
+
from typing import TYPE_CHECKING, Any
|
|
12
|
+
|
|
13
|
+
from loguru import logger
|
|
14
|
+
|
|
15
|
+
from .enums import LogLevel
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from os import PathLike
|
|
20
|
+
|
|
21
|
+
# Remove default handler to start with a clean slate
# (loguru installs a stderr sink at import time; the library must be silent
# until configure_logging() is explicitly called)
logger.remove()

# Flag to track if logging is configured
# (set True by configure_logging() when a sink is installed, False when disabled)
_logging_configured: bool = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def configure_logging(
    level: LogLevel | str = LogLevel.DISABLED,
    log_file: str | PathLike[str] | None = None,
) -> None:
    """Configure logging for the library.

    Args:
        level: Logging level (LogLevel enum or string). Default is DISABLED.
        log_file: Optional file path to write logs. If set, logs go to file only.
            If None, logs go to console. Logs are appended, never deleted.

    Note:
        - If log_file is set: logs go to file only (no console output)
        - If log_file is None: logs go to console only
        - Log format includes timestamp, level, module, function, and message
    """

    global _logging_configured  # noqa: PLW0603

    # Drop any previously installed sinks so reconfiguration starts fresh
    logger.remove()

    # Accept both the enum and a plain (case-insensitive) string
    if isinstance(level, LogLevel):
        normalized_level = level.value
    else:
        normalized_level = str(level).upper()

    if normalized_level == "DISABLED":
        # Silence the library entirely; no sink is installed
        logger.disable("perplexity_webui_scraper")
        _logging_configured = False
        return

    logger.enable("perplexity_webui_scraper")

    if log_file is not None:
        # File sink only (no console output), detailed format with {extra} context
        detailed_format = "{time:YYYY-MM-DD HH:mm:ss.SSS} | {level: <8} | {name}:{function}:{line} | {message} | {extra}"
        logger.add(
            Path(log_file),
            format=detailed_format,
            level=normalized_level,
            rotation=None,  # Never rotate
            retention=None,  # Never delete
            compression=None,  # No compression
            mode="a",  # Append mode
            encoding="utf-8",
            filter="perplexity_webui_scraper",
            enqueue=True,  # Thread-safe
        )
    else:
        # Console sink only - concise, colorized format
        pretty_format = (
            "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> | "
            "<level>{level: <8}</level> | "
            "<cyan>{name}</cyan>:<cyan>{function}</cyan>:<cyan>{line}</cyan> | "
            "<level>{message}</level>"
        )
        logger.add(
            sys.stderr,
            format=pretty_format,
            level=normalized_level,
            colorize=True,
            filter="perplexity_webui_scraper",
        )

    _logging_configured = True
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def get_logger(name: str) -> Any:
    """Return a loguru logger bound to *name* (typically ``__name__``).

    Args:
        name: Module name used as the ``module`` field in log records.

    Returns:
        A loguru logger instance bound to the module.
    """
    return logger.bind(module=name)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
# Convenience shortcuts for common log operations
|
|
115
|
+
def log_request(
    method: str,
    url: str,
    *,
    params: dict[str, Any] | None = None,
    headers: dict[str, str] | None = None,
    body_size: int | None = None,
) -> None:
    """Log an outgoing HTTP request with full details.

    Only the number of headers is logged, never the header values.
    """

    header_count = len(headers) if headers else 0
    logger.debug(
        "HTTP request initiated | method={method} url={url} params={params} "
        "headers_count={headers_count} body_size={body_size}",
        method=method,
        url=url,
        params=params,
        headers_count=header_count,
        body_size=body_size,
    )
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def log_response(
    method: str,
    url: str,
    status_code: int,
    *,
    elapsed_ms: float | None = None,
    content_length: int | None = None,
    headers: dict[str, str] | None = None,
) -> None:
    """Log an HTTP response with full details.

    Responses with status >= 400 are logged at WARNING, others at DEBUG.

    NOTE(review): the ``headers`` parameter is accepted but currently unused —
    confirm whether it should be folded into the log record or dropped.
    """

    severity = "WARNING" if status_code >= 400 else "DEBUG"
    logger.log(
        severity,
        "HTTP response received | method={method} url={url} status={status_code} "
        "elapsed_ms={elapsed_ms:.2f} content_length={content_length}",
        method=method,
        url=url,
        status_code=status_code,
        elapsed_ms=elapsed_ms or 0,  # None -> 0 so the :.2f format never fails
        content_length=content_length,
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def log_retry(
    attempt: int,
    max_attempts: int,
    exception: Exception,
    wait_seconds: float,
) -> None:
    """Log a retry attempt, including the triggering exception and backoff delay."""

    exc_name = type(exception).__name__
    logger.warning(
        "Retry attempt | attempt={attempt}/{max_attempts} exception={exception_type}: {exception_msg} "
        "wait_seconds={wait_seconds:.2f}",
        attempt=attempt,
        max_attempts=max_attempts,
        exception_type=exc_name,
        exception_msg=str(exception),
        wait_seconds=wait_seconds,
    )
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def log_cloudflare_detected(status_code: int, markers_found: list[str]) -> None:
    """Record that a response looked like a Cloudflare challenge page."""

    logger.warning(
        "Cloudflare challenge detected | status_code={status_code} markers={markers}",
        status_code=status_code, markers=markers_found,
    )
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def log_fingerprint_rotation(old_profile: str, new_profile: str) -> None:
    """Record that the client switched to a different browser fingerprint profile."""

    logger.info(
        "Browser fingerprint rotated | old_profile={old} new_profile={new}",
        old=old_profile, new=new_profile,
    )
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def log_rate_limit(wait_seconds: float) -> None:
    """Record how long the rate limiter is delaying the next request."""

    logger.debug("Rate limiter throttling | wait_seconds={wait_seconds:.3f}", wait_seconds=wait_seconds)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def log_session_created(impersonate: str, timeout: int) -> None:
    """Record creation of an HTTP session and its key settings (profile, timeout in seconds)."""

    logger.info(
        "HTTP session created | browser_profile={profile} timeout={timeout}s",
        profile=impersonate, timeout=timeout,
    )
|
|
216
|
+
|
|
217
|
+
|
|
218
|
+
def log_conversation_created(config_summary: str) -> None:
    """Record that a new conversation was started with the given config summary."""

    logger.info("Conversation created | config={config}", config=config_summary)
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
def log_query_sent(query: str, model: str, has_files: bool) -> None:
    """Record a query being dispatched, with a truncated preview of its text."""

    # Cap the logged query text at 100 characters to keep records compact
    preview = f"{query[:100]}..." if len(query) > 100 else query
    logger.info(
        "Query sent | model={model} has_files={has_files} query_preview={query_preview}",
        model=model, has_files=has_files, query_preview=preview,
    )
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def log_stream_chunk(chunk_size: int, is_final: bool) -> None:
    """Record receipt of one streaming response chunk."""

    logger.debug(
        "Stream chunk received | size={size} is_final={is_final}",
        size=chunk_size, is_final=is_final,
    )
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def log_error(error: Exception, context: str = "") -> None:
    """Log an error with full traceback.

    Note: ``logger.exception`` attaches the active traceback, so this is
    intended to be called from inside an ``except`` block.
    """

    logger.exception(
        "Error occurred | context={context} error_type={error_type} message={message}",
        context=context, error_type=type(error).__name__, message=str(error),
    )
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""
|
|
2
|
+
MCP (Model Context Protocol) server for Perplexity WebUI Scraper.
|
|
3
|
+
|
|
4
|
+
This module provides an MCP server that exposes Perplexity AI search capabilities to AI assistants.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
# Public API of the package: the lazy entry point that starts the MCP server
__all__: list[str] = ["run_server"]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def run_server() -> None:
    """Run the MCP server.

    The server module is imported lazily so its dependencies (fastmcp) are
    only required when the server is actually started.
    """

    from .server import main  # noqa: PLC0415

    main()
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""MCP server implementation using FastMCP."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from os import environ
|
|
6
|
+
from typing import Literal
|
|
7
|
+
|
|
8
|
+
from fastmcp import FastMCP
|
|
9
|
+
|
|
10
|
+
from perplexity_webui_scraper.config import ClientConfig, ConversationConfig
|
|
11
|
+
from perplexity_webui_scraper.core import Perplexity
|
|
12
|
+
from perplexity_webui_scraper.enums import CitationMode, SearchFocus, SourceFocus
|
|
13
|
+
from perplexity_webui_scraper.models import Models
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Create FastMCP server
# The instructions string is surfaced to MCP clients as the server description.
mcp = FastMCP(
    "perplexity-webui-scraper-mcp",
    instructions=(
        "Search the web with Perplexity AI using the full range of premium models. "
        "Unlike the official Perplexity API, this tool provides access to GPT-5.2, Claude 4.5, "
        "Gemini 3, Grok 4.1, and other cutting-edge models with reasoning capabilities. "
        "Use for real-time web research, academic searches, financial data, and current events. "
        "Supports multiple source types: web, academic papers, social media, and SEC filings."
    ),
)

# Model name mapping to Model objects
# Keys must stay in sync with the ModelName Literal below.
MODEL_MAP = {
    "best": Models.BEST,
    "research": Models.RESEARCH,
    "labs": Models.LABS,
    "sonar": Models.SONAR,
    "gpt52": Models.GPT_52,
    "gpt52_thinking": Models.GPT_52_THINKING,
    "claude_opus": Models.CLAUDE_45_OPUS,
    "claude_opus_thinking": Models.CLAUDE_45_OPUS_THINKING,
    "claude_sonnet": Models.CLAUDE_45_SONNET,
    "claude_sonnet_thinking": Models.CLAUDE_45_SONNET_THINKING,
    "gemini_pro": Models.GEMINI_3_PRO,
    "gemini_flash": Models.GEMINI_3_FLASH,
    "gemini_flash_thinking": Models.GEMINI_3_FLASH_THINKING,
    "grok": Models.GROK_41,
    "grok_thinking": Models.GROK_41_THINKING,
    "kimi_thinking": Models.KIMI_K2_THINKING,
}

# Available model names for type hints (mirrors MODEL_MAP keys)
ModelName = Literal[
    "best",
    "research",
    "labs",
    "sonar",
    "gpt52",
    "gpt52_thinking",
    "claude_opus",
    "claude_opus_thinking",
    "claude_sonnet",
    "claude_sonnet_thinking",
    "gemini_pro",
    "gemini_flash",
    "gemini_flash_thinking",
    "grok",
    "grok_thinking",
    "kimi_thinking",
]

# Source focus mapping: tool-facing name -> SourceFocus list passed to the client.
# Note that "all" excludes FINANCE; finance must be requested explicitly.
SOURCE_FOCUS_MAP = {
    "web": [SourceFocus.WEB],
    "academic": [SourceFocus.ACADEMIC],
    "social": [SourceFocus.SOCIAL],
    "finance": [SourceFocus.FINANCE],
    "all": [SourceFocus.WEB, SourceFocus.ACADEMIC, SourceFocus.SOCIAL],
}

SourceFocusName = Literal["web", "academic", "social", "finance", "all"]

# Client singleton (created lazily by _get_client on first tool call)
_client: Perplexity | None = None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _get_client() -> Perplexity:
    """Return the shared Perplexity client, creating it on first use.

    Raises:
        ValueError: If the PERPLEXITY_SESSION_TOKEN environment variable is unset.
    """

    global _client  # noqa: PLW0603

    if _client is None:
        token = environ.get("PERPLEXITY_SESSION_TOKEN", "")
        if not token:
            message = (
                "PERPLEXITY_SESSION_TOKEN environment variable is required. "
                "Set it with: export PERPLEXITY_SESSION_TOKEN='your_token_here'"
            )
            raise ValueError(message)
        _client = Perplexity(token, config=ClientConfig())

    return _client
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
@mcp.tool
def perplexity_ask(
    query: str,
    model: ModelName = "best",
    source_focus: SourceFocusName = "web",
) -> str:
    """
    Ask a question and get AI-generated answers with real-time data from the internet.

    Returns up-to-date information from web sources. Use for factual queries, research,
    current events, news, library versions, documentation, or any question requiring
    current, sourced information from the web.

    Args:
        query: The search query or question to ask Perplexity AI.
        model: AI model to use. Options:
            - "best": Automatically selects optimal model (default)
            - "research": Fast and thorough for routine research
            - "labs": Multi-step tasks with advanced troubleshooting
            - "sonar": Perplexity's fast built-in model
            - "gpt52": OpenAI's GPT-5.2
            - "gpt52_thinking": GPT-5.2 with reasoning
            - "claude_opus": Anthropic's Claude Opus 4.5
            - "claude_opus_thinking": Claude Opus with reasoning
            - "claude_sonnet": Anthropic's Claude Sonnet 4.5
            - "claude_sonnet_thinking": Claude Sonnet with reasoning
            - "gemini_pro": Google's Gemini 3 Pro
            - "gemini_flash": Google's Gemini 3 Flash
            - "gemini_flash_thinking": Gemini Flash with reasoning
            - "grok": xAI's Grok 4.1
            - "grok_thinking": Grok with reasoning
            - "kimi_thinking": Moonshot's Kimi K2 with reasoning
        source_focus: Type of sources to prioritize:
            - "web": General web search (default)
            - "academic": Scholarly articles and papers
            - "social": Social media (Reddit, Twitter)
            - "finance": SEC EDGAR financial filings
            - "all": Combine web, academic, and social sources

    Returns:
        AI-generated answer with inline citations [1][2] and a Citations section.
        On failure, an "Error searching Perplexity: ..." string instead of raising.
    """

    client = _get_client()
    # Unknown names fall back to safe defaults rather than erroring
    selected_model = MODEL_MAP.get(model, Models.BEST)
    sources = SOURCE_FOCUS_MAP.get(source_focus, [SourceFocus.WEB])

    try:
        conversation = client.create_conversation(
            ConversationConfig(
                model=selected_model,
                citation_mode=CitationMode.DEFAULT,
                # NOTE(review): search_focus is always WEB even when source_focus
                # is academic/finance — confirm this is intended
                search_focus=SearchFocus.WEB,
                source_focus=sources,
            )
        )

        conversation.ask(query)
        answer = conversation.answer or "No answer received"

        # Build response with Perplexity-style citations
        response_parts = [answer]

        if conversation.search_results:
            response_parts.append("\n\nCitations:")

            for i, result in enumerate(conversation.search_results, 1):
                url = result.url or ""
                response_parts.append(f"\n[{i}]: {url}")

        return "".join(response_parts)
    except Exception as error:
        # Broad catch is deliberate: MCP tools report failures as text
        # instead of propagating exceptions to the client.
        return f"Error searching Perplexity: {error!s}"
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def main() -> None:
    """Run the MCP server (blocks until the server stops)."""

    mcp.run()


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Resilience utilities for HTTP requests.
|
|
3
|
+
|
|
4
|
+
Provides retry mechanisms, rate limiting, and Cloudflare bypass utilities
|
|
5
|
+
using the tenacity library for robust retry handling.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from collections.abc import Callable
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
import random
|
|
13
|
+
from threading import Lock
|
|
14
|
+
import time
|
|
15
|
+
from typing import TYPE_CHECKING, Any, TypeVar
|
|
16
|
+
|
|
17
|
+
from tenacity import RetryCallState, retry, retry_if_exception_type, stop_after_attempt, wait_exponential_jitter
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from collections.abc import Callable
|
|
22
|
+
|
|
23
|
+
T = TypeVar("T")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Browser profiles supported by curl_cffi for fingerprint rotation.
# A profile from this tuple is picked at random by get_random_browser_profile().
BROWSER_PROFILES: tuple[str, ...] = (
    "chrome",
    "chrome110",
    "chrome116",
    "chrome119",
    "chrome120",
    "chrome123",
    "chrome124",
    "chrome131",
    "edge99",
    "edge101",
    "safari15_3",
    "safari15_5",
    "safari17_0",
    "safari17_2_ios",
)

# Cloudflare challenge detection markers.
# Matched case-insensitively against response bodies by is_cloudflare_challenge().
CLOUDFLARE_MARKERS: tuple[str, ...] = (
    "cf-ray",
    "cf-mitigated",
    "__cf_chl_",
    "Checking your browser",
    "Just a moment...",
    "cloudflare",
    "Enable JavaScript and cookies to continue",
    "challenge-platform",
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@dataclass(slots=True)
class RetryConfig:
    """Configuration for retry behavior.

    Consumed by create_retry_decorator(), which makes max_retries + 1 total
    attempts with exponential backoff between them.

    Attributes:
        max_retries: Maximum number of retry attempts.
        base_delay: Initial delay in seconds before first retry.
        max_delay: Maximum delay between retries.
        jitter: Random jitter factor to add to delays (0-1).
    """

    # Number of retries after the initial attempt
    max_retries: int = 3
    # Seconds before the first retry; doubles (capped) thereafter
    base_delay: float = 1.0
    # Upper bound on any single backoff delay, in seconds
    max_delay: float = 60.0
    # Fraction of max_delay added as random jitter (see create_retry_decorator)
    jitter: float = 0.5
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass
class RateLimiter:
    """Rate limiter that enforces a minimum gap between consecutive requests.

    Attributes:
        requests_per_second: Maximum requests allowed per second.
    """

    requests_per_second: float = 0.5
    # Monotonic timestamp of the most recent request (0.0 = none yet)
    _last_request: float = field(default=0.0, init=False)
    # Serializes acquire() across threads
    _lock: Lock = field(default_factory=Lock, init=False)

    def acquire(self) -> None:
        """Block until a request can be made within the rate limit."""

        with self._lock:
            min_gap = 1.0 / self.requests_per_second

            # First call ever proceeds immediately; later calls honor the gap
            if self._last_request > 0:
                remaining = min_gap - (time.monotonic() - self._last_request)
                if remaining > 0:
                    time.sleep(remaining)

            self._last_request = time.monotonic()
|
|
102
|
+
|
|
103
|
+
def get_random_browser_profile() -> str:
    """Get a random browser profile for fingerprint rotation.

    Uses random.choice, so results are not cryptographically random — fine
    for fingerprint variety, not for security tokens.

    Returns:
        A browser profile identifier compatible with curl_cffi.
    """

    return random.choice(BROWSER_PROFILES)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def is_cloudflare_challenge(response_text: str, headers: dict[str, Any] | None = None) -> bool:
    """Detect whether a response looks like a Cloudflare challenge page.

    Matches known challenge markers case-insensitively against the body, then
    looks for Cloudflare-branded header names.

    NOTE(review): the "cloudflare" marker also matches any page that merely
    mentions Cloudflare, and "cf-" matches any header containing that
    substring — confirm false positives are acceptable here.

    Args:
        response_text: The response body text.
        headers: Optional response headers.

    Returns:
        True if Cloudflare challenge markers are detected.
    """

    body = response_text.lower()
    if any(marker.lower() in body for marker in CLOUDFLARE_MARKERS):
        return True

    if headers:
        return any(
            "cf-" in name or "cloudflare" in name
            for name in (key.lower() for key in headers)
        )

    return False
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def is_cloudflare_status(status_code: int) -> bool:
    """Check whether a status code is one commonly returned by Cloudflare blocks.

    Covers 403/503 plus the Cloudflare-specific 52x range.

    Args:
        status_code: HTTP status code.

    Returns:
        True if status code is commonly used by Cloudflare challenges.
    """

    return status_code in {403, 503, 520, 521, 522, 523, 524, 525, 526}
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def create_retry_decorator(
    config: RetryConfig,
    retryable_exceptions: tuple[type[Exception], ...],
    on_retry: Callable[[RetryCallState], None] | None = None,
) -> Callable[[Callable[..., T]], Callable[..., T]]:
    """Create a tenacity retry decorator with the given configuration.

    Args:
        config: Retry configuration.
        retryable_exceptions: Tuple of exception types to retry on.
        on_retry: Optional callback to execute on each retry.

    Returns:
        A retry decorator configured with the given settings.
    """

    # max_retries counts retries, tenacity counts total attempts: +1 for the
    # initial call
    stop_policy = stop_after_attempt(config.max_retries + 1)
    backoff_policy = wait_exponential_jitter(
        initial=config.base_delay,
        max=config.max_delay,
        jitter=config.max_delay * config.jitter,
    )

    return retry(
        stop=stop_policy,
        wait=backoff_policy,
        retry=retry_if_exception_type(retryable_exceptions),
        before_sleep=on_retry,
        reraise=True,  # surface the last exception unchanged to the caller
    )
|