perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perplexity_webui_scraper/__init__.py +5 -14
- perplexity_webui_scraper/cli/get_perplexity_session_token.py +24 -8
- perplexity_webui_scraper/config.py +33 -4
- perplexity_webui_scraper/constants.py +30 -10
- perplexity_webui_scraper/core.py +223 -21
- perplexity_webui_scraper/enums.py +91 -19
- perplexity_webui_scraper/exceptions.py +77 -1
- perplexity_webui_scraper/http.py +374 -38
- perplexity_webui_scraper/limits.py +12 -4
- perplexity_webui_scraper/logging.py +278 -0
- perplexity_webui_scraper/mcp/__init__.py +20 -0
- perplexity_webui_scraper/mcp/__main__.py +11 -0
- perplexity_webui_scraper/mcp/server.py +166 -0
- perplexity_webui_scraper/models.py +55 -19
- perplexity_webui_scraper/resilience.py +181 -0
- perplexity_webui_scraper/types.py +15 -5
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/METADATA +97 -7
- perplexity_webui_scraper-0.3.6.dist-info/RECORD +21 -0
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/WHEEL +1 -1
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/entry_points.txt +1 -0
- perplexity_webui_scraper-0.3.4.dist-info/RECORD +0 -16
perplexity_webui_scraper/http.py
CHANGED
|
@@ -1,59 +1,241 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
HTTP client wrapper for Perplexity API requests.
|
|
3
|
+
"""
|
|
2
4
|
|
|
3
5
|
from __future__ import annotations
|
|
4
6
|
|
|
7
|
+
from contextlib import suppress
|
|
8
|
+
from time import monotonic
|
|
5
9
|
from typing import TYPE_CHECKING, Any
|
|
6
10
|
|
|
7
11
|
from curl_cffi.requests import Response as CurlResponse
|
|
12
|
+
from curl_cffi.requests import Session
|
|
13
|
+
|
|
14
|
+
from .constants import API_BASE_URL, DEFAULT_HEADERS, ENDPOINT_ASK, ENDPOINT_SEARCH_INIT, SESSION_COOKIE_NAME
|
|
15
|
+
from .exceptions import AuthenticationError, CloudflareBlockError, PerplexityError, RateLimitError
|
|
16
|
+
from .limits import DEFAULT_TIMEOUT
|
|
17
|
+
from .logging import (
|
|
18
|
+
get_logger,
|
|
19
|
+
log_cloudflare_detected,
|
|
20
|
+
log_error,
|
|
21
|
+
log_fingerprint_rotation,
|
|
22
|
+
log_rate_limit,
|
|
23
|
+
log_request,
|
|
24
|
+
log_response,
|
|
25
|
+
log_retry,
|
|
26
|
+
log_session_created,
|
|
27
|
+
)
|
|
28
|
+
from .resilience import (
|
|
29
|
+
CLOUDFLARE_MARKERS,
|
|
30
|
+
RateLimiter,
|
|
31
|
+
RetryConfig,
|
|
32
|
+
create_retry_decorator,
|
|
33
|
+
get_random_browser_profile,
|
|
34
|
+
is_cloudflare_challenge,
|
|
35
|
+
is_cloudflare_status,
|
|
36
|
+
)
|
|
8
37
|
|
|
9
38
|
|
|
10
39
|
if TYPE_CHECKING:
|
|
11
40
|
from collections.abc import Generator
|
|
12
41
|
|
|
13
|
-
from
|
|
42
|
+
from tenacity import RetryCallState
|
|
14
43
|
|
|
15
|
-
|
|
16
|
-
API_BASE_URL,
|
|
17
|
-
DEFAULT_HEADERS,
|
|
18
|
-
ENDPOINT_ASK,
|
|
19
|
-
ENDPOINT_SEARCH_INIT,
|
|
20
|
-
SESSION_COOKIE_NAME,
|
|
21
|
-
)
|
|
22
|
-
from .exceptions import AuthenticationError, PerplexityError, RateLimitError
|
|
23
|
-
from .limits import DEFAULT_TIMEOUT
|
|
44
|
+
logger = get_logger(__name__)
|
|
24
45
|
|
|
25
46
|
|
|
26
47
|
class HTTPClient:
|
|
27
|
-
"""
|
|
48
|
+
"""
|
|
49
|
+
HTTP client wrapper with error handling for Perplexity API.
|
|
28
50
|
|
|
29
51
|
Provides a unified interface for making HTTP requests with automatic
|
|
30
|
-
error handling and
|
|
52
|
+
error handling, retry mechanisms, rate limiting, and Cloudflare bypass.
|
|
31
53
|
"""
|
|
32
54
|
|
|
33
|
-
__slots__ = (
|
|
55
|
+
__slots__ = (
|
|
56
|
+
"_impersonate",
|
|
57
|
+
"_rate_limiter",
|
|
58
|
+
"_retry_config",
|
|
59
|
+
"_rotate_fingerprint",
|
|
60
|
+
"_session",
|
|
61
|
+
"_session_token",
|
|
62
|
+
"_timeout",
|
|
63
|
+
)
|
|
34
64
|
|
|
35
65
|
def __init__(
    self,
    session_token: str,
    timeout: int = DEFAULT_TIMEOUT,
    impersonate: str = "chrome",
    max_retries: int = 3,
    retry_base_delay: float = 1.0,
    retry_max_delay: float = 60.0,
    retry_jitter: float = 0.5,
    requests_per_second: float = 0.5,
    rotate_fingerprint: bool = True,
) -> None:
    """Set up retry, rate-limit and session state for the client.

    Args:
        session_token: Perplexity session cookie value.
        timeout: Request timeout in seconds.
        impersonate: Browser profile handed to the underlying session.
        max_retries: Maximum retry attempts for failed requests.
        retry_base_delay: Initial delay before the first retry.
        retry_max_delay: Upper bound on the delay between retries.
        retry_jitter: Random jitter factor applied to delays.
        requests_per_second: Rate limit; a value that is not greater
            than 0 disables rate limiting.
        rotate_fingerprint: Whether to rotate the browser fingerprint
            on retries.
    """

    startup_details = " ".join(
        (
            f"session_token_length={len(session_token)}",
            f"timeout={timeout}s",
            f"impersonate={impersonate}",
            f"max_retries={max_retries}",
            f"retry_base_delay={retry_base_delay}s",
            f"retry_max_delay={retry_max_delay}s",
            f"retry_jitter={retry_jitter}",
            f"requests_per_second={requests_per_second}",
            f"rotate_fingerprint={rotate_fingerprint}",
        )
    )
    logger.debug(f"Initializing HTTPClient | {startup_details}")

    # _create_session reads the token and timeout from self, so these
    # attributes must be in place before the session is built below.
    self._session_token = session_token
    self._timeout = timeout
    self._impersonate = impersonate
    self._rotate_fingerprint = rotate_fingerprint

    retry_config = RetryConfig(
        max_retries=max_retries,
        base_delay=retry_base_delay,
        max_delay=retry_max_delay,
        jitter=retry_jitter,
    )
    self._retry_config = retry_config

    logger.debug(
        f"RetryConfig created | max_retries={retry_config.max_retries} "
        f"base_delay={retry_config.base_delay}s "
        f"max_delay={retry_config.max_delay}s "
        f"jitter={retry_config.jitter}"
    )

    # Only a strictly positive rate enables the limiter.
    limiter: RateLimiter | None = None
    limiter_status = "RateLimiter disabled | requests_per_second=0"

    if requests_per_second > 0:
        limiter = RateLimiter(requests_per_second=requests_per_second)
        limiter_status = f"RateLimiter enabled | requests_per_second={requests_per_second}"

    self._rate_limiter: RateLimiter | None = limiter
    logger.debug(limiter_status)

    self._session = self._create_session(impersonate)
    log_session_created(impersonate, timeout)
|
|
134
|
+
|
|
135
|
+
def _create_session(self, impersonate: str) -> Session:
    """Build a session carrying the default headers and the session cookie.

    Args:
        impersonate: Browser profile passed through to ``Session``.

    Returns:
        A new ``Session`` configured with this client's token and timeout.
    """

    logger.debug(f"Creating new HTTP session | browser_profile={impersonate}")

    # Start from the shared defaults, then pin Referer/Origin to the API host.
    request_headers: dict[str, str] = dict(DEFAULT_HEADERS)
    request_headers["Referer"] = f"{API_BASE_URL}/"
    request_headers["Origin"] = API_BASE_URL

    session_cookies: dict[str, str] = {SESSION_COOKIE_NAME: self._session_token}

    logger.debug(
        f"Session configuration | headers_count={len(request_headers)} "
        f"cookies_count={len(session_cookies)} base_url={API_BASE_URL}"
    )

    new_session = Session(
        headers=request_headers,
        cookies=session_cookies,
        timeout=self._timeout,
        impersonate=impersonate,
    )

    logger.debug(f"HTTP session created successfully | browser_profile={impersonate}")

    return new_session
|
|
161
|
+
|
|
162
|
+
def _rotate_session(self) -> None:
    """Swap the current session for one with a freshly chosen browser profile.

    Does nothing when fingerprint rotation is disabled. The replacement
    session is created before the current one is closed, so a failure while
    building it leaves the client on its existing, still-open session with
    ``_impersonate`` unchanged.
    """

    if not self._rotate_fingerprint:
        return

    old_profile = self._impersonate
    new_profile = get_random_browser_profile()

    logger.debug(f"Rotating browser fingerprint | old={old_profile} new={new_profile}")
    log_fingerprint_rotation(old_profile, new_profile)

    # Build the replacement first: if this raises, self._session and
    # self._impersonate still describe a usable session.
    new_session = self._create_session(new_profile)

    old_session = self._session
    self._session = new_session
    self._impersonate = new_profile

    # Closing the old session is best-effort; a failure here must not
    # abort the rotation, but it is logged rather than silently dropped.
    try:
        old_session.close()
    except Exception as close_error:
        logger.debug(f"Failed to close previous session | error={close_error}")
    else:
        logger.debug("Previous session closed")

    logger.debug(f"Browser fingerprint rotated successfully | new_profile={new_profile}")
|
|
180
|
+
|
|
181
|
+
def _on_retry(self, retry_state: RetryCallState) -> None:
    """Log a retry attempt and, when enabled, rotate the browser fingerprint.

    Args:
        retry_state: Retry state object; its ``attempt_number``,
            ``outcome`` and ``next_action`` are read here.
    """

    max_retries = self._retry_config.max_retries
    attempt = retry_state.attempt_number

    outcome = retry_state.outcome
    exception = outcome.exception() if outcome else None

    next_action = retry_state.next_action
    wait_time = next_action.sleep if next_action else 0

    # Both fields fall back to the literal text "None" when no exception is available.
    if exception:
        exception_type = type(exception).__name__
        exception_message = str(exception)
    else:
        exception_type = exception_message = "None"

    logger.warning(
        f"Retry triggered | attempt={attempt}/{max_retries} "
        f"exception_type={exception_type} "
        f"exception_message={exception_message} "
        f"wait_seconds={wait_time:.2f}"
    )
    log_retry(attempt, max_retries, exception, wait_time)

    if not self._rotate_fingerprint:
        return

    # Rotate fingerprint on retry to avoid detection
    logger.debug("Rotating fingerprint due to retry")
    self._rotate_session()
|
|
203
|
+
|
|
204
|
+
def _check_cloudflare(self, response: CurlResponse) -> None:
    """Raise ``CloudflareBlockError`` when *response* is a Cloudflare challenge.

    The body is only inspected when ``is_cloudflare_status`` flags the
    status code. Any other exception raised while reading or analysing the
    response is logged at debug level and swallowed (best-effort check).

    Args:
        response: The response returned by the underlying session.

    Raises:
        CloudflareBlockError: If ``is_cloudflare_challenge`` reports a
            challenge for the response body and headers.
    """

    status_code = response.status_code
    logger.debug(f"Checking for Cloudflare challenge | status_code={status_code}")

    if not is_cloudflare_status(status_code):
        return

    logger.debug(f"Status code indicates potential Cloudflare block | status_code={status_code}")

    try:
        body = response.text
        headers = dict(response.headers) if hasattr(response, "headers") else None

        logger.debug(
            f"Analyzing response for Cloudflare markers | "
            f"body_length={len(body)} "
            f"headers_count={len(headers) if headers else 0}"
        )

        if not is_cloudflare_challenge(body, headers):
            logger.debug("No Cloudflare markers found in response")
            return

        # Lower-case the body once instead of once per marker.
        body_lower = body.lower()
        markers_found = [marker for marker in CLOUDFLARE_MARKERS if marker.lower() in body_lower]

        logger.warning(
            f"Cloudflare challenge detected | "
            f"status_code={status_code} "
            f"markers_found={markers_found}"
        )
        log_cloudflare_detected(status_code, markers_found)

        raise CloudflareBlockError()
    except CloudflareBlockError:
        # Let the deliberate block error through; bare raise keeps its traceback intact.
        raise
    except Exception as error:
        logger.debug(f"Error checking Cloudflare response | error={error}")
|
|
238
|
+
|
|
57
239
|
def _handle_error(self, error: Exception, context: str = "") -> None:
|
|
58
240
|
"""Handle HTTP errors and raise appropriate custom exceptions.
|
|
59
241
|
|
|
@@ -62,27 +244,70 @@ class HTTPClient:
|
|
|
62
244
|
context: Additional context for the error message.
|
|
63
245
|
|
|
64
246
|
Raises:
|
|
65
|
-
AuthenticationError: If status code is 403.
|
|
247
|
+
AuthenticationError: If status code is 403 (not Cloudflare).
|
|
66
248
|
RateLimitError: If status code is 429.
|
|
249
|
+
CloudflareBlockError: If Cloudflare challenge detected.
|
|
67
250
|
PerplexityError: For other HTTP errors.
|
|
68
251
|
"""
|
|
69
252
|
|
|
70
|
-
|
|
253
|
+
logger.debug(f"Handling error | context={context} error_type={type(error).__name__} error={error}")
|
|
254
|
+
log_error(error, context)
|
|
71
255
|
|
|
72
|
-
|
|
73
|
-
|
|
256
|
+
status_code = None
|
|
257
|
+
response = getattr(error, "response", None)
|
|
258
|
+
|
|
259
|
+
if response is not None:
|
|
260
|
+
status_code = getattr(response, "status_code", None)
|
|
261
|
+
logger.debug(f"Error has response | status_code={status_code}")
|
|
262
|
+
|
|
263
|
+
# Check for Cloudflare before handling as regular 403
|
|
264
|
+
if status_code is not None and is_cloudflare_status(status_code):
|
|
265
|
+
logger.debug(f"Checking if error is Cloudflare challenge | status_code={status_code}")
|
|
266
|
+
|
|
267
|
+
try:
|
|
268
|
+
body = response.text if hasattr(response, "text") else ""
|
|
269
|
+
headers = dict(response.headers) if hasattr(response, "headers") else None
|
|
270
|
+
|
|
271
|
+
if is_cloudflare_challenge(body, headers):
|
|
272
|
+
markers_found = [m for m in CLOUDFLARE_MARKERS if m.lower() in body.lower()]
|
|
273
|
+
logger.warning(
|
|
274
|
+
f"Cloudflare challenge confirmed in error response | "
|
|
275
|
+
f"status_code={status_code} "
|
|
276
|
+
f"markers={markers_found}"
|
|
277
|
+
)
|
|
278
|
+
log_cloudflare_detected(status_code, markers_found)
|
|
279
|
+
raise CloudflareBlockError() from error
|
|
280
|
+
except CloudflareBlockError:
|
|
281
|
+
raise
|
|
74
282
|
|
|
75
283
|
if status_code == 403:
|
|
284
|
+
logger.error(f"Authentication error | status_code=403 context={context}")
|
|
76
285
|
raise AuthenticationError() from error
|
|
77
286
|
elif status_code == 429:
|
|
287
|
+
logger.warning(f"Rate limit exceeded | status_code=429 context={context}")
|
|
78
288
|
raise RateLimitError() from error
|
|
79
289
|
elif status_code is not None:
|
|
290
|
+
logger.error(f"HTTP error | status_code={status_code} context={context} error={error}")
|
|
80
291
|
raise PerplexityError(f"{context}HTTP {status_code}: {error!s}", status_code=status_code) from error
|
|
81
292
|
else:
|
|
293
|
+
logger.error(f"Unknown error | context={context} error={error}")
|
|
82
294
|
raise PerplexityError(f"{context}{error!s}") from error
|
|
83
295
|
|
|
296
|
+
def _throttle(self) -> None:
    """Call the rate limiter's ``acquire()`` if one is configured.

    The time spent in ``acquire()`` is measured and reported only when it
    exceeds one millisecond.
    """

    limiter = self._rate_limiter

    if not limiter:
        return

    started_at = monotonic()
    logger.debug("Acquiring rate limiter")
    limiter.acquire()
    waited = monotonic() - started_at

    # Anything at or below 1 ms is treated as "did not wait" and not logged.
    if waited <= 0.001:
        return

    logger.debug(f"Rate limiter throttled request | wait_seconds={waited:.3f}")
    log_rate_limit(waited)
|
|
308
|
+
|
|
84
309
|
def get(self, endpoint: str, params: dict[str, Any] | None = None) -> CurlResponse:
|
|
85
|
-
"""Make a GET request.
|
|
310
|
+
"""Make a GET request with retry and rate limiting.
|
|
86
311
|
|
|
87
312
|
Args:
|
|
88
313
|
endpoint: The API endpoint (relative to BASE_URL).
|
|
@@ -94,18 +319,67 @@ class HTTPClient:
|
|
|
94
319
|
Raises:
|
|
95
320
|
AuthenticationError: If session token is invalid.
|
|
96
321
|
RateLimitError: If rate limit is exceeded.
|
|
322
|
+
CloudflareBlockError: If Cloudflare blocks the request.
|
|
97
323
|
PerplexityError: For other errors.
|
|
98
324
|
"""
|
|
99
325
|
|
|
100
326
|
url = f"{API_BASE_URL}{endpoint}" if endpoint.startswith("/") else endpoint
|
|
101
327
|
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
328
|
+
logger.debug(f"GET request initiated | endpoint={endpoint} url={url} params={params}")
|
|
329
|
+
log_request("GET", url, params=params)
|
|
330
|
+
|
|
331
|
+
# Create retry wrapper for this specific call
|
|
332
|
+
retryable_exceptions = (RateLimitError, CloudflareBlockError, ConnectionError, TimeoutError)
|
|
333
|
+
|
|
334
|
+
@create_retry_decorator(self._retry_config, retryable_exceptions, self._on_retry)
|
|
335
|
+
def _do_get() -> CurlResponse:
|
|
336
|
+
self._throttle()
|
|
337
|
+
|
|
338
|
+
request_start = monotonic()
|
|
339
|
+
logger.debug(f"Executing GET request | url={url}")
|
|
340
|
+
|
|
341
|
+
try:
|
|
342
|
+
response = self._session.get(url, params=params)
|
|
343
|
+
elapsed_ms = (monotonic() - request_start) * 1000
|
|
344
|
+
|
|
345
|
+
logger.debug(
|
|
346
|
+
f"GET response received | "
|
|
347
|
+
f"status_code={response.status_code} "
|
|
348
|
+
f"elapsed_ms={elapsed_ms:.2f} "
|
|
349
|
+
f"content_length={len(response.content) if hasattr(response, 'content') else 'unknown'}"
|
|
350
|
+
)
|
|
351
|
+
log_response(
|
|
352
|
+
"GET",
|
|
353
|
+
url,
|
|
354
|
+
response.status_code,
|
|
355
|
+
elapsed_ms=elapsed_ms,
|
|
356
|
+
content_length=len(response.content) if hasattr(response, "content") else None,
|
|
357
|
+
)
|
|
358
|
+
|
|
359
|
+
self._check_cloudflare(response)
|
|
360
|
+
response.raise_for_status()
|
|
361
|
+
|
|
362
|
+
logger.debug(f"GET request successful | endpoint={endpoint}")
|
|
363
|
+
return response
|
|
364
|
+
except Exception as error:
|
|
365
|
+
elapsed_ms = (monotonic() - request_start) * 1000
|
|
366
|
+
logger.debug(
|
|
367
|
+
f"GET request failed | "
|
|
368
|
+
f"endpoint={endpoint} "
|
|
369
|
+
f"elapsed_ms={elapsed_ms:.2f} "
|
|
370
|
+
f"error_type={type(error).__name__} "
|
|
371
|
+
f"error={error}"
|
|
372
|
+
)
|
|
373
|
+
|
|
374
|
+
if isinstance(error, (CloudflareBlockError, RateLimitError)):
|
|
375
|
+
raise
|
|
376
|
+
|
|
377
|
+
self._handle_error(error, f"GET {endpoint}: ")
|
|
378
|
+
|
|
379
|
+
# Never reached but satisfies type checker
|
|
380
|
+
raise error
|
|
381
|
+
|
|
382
|
+
return _do_get()
|
|
109
383
|
|
|
110
384
|
def post(
|
|
111
385
|
self,
|
|
@@ -113,7 +387,7 @@ class HTTPClient:
|
|
|
113
387
|
json: dict[str, Any] | None = None,
|
|
114
388
|
stream: bool = False,
|
|
115
389
|
) -> CurlResponse:
|
|
116
|
-
"""Make a POST request.
|
|
390
|
+
"""Make a POST request with retry and rate limiting.
|
|
117
391
|
|
|
118
392
|
Args:
|
|
119
393
|
endpoint: The API endpoint (relative to BASE_URL).
|
|
@@ -126,18 +400,62 @@ class HTTPClient:
|
|
|
126
400
|
Raises:
|
|
127
401
|
AuthenticationError: If session token is invalid.
|
|
128
402
|
RateLimitError: If rate limit is exceeded.
|
|
403
|
+
CloudflareBlockError: If Cloudflare blocks the request.
|
|
129
404
|
PerplexityError: For other errors.
|
|
130
405
|
"""
|
|
131
406
|
|
|
132
407
|
url = f"{API_BASE_URL}{endpoint}" if endpoint.startswith("/") else endpoint
|
|
408
|
+
body_size = len(str(json)) if json else 0
|
|
133
409
|
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
410
|
+
logger.debug(f"POST request initiated | endpoint={endpoint} url={url} stream={stream} body_size={body_size}")
|
|
411
|
+
log_request("POST", url, body_size=body_size)
|
|
412
|
+
|
|
413
|
+
retryable_exceptions = (RateLimitError, CloudflareBlockError, ConnectionError, TimeoutError)
|
|
414
|
+
|
|
415
|
+
@create_retry_decorator(self._retry_config, retryable_exceptions, self._on_retry)
|
|
416
|
+
def _do_post() -> CurlResponse:
|
|
417
|
+
self._throttle()
|
|
418
|
+
|
|
419
|
+
request_start = monotonic()
|
|
420
|
+
logger.debug(f"Executing POST request | url={url} stream={stream}")
|
|
421
|
+
|
|
422
|
+
try:
|
|
423
|
+
response = self._session.post(url, json=json, stream=stream)
|
|
424
|
+
elapsed_ms = (monotonic() - request_start) * 1000
|
|
425
|
+
|
|
426
|
+
logger.debug(
|
|
427
|
+
f"POST response received | "
|
|
428
|
+
f"status_code={response.status_code} "
|
|
429
|
+
f"elapsed_ms={elapsed_ms:.2f} "
|
|
430
|
+
f"stream={stream}"
|
|
431
|
+
)
|
|
432
|
+
log_response("POST", url, response.status_code, elapsed_ms=elapsed_ms)
|
|
433
|
+
|
|
434
|
+
self._check_cloudflare(response)
|
|
435
|
+
response.raise_for_status()
|
|
436
|
+
|
|
437
|
+
logger.debug(f"POST request successful | endpoint={endpoint}")
|
|
137
438
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
439
|
+
return response
|
|
440
|
+
except Exception as error:
|
|
441
|
+
elapsed_ms = (monotonic() - request_start) * 1000
|
|
442
|
+
logger.debug(
|
|
443
|
+
f"POST request failed | "
|
|
444
|
+
f"endpoint={endpoint} "
|
|
445
|
+
f"elapsed_ms={elapsed_ms:.2f} "
|
|
446
|
+
f"error_type={type(error).__name__} "
|
|
447
|
+
f"error={error}"
|
|
448
|
+
)
|
|
449
|
+
|
|
450
|
+
if isinstance(error, (CloudflareBlockError, RateLimitError)):
|
|
451
|
+
raise error
|
|
452
|
+
|
|
453
|
+
self._handle_error(error, f"POST {endpoint}: ")
|
|
454
|
+
|
|
455
|
+
# Never reached but satisfies type checker
|
|
456
|
+
raise error
|
|
457
|
+
|
|
458
|
+
return _do_post()
|
|
141
459
|
|
|
142
460
|
def stream_lines(self, endpoint: str, json: dict[str, Any]) -> Generator[bytes, None, None]:
|
|
143
461
|
"""Make a streaming POST request and yield lines.
|
|
@@ -152,15 +470,26 @@ class HTTPClient:
|
|
|
152
470
|
Raises:
|
|
153
471
|
AuthenticationError: If session token is invalid.
|
|
154
472
|
RateLimitError: If rate limit is exceeded.
|
|
473
|
+
CloudflareBlockError: If Cloudflare blocks the request.
|
|
155
474
|
PerplexityError: For other errors.
|
|
156
475
|
"""
|
|
157
476
|
|
|
477
|
+
logger.debug(f"Starting streaming request | endpoint={endpoint}")
|
|
478
|
+
|
|
158
479
|
response = self.post(endpoint, json=json, stream=True)
|
|
480
|
+
lines_count = 0
|
|
159
481
|
|
|
160
482
|
try:
|
|
161
|
-
|
|
483
|
+
logger.debug("Iterating stream lines")
|
|
484
|
+
|
|
485
|
+
for line in response.iter_lines():
|
|
486
|
+
lines_count += 1
|
|
487
|
+
yield line
|
|
488
|
+
|
|
489
|
+
logger.debug(f"Stream completed | total_lines={lines_count}")
|
|
162
490
|
finally:
|
|
163
491
|
response.close()
|
|
492
|
+
logger.debug(f"Stream response closed | lines_yielded={lines_count}")
|
|
164
493
|
|
|
165
494
|
def init_search(self, query: str) -> None:
|
|
166
495
|
"""Initialize a search session.
|
|
@@ -171,7 +500,9 @@ class HTTPClient:
|
|
|
171
500
|
query: The search query.
|
|
172
501
|
"""
|
|
173
502
|
|
|
503
|
+
logger.debug(f"Initializing search session | query_length={len(query)} query_preview={query[:50]}...")
|
|
174
504
|
self.get(ENDPOINT_SEARCH_INIT, params={"q": query})
|
|
505
|
+
logger.debug("Search session initialized successfully")
|
|
175
506
|
|
|
176
507
|
def stream_ask(self, payload: dict[str, Any]) -> Generator[bytes, None, None]:
|
|
177
508
|
"""Stream a prompt request to the ask endpoint.
|
|
@@ -183,15 +514,20 @@ class HTTPClient:
|
|
|
183
514
|
Response lines as bytes.
|
|
184
515
|
"""
|
|
185
516
|
|
|
517
|
+
logger.debug(f"Streaming ask request | payload_keys={list(payload.keys())}")
|
|
186
518
|
yield from self.stream_lines(ENDPOINT_ASK, json=payload)
|
|
187
519
|
|
|
188
520
|
def close(self) -> None:
    """Close the underlying HTTP session, logging before and after."""

    logger.debug("Closing HTTP client")

    active_session = self._session
    active_session.close()

    logger.debug("HTTP client closed successfully")
|
|
192
526
|
|
|
193
527
|
def __enter__(self) -> HTTPClient:
    """Enter the context manager and return this client unchanged."""

    logger.debug("Entering HTTPClient context manager")

    return self
|
|
195
530
|
|
|
196
531
|
def __exit__(self, *args: Any) -> None:
    """Leave the context manager by closing the client.

    Args:
        *args: Exception details supplied by the ``with`` statement; unused.
    """

    logger.debug("Exiting HTTPClient context manager")

    self.close()
|
|
@@ -1,4 +1,6 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
Upload and request limits for Perplexity WebUI Scraper.
|
|
3
|
+
"""
|
|
2
4
|
|
|
3
5
|
from __future__ import annotations
|
|
4
6
|
|
|
@@ -7,11 +9,17 @@ from typing import Final
|
|
|
7
9
|
|
|
8
10
|
# File Upload Limits
|
|
9
11
|
MAX_FILES: Final[int] = 30
|
|
10
|
-
"""
|
|
12
|
+
"""
|
|
13
|
+
Maximum number of files that can be attached to a single prompt.
|
|
14
|
+
"""
|
|
11
15
|
|
|
12
16
|
MAX_FILE_SIZE: Final[int] = 50 * 1024 * 1024 # 50 MB in bytes
|
|
13
|
-
"""
|
|
17
|
+
"""
|
|
18
|
+
Maximum file size in bytes.
|
|
19
|
+
"""
|
|
14
20
|
|
|
15
21
|
# Request Limits
|
|
16
22
|
DEFAULT_TIMEOUT: Final[int] = 30 * 60 # 30 minutes in seconds
|
|
17
|
-
"""
|
|
23
|
+
"""
|
|
24
|
+
Default request timeout in seconds.
|
|
25
|
+
"""
|