perplexity-webui-scraper 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,57 +2,236 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from contextlib import suppress
6
+ from time import monotonic
5
7
  from typing import TYPE_CHECKING, Any
6
8
 
7
9
  from curl_cffi.requests import Response as CurlResponse
10
+ from curl_cffi.requests import Session
11
+
12
+ from .constants import API_BASE_URL, DEFAULT_HEADERS, ENDPOINT_ASK, ENDPOINT_SEARCH_INIT, SESSION_COOKIE_NAME
13
+ from .exceptions import AuthenticationError, CloudflareBlockError, PerplexityError, RateLimitError
14
+ from .limits import DEFAULT_TIMEOUT
15
+ from .logging import (
16
+ get_logger,
17
+ log_cloudflare_detected,
18
+ log_error,
19
+ log_fingerprint_rotation,
20
+ log_rate_limit,
21
+ log_request,
22
+ log_response,
23
+ log_retry,
24
+ log_session_created,
25
+ )
26
+ from .resilience import (
27
+ CLOUDFLARE_MARKERS,
28
+ RateLimiter,
29
+ RetryConfig,
30
+ create_retry_decorator,
31
+ get_random_browser_profile,
32
+ is_cloudflare_challenge,
33
+ is_cloudflare_status,
34
+ )
8
35
 
9
36
 
10
37
  if TYPE_CHECKING:
11
38
  from collections.abc import Generator
12
39
 
13
- from curl_cffi.requests import Session
40
+ from tenacity import RetryCallState
14
41
 
15
- from .constants import (
16
- API_BASE_URL,
17
- DEFAULT_HEADERS,
18
- ENDPOINT_ASK,
19
- ENDPOINT_SEARCH_INIT,
20
- SESSION_COOKIE_NAME,
21
- )
22
- from .exceptions import AuthenticationError, PerplexityError, RateLimitError
23
- from .limits import DEFAULT_TIMEOUT
42
+ logger = get_logger(__name__)
24
43
 
25
44
 
26
45
  class HTTPClient:
27
- """HTTP client wrapper with error handling for Perplexity API.
46
+ """
47
+ HTTP client wrapper with error handling for Perplexity API.
28
48
 
29
49
  Provides a unified interface for making HTTP requests with automatic
30
- error handling and response processing.
50
+ error handling, retry mechanisms, rate limiting, and Cloudflare bypass.
31
51
  """
32
52
 
33
- __slots__ = ("_session",)
53
+ __slots__ = (
54
+ "_impersonate",
55
+ "_rate_limiter",
56
+ "_retry_config",
57
+ "_rotate_fingerprint",
58
+ "_session",
59
+ "_session_token",
60
+ "_timeout",
61
+ )
34
62
 
35
63
  def __init__(
36
64
  self,
37
65
  session_token: str,
38
66
  timeout: int = DEFAULT_TIMEOUT,
39
67
  impersonate: str = "chrome",
68
+ max_retries: int = 3,
69
+ retry_base_delay: float = 1.0,
70
+ retry_max_delay: float = 60.0,
71
+ retry_jitter: float = 0.5,
72
+ requests_per_second: float = 0.5,
73
+ rotate_fingerprint: bool = True,
40
74
  ) -> None:
41
- """Initialize the HTTP client."""
75
+ """Initialize the HTTP client.
76
+
77
+ Args:
78
+ session_token: Perplexity session cookie.
79
+ timeout: Request timeout in seconds.
80
+ impersonate: Browser profile to impersonate.
81
+ max_retries: Maximum retry attempts for failed requests.
82
+ retry_base_delay: Initial delay before first retry.
83
+ retry_max_delay: Maximum delay between retries.
84
+ retry_jitter: Random jitter factor for delays.
85
+ requests_per_second: Rate limit (0 to disable).
86
+ rotate_fingerprint: Whether to rotate browser fingerprint on retries.
87
+ """
88
+
89
+ logger.debug(
90
+ "Initializing HTTPClient | "
91
+ f"session_token_length={len(session_token)} "
92
+ f"timeout={timeout}s "
93
+ f"impersonate={impersonate} "
94
+ f"max_retries={max_retries} "
95
+ f"retry_base_delay={retry_base_delay}s "
96
+ f"retry_max_delay={retry_max_delay}s "
97
+ f"retry_jitter={retry_jitter} "
98
+ f"requests_per_second={requests_per_second} "
99
+ f"rotate_fingerprint={rotate_fingerprint}"
100
+ )
101
+
102
+ self._session_token = session_token
103
+ self._timeout = timeout
104
+ self._impersonate = impersonate
105
+ self._rotate_fingerprint = rotate_fingerprint
106
+
107
+ self._retry_config = RetryConfig(
108
+ max_retries=max_retries,
109
+ base_delay=retry_base_delay,
110
+ max_delay=retry_max_delay,
111
+ jitter=retry_jitter,
112
+ )
113
+
114
+ logger.debug(
115
+ "RetryConfig created | "
116
+ f"max_retries={self._retry_config.max_retries} "
117
+ f"base_delay={self._retry_config.base_delay}s "
118
+ f"max_delay={self._retry_config.max_delay}s "
119
+ f"jitter={self._retry_config.jitter}"
120
+ )
121
+
122
+ self._rate_limiter: RateLimiter | None = None
123
+
124
+ if requests_per_second > 0:
125
+ self._rate_limiter = RateLimiter(requests_per_second=requests_per_second)
126
+ logger.debug(f"RateLimiter enabled | requests_per_second={requests_per_second}")
127
+ else:
128
+ logger.debug("RateLimiter disabled | requests_per_second=0")
129
+
130
+ self._session = self._create_session(impersonate)
131
+ log_session_created(impersonate, timeout)
132
+
133
+ def _create_session(self, impersonate: str) -> Session:
134
+ """Create a new HTTP session with the given browser profile."""
135
+
136
+ logger.debug(f"Creating new HTTP session | browser_profile={impersonate}")
42
137
 
43
138
  headers: dict[str, str] = {
44
139
  **DEFAULT_HEADERS,
45
140
  "Referer": f"{API_BASE_URL}/",
46
141
  "Origin": API_BASE_URL,
47
142
  }
48
- cookies: dict[str, str] = {SESSION_COOKIE_NAME: session_token}
49
- self._session: Session = Session(
143
+ cookies: dict[str, str] = {SESSION_COOKIE_NAME: self._session_token}
144
+
145
+ logger.debug(
146
+ f"Session configuration | headers_count={len(headers)} cookies_count={len(cookies)} base_url={API_BASE_URL}"
147
+ )
148
+
149
+ session = Session(
50
150
  headers=headers,
51
151
  cookies=cookies,
52
- timeout=timeout,
152
+ timeout=self._timeout,
53
153
  impersonate=impersonate,
54
154
  )
55
155
 
156
+ logger.debug(f"HTTP session created successfully | browser_profile={impersonate}")
157
+
158
+ return session
159
+
160
+ def _rotate_session(self) -> None:
161
+ """Rotate to a new browser fingerprint by recreating the session."""
162
+
163
+ if self._rotate_fingerprint:
164
+ old_profile = self._impersonate
165
+ new_profile = get_random_browser_profile()
166
+
167
+ logger.debug(f"Rotating browser fingerprint | old={old_profile} new={new_profile}")
168
+ log_fingerprint_rotation(old_profile, new_profile)
169
+
170
+ with suppress(Exception):
171
+ self._session.close()
172
+ logger.debug("Previous session closed")
173
+
174
+ self._impersonate = new_profile
175
+ self._session = self._create_session(new_profile)
176
+
177
+ logger.debug(f"Browser fingerprint rotated successfully | new_profile={new_profile}")
178
+
179
+ def _on_retry(self, retry_state: RetryCallState) -> None:
180
+ """Callback executed before each retry attempt."""
181
+
182
+ attempt = retry_state.attempt_number
183
+ exception = retry_state.outcome.exception() if retry_state.outcome else None
184
+ wait_time = retry_state.next_action.sleep if retry_state.next_action else 0
185
+
186
+ logger.warning(
187
+ f"Retry triggered | "
188
+ f"attempt={attempt}/{self._retry_config.max_retries} "
189
+ f"exception_type={type(exception).__name__ if exception else 'None'} "
190
+ f"exception_message={str(exception) if exception else 'None'} "
191
+ f"wait_seconds={wait_time:.2f}"
192
+ )
193
+ log_retry(attempt, self._retry_config.max_retries, exception, wait_time)
194
+
195
+ # Rotate fingerprint on retry to avoid detection
196
+ if self._rotate_fingerprint:
197
+ logger.debug("Rotating fingerprint due to retry")
198
+ self._rotate_session()
199
+
200
+ def _check_cloudflare(self, response: CurlResponse) -> None:
201
+ """Check if response is a Cloudflare challenge and raise if so."""
202
+
203
+ logger.debug(f"Checking for Cloudflare challenge | status_code={response.status_code}")
204
+
205
+ if is_cloudflare_status(response.status_code):
206
+ logger.debug(f"Status code indicates potential Cloudflare block | status_code={response.status_code}")
207
+
208
+ try:
209
+ body = response.text
210
+ headers = dict(response.headers) if hasattr(response, "headers") else None
211
+
212
+ logger.debug(
213
+ f"Analyzing response for Cloudflare markers | "
214
+ f"body_length={len(body)} "
215
+ f"headers_count={len(headers) if headers else 0}"
216
+ )
217
+
218
+ if is_cloudflare_challenge(body, headers):
219
+ # Find which markers were detected
220
+ markers_found = [m for m in CLOUDFLARE_MARKERS if m.lower() in body.lower()]
221
+ logger.warning(
222
+ f"Cloudflare challenge detected | "
223
+ f"status_code={response.status_code} "
224
+ f"markers_found={markers_found}"
225
+ )
226
+ log_cloudflare_detected(response.status_code, markers_found)
227
+ raise CloudflareBlockError()
228
+ else:
229
+ logger.debug("No Cloudflare markers found in response")
230
+ except CloudflareBlockError as error:
231
+ raise error
232
+ except Exception as error:
233
+ logger.debug(f"Error checking Cloudflare response | error={error}")
234
+
56
235
  def _handle_error(self, error: Exception, context: str = "") -> None:
57
236
  """Handle HTTP errors and raise appropriate custom exceptions.
58
237
 
@@ -61,27 +240,70 @@ class HTTPClient:
61
240
  context: Additional context for the error message.
62
241
 
63
242
  Raises:
64
- AuthenticationError: If status code is 403.
243
+ AuthenticationError: If status code is 403 (not Cloudflare).
65
244
  RateLimitError: If status code is 429.
245
+ CloudflareBlockError: If Cloudflare challenge detected.
66
246
  PerplexityError: For other HTTP errors.
67
247
  """
68
248
 
69
- status_code = None
249
+ logger.debug(f"Handling error | context={context} error_type={type(error).__name__} error={error}")
250
+ log_error(error, context)
70
251
 
71
- if hasattr(error, "response") and error.response is not None:
72
- status_code = getattr(error.response, "status_code", None)
252
+ status_code = None
253
+ response = getattr(error, "response", None)
254
+
255
+ if response is not None:
256
+ status_code = getattr(response, "status_code", None)
257
+ logger.debug(f"Error has response | status_code={status_code}")
258
+
259
+ # Check for Cloudflare before handling as regular 403
260
+ if is_cloudflare_status(status_code):
261
+ logger.debug(f"Checking if error is Cloudflare challenge | status_code={status_code}")
262
+
263
+ try:
264
+ body = response.text if hasattr(response, "text") else ""
265
+ headers = dict(response.headers) if hasattr(response, "headers") else None
266
+
267
+ if is_cloudflare_challenge(body, headers):
268
+ markers_found = [m for m in CLOUDFLARE_MARKERS if m.lower() in body.lower()]
269
+ logger.warning(
270
+ f"Cloudflare challenge confirmed in error response | "
271
+ f"status_code={status_code} "
272
+ f"markers={markers_found}"
273
+ )
274
+ log_cloudflare_detected(status_code, markers_found)
275
+ raise CloudflareBlockError() from error
276
+ except CloudflareBlockError:
277
+ raise
73
278
 
74
279
  if status_code == 403:
280
+ logger.error(f"Authentication error | status_code=403 context={context}")
75
281
  raise AuthenticationError() from error
76
282
  elif status_code == 429:
283
+ logger.warning(f"Rate limit exceeded | status_code=429 context={context}")
77
284
  raise RateLimitError() from error
78
285
  elif status_code is not None:
286
+ logger.error(f"HTTP error | status_code={status_code} context={context} error={error}")
79
287
  raise PerplexityError(f"{context}HTTP {status_code}: {error!s}", status_code=status_code) from error
80
288
  else:
289
+ logger.error(f"Unknown error | context={context} error={error}")
81
290
  raise PerplexityError(f"{context}{error!s}") from error
82
291
 
292
+ def _throttle(self) -> None:
293
+ """Apply rate limiting before making a request."""
294
+
295
+ if self._rate_limiter:
296
+ start_time = monotonic()
297
+ logger.debug("Acquiring rate limiter")
298
+ self._rate_limiter.acquire()
299
+ wait_time = monotonic() - start_time
300
+
301
+ if wait_time > 0.001: # Only log if we actually waited
302
+ logger.debug(f"Rate limiter throttled request | wait_seconds={wait_time:.3f}")
303
+ log_rate_limit(wait_time)
304
+
83
305
  def get(self, endpoint: str, params: dict[str, Any] | None = None) -> CurlResponse:
84
- """Make a GET request.
306
+ """Make a GET request with retry and rate limiting.
85
307
 
86
308
  Args:
87
309
  endpoint: The API endpoint (relative to BASE_URL).
@@ -93,17 +315,67 @@ class HTTPClient:
93
315
  Raises:
94
316
  AuthenticationError: If session token is invalid.
95
317
  RateLimitError: If rate limit is exceeded.
318
+ CloudflareBlockError: If Cloudflare blocks the request.
96
319
  PerplexityError: For other errors.
97
320
  """
98
321
 
99
322
  url = f"{API_BASE_URL}{endpoint}" if endpoint.startswith("/") else endpoint
100
323
 
101
- try:
102
- response = self._session.get(url, params=params)
103
- response.raise_for_status()
104
- return response
105
- except Exception as e:
106
- self._handle_error(e, f"GET {endpoint}: ")
324
+ logger.debug(f"GET request initiated | endpoint={endpoint} url={url} params={params}")
325
+ log_request("GET", url, params=params)
326
+
327
+ # Create retry wrapper for this specific call
328
+ retryable_exceptions = (RateLimitError, CloudflareBlockError, ConnectionError, TimeoutError)
329
+
330
+ @create_retry_decorator(self._retry_config, retryable_exceptions, self._on_retry)
331
+ def _do_get() -> CurlResponse:
332
+ self._throttle()
333
+
334
+ request_start = monotonic()
335
+ logger.debug(f"Executing GET request | url={url}")
336
+
337
+ try:
338
+ response = self._session.get(url, params=params)
339
+ elapsed_ms = (monotonic() - request_start) * 1000
340
+
341
+ logger.debug(
342
+ f"GET response received | "
343
+ f"status_code={response.status_code} "
344
+ f"elapsed_ms={elapsed_ms:.2f} "
345
+ f"content_length={len(response.content) if hasattr(response, 'content') else 'unknown'}"
346
+ )
347
+ log_response(
348
+ "GET",
349
+ url,
350
+ response.status_code,
351
+ elapsed_ms=elapsed_ms,
352
+ content_length=len(response.content) if hasattr(response, "content") else None,
353
+ )
354
+
355
+ self._check_cloudflare(response)
356
+ response.raise_for_status()
357
+
358
+ logger.debug(f"GET request successful | endpoint={endpoint}")
359
+ return response
360
+ except Exception as error:
361
+ elapsed_ms = (monotonic() - request_start) * 1000
362
+ logger.debug(
363
+ f"GET request failed | "
364
+ f"endpoint={endpoint} "
365
+ f"elapsed_ms={elapsed_ms:.2f} "
366
+ f"error_type={type(error).__name__} "
367
+ f"error={error}"
368
+ )
369
+
370
+ if isinstance(error, (CloudflareBlockError, RateLimitError)):
371
+ raise
372
+
373
+ self._handle_error(error, f"GET {endpoint}: ")
374
+
375
+ # Never reached but satisfies type checker
376
+ raise error
377
+
378
+ return _do_get()
107
379
 
108
380
  def post(
109
381
  self,
@@ -111,7 +383,7 @@ class HTTPClient:
111
383
  json: dict[str, Any] | None = None,
112
384
  stream: bool = False,
113
385
  ) -> CurlResponse:
114
- """Make a POST request.
386
+ """Make a POST request with retry and rate limiting.
115
387
 
116
388
  Args:
117
389
  endpoint: The API endpoint (relative to BASE_URL).
@@ -124,17 +396,61 @@ class HTTPClient:
124
396
  Raises:
125
397
  AuthenticationError: If session token is invalid.
126
398
  RateLimitError: If rate limit is exceeded.
399
+ CloudflareBlockError: If Cloudflare blocks the request.
127
400
  PerplexityError: For other errors.
128
401
  """
129
402
 
130
403
  url = f"{API_BASE_URL}{endpoint}" if endpoint.startswith("/") else endpoint
404
+ body_size = len(str(json)) if json else 0
131
405
 
132
- try:
133
- response = self._session.post(url, json=json, stream=stream)
134
- response.raise_for_status()
135
- return response
136
- except Exception as e:
137
- self._handle_error(e, f"POST {endpoint}: ")
406
+ logger.debug(f"POST request initiated | endpoint={endpoint} url={url} stream={stream} body_size={body_size}")
407
+ log_request("POST", url, body_size=body_size)
408
+
409
+ retryable_exceptions = (RateLimitError, CloudflareBlockError, ConnectionError, TimeoutError)
410
+
411
+ @create_retry_decorator(self._retry_config, retryable_exceptions, self._on_retry)
412
+ def _do_post() -> CurlResponse:
413
+ self._throttle()
414
+
415
+ request_start = monotonic()
416
+ logger.debug(f"Executing POST request | url={url} stream={stream}")
417
+
418
+ try:
419
+ response = self._session.post(url, json=json, stream=stream)
420
+ elapsed_ms = (monotonic() - request_start) * 1000
421
+
422
+ logger.debug(
423
+ f"POST response received | "
424
+ f"status_code={response.status_code} "
425
+ f"elapsed_ms={elapsed_ms:.2f} "
426
+ f"stream={stream}"
427
+ )
428
+ log_response("POST", url, response.status_code, elapsed_ms=elapsed_ms)
429
+
430
+ self._check_cloudflare(response)
431
+ response.raise_for_status()
432
+
433
+ logger.debug(f"POST request successful | endpoint={endpoint}")
434
+ return response
435
+ except Exception as error:
436
+ elapsed_ms = (monotonic() - request_start) * 1000
437
+ logger.debug(
438
+ f"POST request failed | "
439
+ f"endpoint={endpoint} "
440
+ f"elapsed_ms={elapsed_ms:.2f} "
441
+ f"error_type={type(error).__name__} "
442
+ f"error={error}"
443
+ )
444
+
445
+ if isinstance(error, (CloudflareBlockError, RateLimitError)):
446
+ raise error
447
+
448
+ self._handle_error(error, f"POST {endpoint}: ")
449
+
450
+ # Never reached but satisfies type checker
451
+ raise error
452
+
453
+ return _do_post()
138
454
 
139
455
  def stream_lines(self, endpoint: str, json: dict[str, Any]) -> Generator[bytes, None, None]:
140
456
  """Make a streaming POST request and yield lines.
@@ -149,15 +465,26 @@ class HTTPClient:
149
465
  Raises:
150
466
  AuthenticationError: If session token is invalid.
151
467
  RateLimitError: If rate limit is exceeded.
468
+ CloudflareBlockError: If Cloudflare blocks the request.
152
469
  PerplexityError: For other errors.
153
470
  """
154
471
 
472
+ logger.debug(f"Starting streaming request | endpoint={endpoint}")
473
+
155
474
  response = self.post(endpoint, json=json, stream=True)
475
+ lines_count = 0
156
476
 
157
477
  try:
158
- yield from response.iter_lines()
478
+ logger.debug("Iterating stream lines")
479
+
480
+ for line in response.iter_lines():
481
+ lines_count += 1
482
+ yield line
483
+
484
+ logger.debug(f"Stream completed | total_lines={lines_count}")
159
485
  finally:
160
486
  response.close()
487
+ logger.debug(f"Stream response closed | lines_yielded={lines_count}")
161
488
 
162
489
  def init_search(self, query: str) -> None:
163
490
  """Initialize a search session.
@@ -168,7 +495,9 @@ class HTTPClient:
168
495
  query: The search query.
169
496
  """
170
497
 
498
+ logger.debug(f"Initializing search session | query_length={len(query)} query_preview={query[:50]}...")
171
499
  self.get(ENDPOINT_SEARCH_INIT, params={"q": query})
500
+ logger.debug("Search session initialized successfully")
172
501
 
173
502
  def stream_ask(self, payload: dict[str, Any]) -> Generator[bytes, None, None]:
174
503
  """Stream a prompt request to the ask endpoint.
@@ -180,15 +509,20 @@ class HTTPClient:
180
509
  Response lines as bytes.
181
510
  """
182
511
 
512
+ logger.debug(f"Streaming ask request | payload_keys={list(payload.keys())}")
183
513
  yield from self.stream_lines(ENDPOINT_ASK, json=payload)
184
514
 
185
515
  def close(self) -> None:
186
516
  """Close the HTTP session."""
187
517
 
518
+ logger.debug("Closing HTTP client")
188
519
  self._session.close()
520
+ logger.debug("HTTP client closed successfully")
189
521
 
190
522
  def __enter__(self) -> HTTPClient:
523
+ logger.debug("Entering HTTPClient context manager")
191
524
  return self
192
525
 
193
526
  def __exit__(self, *args: Any) -> None:
527
+ logger.debug("Exiting HTTPClient context manager")
194
528
  self.close()
@@ -10,11 +10,8 @@ MAX_FILES: Final[int] = 30
10
10
  """Maximum number of files that can be attached to a single prompt."""
11
11
 
12
12
  MAX_FILE_SIZE: Final[int] = 50 * 1024 * 1024 # 50 MB in bytes
13
- """Maximum file size in bytes (50 MB)."""
13
+ """Maximum file size in bytes."""
14
14
 
15
15
  # Request Limits
16
16
  DEFAULT_TIMEOUT: Final[int] = 30 * 60 # 30 minutes in seconds
17
- """Default request timeout in seconds (30 minutes).
18
-
19
- Set high to accommodate complex models that may take longer to respond.
20
- """
17
+ """Default request timeout in seconds"""