perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,59 +1,241 @@
1
- """HTTP client wrapper for Perplexity API requests."""
1
+ """
2
+ HTTP client wrapper for Perplexity API requests.
3
+ """
2
4
 
3
5
  from __future__ import annotations
4
6
 
7
+ from contextlib import suppress
8
+ from time import monotonic
5
9
  from typing import TYPE_CHECKING, Any
6
10
 
7
11
  from curl_cffi.requests import Response as CurlResponse
12
+ from curl_cffi.requests import Session
13
+
14
+ from .constants import API_BASE_URL, DEFAULT_HEADERS, ENDPOINT_ASK, ENDPOINT_SEARCH_INIT, SESSION_COOKIE_NAME
15
+ from .exceptions import AuthenticationError, CloudflareBlockError, PerplexityError, RateLimitError
16
+ from .limits import DEFAULT_TIMEOUT
17
+ from .logging import (
18
+ get_logger,
19
+ log_cloudflare_detected,
20
+ log_error,
21
+ log_fingerprint_rotation,
22
+ log_rate_limit,
23
+ log_request,
24
+ log_response,
25
+ log_retry,
26
+ log_session_created,
27
+ )
28
+ from .resilience import (
29
+ CLOUDFLARE_MARKERS,
30
+ RateLimiter,
31
+ RetryConfig,
32
+ create_retry_decorator,
33
+ get_random_browser_profile,
34
+ is_cloudflare_challenge,
35
+ is_cloudflare_status,
36
+ )
8
37
 
9
38
 
10
39
  if TYPE_CHECKING:
11
40
  from collections.abc import Generator
12
41
 
13
- from curl_cffi.requests import Session
42
+ from tenacity import RetryCallState
14
43
 
15
- from .constants import (
16
- API_BASE_URL,
17
- DEFAULT_HEADERS,
18
- ENDPOINT_ASK,
19
- ENDPOINT_SEARCH_INIT,
20
- SESSION_COOKIE_NAME,
21
- )
22
- from .exceptions import AuthenticationError, PerplexityError, RateLimitError
23
- from .limits import DEFAULT_TIMEOUT
44
+ logger = get_logger(__name__)
24
45
 
25
46
 
26
47
  class HTTPClient:
27
- """HTTP client wrapper with error handling for Perplexity API.
48
+ """
49
+ HTTP client wrapper with error handling for Perplexity API.
28
50
 
29
51
  Provides a unified interface for making HTTP requests with automatic
30
- error handling and response processing.
52
+ error handling, retry mechanisms, rate limiting, and Cloudflare bypass.
31
53
  """
32
54
 
33
- __slots__ = ("_session",)
55
+ __slots__ = (
56
+ "_impersonate",
57
+ "_rate_limiter",
58
+ "_retry_config",
59
+ "_rotate_fingerprint",
60
+ "_session",
61
+ "_session_token",
62
+ "_timeout",
63
+ )
34
64
 
35
65
def __init__(
    self,
    session_token: str,
    timeout: int = DEFAULT_TIMEOUT,
    impersonate: str = "chrome",
    max_retries: int = 3,
    retry_base_delay: float = 1.0,
    retry_max_delay: float = 60.0,
    retry_jitter: float = 0.5,
    requests_per_second: float = 0.5,
    rotate_fingerprint: bool = True,
) -> None:
    """
    Store the client settings and open the first HTTP session.

    Args:
        session_token: Perplexity session cookie value (only its length is logged).
        timeout: Request timeout in seconds, handed to the underlying session.
        impersonate: Browser profile the initial session impersonates.
        max_retries: Maximum retry attempts, forwarded to ``RetryConfig``.
        retry_base_delay: Base retry delay, forwarded to ``RetryConfig``.
        retry_max_delay: Maximum retry delay, forwarded to ``RetryConfig``.
        retry_jitter: Jitter setting, forwarded to ``RetryConfig``.
        requests_per_second: Rate limit; a value that is not greater than
            zero leaves the rate limiter disabled.
        rotate_fingerprint: Whether a retry recreates the session with a
            different browser profile.
    """

    init_summary = (
        "Initializing HTTPClient | "
        f"session_token_length={len(session_token)} "
        f"timeout={timeout}s "
        f"impersonate={impersonate} "
        f"max_retries={max_retries} "
        f"retry_base_delay={retry_base_delay}s "
        f"retry_max_delay={retry_max_delay}s "
        f"retry_jitter={retry_jitter} "
        f"requests_per_second={requests_per_second} "
        f"rotate_fingerprint={rotate_fingerprint}"
    )
    logger.debug(init_summary)

    # Plain settings first; _create_session() below reads the token and timeout.
    self._session_token = session_token
    self._timeout = timeout
    self._impersonate = impersonate
    self._rotate_fingerprint = rotate_fingerprint

    retry_config = RetryConfig(
        max_retries=max_retries,
        base_delay=retry_base_delay,
        max_delay=retry_max_delay,
        jitter=retry_jitter,
    )
    self._retry_config = retry_config

    retry_summary = (
        "RetryConfig created | "
        f"max_retries={retry_config.max_retries} "
        f"base_delay={retry_config.base_delay}s "
        f"max_delay={retry_config.max_delay}s "
        f"jitter={retry_config.jitter}"
    )
    logger.debug(retry_summary)

    # The limiter stays None unless a strictly positive rate was requested.
    self._rate_limiter: RateLimiter | None = None

    if not requests_per_second > 0:
        logger.debug("RateLimiter disabled | requests_per_second=0")
    else:
        self._rate_limiter = RateLimiter(requests_per_second=requests_per_second)
        logger.debug(f"RateLimiter enabled | requests_per_second={requests_per_second}")

    self._session = self._create_session(impersonate)
    log_session_created(impersonate, timeout)
134
+
135
def _create_session(self, impersonate: str) -> Session:
    """
    Build a fresh ``Session`` that impersonates the given browser profile.

    The session carries the default headers plus ``Referer``/``Origin`` for
    the API base URL, the stored session-token cookie, and the stored timeout.

    Args:
        impersonate: Browser profile name passed through to ``Session``.

    Returns:
        The newly created session; the caller decides where to store it.
    """

    logger.debug(f"Creating new HTTP session | browser_profile={impersonate}")

    # Start from the shared defaults, then pin the request origin to the API host.
    request_headers: dict[str, str] = dict(DEFAULT_HEADERS)
    request_headers["Referer"] = f"{API_BASE_URL}/"
    request_headers["Origin"] = API_BASE_URL

    auth_cookies: dict[str, str] = {SESSION_COOKIE_NAME: self._session_token}

    logger.debug(
        f"Session configuration | headers_count={len(request_headers)} cookies_count={len(auth_cookies)} base_url={API_BASE_URL}"
    )

    new_session = Session(
        headers=request_headers,
        cookies=auth_cookies,
        timeout=self._timeout,
        impersonate=impersonate,
    )

    logger.debug(f"HTTP session created successfully | browser_profile={impersonate}")

    return new_session
161
+
162
def _rotate_session(self) -> None:
    """
    Swap the current session for one using a randomly chosen browser profile.

    Does nothing when fingerprint rotation is disabled. Otherwise the old
    session is closed on a best-effort basis and a new one is created.
    """

    if not self._rotate_fingerprint:
        return

    previous_profile = self._impersonate
    next_profile = get_random_browser_profile()

    logger.debug(f"Rotating browser fingerprint | old={previous_profile} new={next_profile}")
    log_fingerprint_rotation(previous_profile, next_profile)

    # A failure while closing the old session must not block the rotation.
    with suppress(Exception):
        self._session.close()
        logger.debug("Previous session closed")

    self._impersonate = next_profile
    self._session = self._create_session(next_profile)

    logger.debug(f"Browser fingerprint rotated successfully | new_profile={next_profile}")
180
+
181
def _on_retry(self, retry_state: RetryCallState) -> None:
    """
    Log a retry and, when enabled, rotate the browser fingerprint.

    ``get`` and ``post`` hand this method to ``create_retry_decorator`` as
    the retry callback.

    Args:
        retry_state: Retry state object; its attempt number, outcome and
            next action are read here.
    """

    attempt_no = retry_state.attempt_number
    max_attempts = self._retry_config.max_retries

    outcome = retry_state.outcome
    failure = outcome.exception() if outcome else None

    upcoming = retry_state.next_action
    delay = upcoming.sleep if upcoming else 0

    # Both fields fall back to the literal text "None" when there is no exception.
    if failure:
        failure_type, failure_text = type(failure).__name__, str(failure)
    else:
        failure_type, failure_text = "None", "None"

    logger.warning(
        f"Retry triggered | "
        f"attempt={attempt_no}/{max_attempts} "
        f"exception_type={failure_type} "
        f"exception_message={failure_text} "
        f"wait_seconds={delay:.2f}"
    )
    log_retry(attempt_no, self._retry_config.max_retries, failure, delay)

    # Rotate fingerprint on retry to avoid detection
    if self._rotate_fingerprint:
        logger.debug("Rotating fingerprint due to retry")
        self._rotate_session()
203
+
204
def _check_cloudflare(self, response: CurlResponse) -> None:
    """
    Raise ``CloudflareBlockError`` when the response is a Cloudflare challenge.

    The body and headers are only inspected when ``is_cloudflare_status``
    accepts the status code. Any other failure while inspecting the response
    is logged at debug level and ignored, so this check never masks the
    response itself.

    Args:
        response: The response to inspect.

    Raises:
        CloudflareBlockError: If ``is_cloudflare_challenge`` reports a challenge.
    """

    status_code = response.status_code
    logger.debug(f"Checking for Cloudflare challenge | status_code={status_code}")

    if not is_cloudflare_status(status_code):
        return

    logger.debug(f"Status code indicates potential Cloudflare block | status_code={status_code}")

    try:
        body = response.text
        headers = dict(response.headers) if hasattr(response, "headers") else None

        logger.debug(
            f"Analyzing response for Cloudflare markers | "
            f"body_length={len(body)} "
            f"headers_count={len(headers) if headers else 0}"
        )

        if not is_cloudflare_challenge(body, headers):
            logger.debug("No Cloudflare markers found in response")
            return

        # Lower-case the body once instead of once per marker.
        body_lower = body.lower()
        markers_found = [m for m in CLOUDFLARE_MARKERS if m.lower() in body_lower]
        logger.warning(
            f"Cloudflare challenge detected | "
            f"status_code={status_code} "
            f"markers_found={markers_found}"
        )
        log_cloudflare_detected(status_code, markers_found)
        raise CloudflareBlockError()
    except CloudflareBlockError:
        # Let the block error through; the broad handler below must not swallow it.
        raise
    except Exception as error:
        # Best effort: a failure while inspecting the response is not fatal.
        logger.debug(f"Error checking Cloudflare response | error={error}")
238
+
57
239
  def _handle_error(self, error: Exception, context: str = "") -> None:
58
240
  """Handle HTTP errors and raise appropriate custom exceptions.
59
241
 
@@ -62,27 +244,70 @@ class HTTPClient:
62
244
  context: Additional context for the error message.
63
245
 
64
246
  Raises:
65
- AuthenticationError: If status code is 403.
247
+ AuthenticationError: If status code is 403 (not Cloudflare).
66
248
  RateLimitError: If status code is 429.
249
+ CloudflareBlockError: If Cloudflare challenge detected.
67
250
  PerplexityError: For other HTTP errors.
68
251
  """
69
252
 
70
- status_code = None
253
+ logger.debug(f"Handling error | context={context} error_type={type(error).__name__} error={error}")
254
+ log_error(error, context)
71
255
 
72
- if hasattr(error, "response") and error.response is not None:
73
- status_code = getattr(error.response, "status_code", None)
256
+ status_code = None
257
+ response = getattr(error, "response", None)
258
+
259
+ if response is not None:
260
+ status_code = getattr(response, "status_code", None)
261
+ logger.debug(f"Error has response | status_code={status_code}")
262
+
263
+ # Check for Cloudflare before handling as regular 403
264
+ if status_code is not None and is_cloudflare_status(status_code):
265
+ logger.debug(f"Checking if error is Cloudflare challenge | status_code={status_code}")
266
+
267
+ try:
268
+ body = response.text if hasattr(response, "text") else ""
269
+ headers = dict(response.headers) if hasattr(response, "headers") else None
270
+
271
+ if is_cloudflare_challenge(body, headers):
272
+ markers_found = [m for m in CLOUDFLARE_MARKERS if m.lower() in body.lower()]
273
+ logger.warning(
274
+ f"Cloudflare challenge confirmed in error response | "
275
+ f"status_code={status_code} "
276
+ f"markers={markers_found}"
277
+ )
278
+ log_cloudflare_detected(status_code, markers_found)
279
+ raise CloudflareBlockError() from error
280
+ except CloudflareBlockError:
281
+ raise
74
282
 
75
283
  if status_code == 403:
284
+ logger.error(f"Authentication error | status_code=403 context={context}")
76
285
  raise AuthenticationError() from error
77
286
  elif status_code == 429:
287
+ logger.warning(f"Rate limit exceeded | status_code=429 context={context}")
78
288
  raise RateLimitError() from error
79
289
  elif status_code is not None:
290
+ logger.error(f"HTTP error | status_code={status_code} context={context} error={error}")
80
291
  raise PerplexityError(f"{context}HTTP {status_code}: {error!s}", status_code=status_code) from error
81
292
  else:
293
+ logger.error(f"Unknown error | context={context} error={error}")
82
294
  raise PerplexityError(f"{context}{error!s}") from error
83
295
 
296
def _throttle(self) -> None:
    """
    Acquire the rate limiter, if one is configured, before a request.

    The time spent inside ``acquire()`` is measured and reported only when it
    exceeds one millisecond.
    """

    limiter = self._rate_limiter

    if not limiter:
        return

    began = monotonic()
    logger.debug("Acquiring rate limiter")
    limiter.acquire()
    waited = monotonic() - began

    # Only log if we actually waited
    if waited > 0.001:
        logger.debug(f"Rate limiter throttled request | wait_seconds={waited:.3f}")
        log_rate_limit(waited)
308
+
84
309
  def get(self, endpoint: str, params: dict[str, Any] | None = None) -> CurlResponse:
85
- """Make a GET request.
310
+ """Make a GET request with retry and rate limiting.
86
311
 
87
312
  Args:
88
313
  endpoint: The API endpoint (relative to BASE_URL).
@@ -94,18 +319,67 @@ class HTTPClient:
94
319
  Raises:
95
320
  AuthenticationError: If session token is invalid.
96
321
  RateLimitError: If rate limit is exceeded.
322
+ CloudflareBlockError: If Cloudflare blocks the request.
97
323
  PerplexityError: For other errors.
98
324
  """
99
325
 
100
326
  url = f"{API_BASE_URL}{endpoint}" if endpoint.startswith("/") else endpoint
101
327
 
102
- try:
103
- response = self._session.get(url, params=params)
104
- response.raise_for_status()
105
-
106
- return response
107
- except Exception as e:
108
- self._handle_error(e, f"GET {endpoint}: ")
328
+ logger.debug(f"GET request initiated | endpoint={endpoint} url={url} params={params}")
329
+ log_request("GET", url, params=params)
330
+
331
+ # Create retry wrapper for this specific call
332
+ retryable_exceptions = (RateLimitError, CloudflareBlockError, ConnectionError, TimeoutError)
333
+
334
+ @create_retry_decorator(self._retry_config, retryable_exceptions, self._on_retry)
335
+ def _do_get() -> CurlResponse:
336
+ self._throttle()
337
+
338
+ request_start = monotonic()
339
+ logger.debug(f"Executing GET request | url={url}")
340
+
341
+ try:
342
+ response = self._session.get(url, params=params)
343
+ elapsed_ms = (monotonic() - request_start) * 1000
344
+
345
+ logger.debug(
346
+ f"GET response received | "
347
+ f"status_code={response.status_code} "
348
+ f"elapsed_ms={elapsed_ms:.2f} "
349
+ f"content_length={len(response.content) if hasattr(response, 'content') else 'unknown'}"
350
+ )
351
+ log_response(
352
+ "GET",
353
+ url,
354
+ response.status_code,
355
+ elapsed_ms=elapsed_ms,
356
+ content_length=len(response.content) if hasattr(response, "content") else None,
357
+ )
358
+
359
+ self._check_cloudflare(response)
360
+ response.raise_for_status()
361
+
362
+ logger.debug(f"GET request successful | endpoint={endpoint}")
363
+ return response
364
+ except Exception as error:
365
+ elapsed_ms = (monotonic() - request_start) * 1000
366
+ logger.debug(
367
+ f"GET request failed | "
368
+ f"endpoint={endpoint} "
369
+ f"elapsed_ms={elapsed_ms:.2f} "
370
+ f"error_type={type(error).__name__} "
371
+ f"error={error}"
372
+ )
373
+
374
+ if isinstance(error, (CloudflareBlockError, RateLimitError)):
375
+ raise
376
+
377
+ self._handle_error(error, f"GET {endpoint}: ")
378
+
379
+ # Never reached but satisfies type checker
380
+ raise error
381
+
382
+ return _do_get()
109
383
 
110
384
  def post(
111
385
  self,
@@ -113,7 +387,7 @@ class HTTPClient:
113
387
  json: dict[str, Any] | None = None,
114
388
  stream: bool = False,
115
389
  ) -> CurlResponse:
116
- """Make a POST request.
390
+ """Make a POST request with retry and rate limiting.
117
391
 
118
392
  Args:
119
393
  endpoint: The API endpoint (relative to BASE_URL).
@@ -126,18 +400,62 @@ class HTTPClient:
126
400
  Raises:
127
401
  AuthenticationError: If session token is invalid.
128
402
  RateLimitError: If rate limit is exceeded.
403
+ CloudflareBlockError: If Cloudflare blocks the request.
129
404
  PerplexityError: For other errors.
130
405
  """
131
406
 
132
407
  url = f"{API_BASE_URL}{endpoint}" if endpoint.startswith("/") else endpoint
408
+ body_size = len(str(json)) if json else 0
133
409
 
134
- try:
135
- response = self._session.post(url, json=json, stream=stream)
136
- response.raise_for_status()
410
+ logger.debug(f"POST request initiated | endpoint={endpoint} url={url} stream={stream} body_size={body_size}")
411
+ log_request("POST", url, body_size=body_size)
412
+
413
+ retryable_exceptions = (RateLimitError, CloudflareBlockError, ConnectionError, TimeoutError)
414
+
415
+ @create_retry_decorator(self._retry_config, retryable_exceptions, self._on_retry)
416
+ def _do_post() -> CurlResponse:
417
+ self._throttle()
418
+
419
+ request_start = monotonic()
420
+ logger.debug(f"Executing POST request | url={url} stream={stream}")
421
+
422
+ try:
423
+ response = self._session.post(url, json=json, stream=stream)
424
+ elapsed_ms = (monotonic() - request_start) * 1000
425
+
426
+ logger.debug(
427
+ f"POST response received | "
428
+ f"status_code={response.status_code} "
429
+ f"elapsed_ms={elapsed_ms:.2f} "
430
+ f"stream={stream}"
431
+ )
432
+ log_response("POST", url, response.status_code, elapsed_ms=elapsed_ms)
433
+
434
+ self._check_cloudflare(response)
435
+ response.raise_for_status()
436
+
437
+ logger.debug(f"POST request successful | endpoint={endpoint}")
137
438
 
138
- return response
139
- except Exception as e:
140
- self._handle_error(e, f"POST {endpoint}: ")
439
+ return response
440
+ except Exception as error:
441
+ elapsed_ms = (monotonic() - request_start) * 1000
442
+ logger.debug(
443
+ f"POST request failed | "
444
+ f"endpoint={endpoint} "
445
+ f"elapsed_ms={elapsed_ms:.2f} "
446
+ f"error_type={type(error).__name__} "
447
+ f"error={error}"
448
+ )
449
+
450
+ if isinstance(error, (CloudflareBlockError, RateLimitError)):
451
+ raise error
452
+
453
+ self._handle_error(error, f"POST {endpoint}: ")
454
+
455
+ # Never reached but satisfies type checker
456
+ raise error
457
+
458
+ return _do_post()
141
459
 
142
460
  def stream_lines(self, endpoint: str, json: dict[str, Any]) -> Generator[bytes, None, None]:
143
461
  """Make a streaming POST request and yield lines.
@@ -152,15 +470,26 @@ class HTTPClient:
152
470
  Raises:
153
471
  AuthenticationError: If session token is invalid.
154
472
  RateLimitError: If rate limit is exceeded.
473
+ CloudflareBlockError: If Cloudflare blocks the request.
155
474
  PerplexityError: For other errors.
156
475
  """
157
476
 
477
+ logger.debug(f"Starting streaming request | endpoint={endpoint}")
478
+
158
479
  response = self.post(endpoint, json=json, stream=True)
480
+ lines_count = 0
159
481
 
160
482
  try:
161
- yield from response.iter_lines()
483
+ logger.debug("Iterating stream lines")
484
+
485
+ for line in response.iter_lines():
486
+ lines_count += 1
487
+ yield line
488
+
489
+ logger.debug(f"Stream completed | total_lines={lines_count}")
162
490
  finally:
163
491
  response.close()
492
+ logger.debug(f"Stream response closed | lines_yielded={lines_count}")
164
493
 
165
494
  def init_search(self, query: str) -> None:
166
495
  """Initialize a search session.
@@ -171,7 +500,9 @@ class HTTPClient:
171
500
  query: The search query.
172
501
  """
173
502
 
503
+ logger.debug(f"Initializing search session | query_length={len(query)} query_preview={query[:50]}...")
174
504
  self.get(ENDPOINT_SEARCH_INIT, params={"q": query})
505
+ logger.debug("Search session initialized successfully")
175
506
 
176
507
  def stream_ask(self, payload: dict[str, Any]) -> Generator[bytes, None, None]:
177
508
  """Stream a prompt request to the ask endpoint.
@@ -183,15 +514,20 @@ class HTTPClient:
183
514
  Response lines as bytes.
184
515
  """
185
516
 
517
+ logger.debug(f"Streaming ask request | payload_keys={list(payload.keys())}")
186
518
  yield from self.stream_lines(ENDPOINT_ASK, json=payload)
187
519
 
188
520
def close(self) -> None:
    """
    Close the current HTTP session, logging before and after.
    """

    logger.debug("Closing HTTP client")

    active_session = self._session
    active_session.close()

    logger.debug("HTTP client closed successfully")
192
526
 
193
527
def __enter__(self) -> HTTPClient:
    """
    Enter the context manager and return this client unchanged.
    """

    logger.debug("Entering HTTPClient context manager")

    return self
195
530
 
196
531
def __exit__(self, *args: Any) -> None:
    """
    Leave the context manager by closing the client.

    The exception details passed in ``args`` are ignored and nothing is
    returned, so an exception raised inside the ``with`` body propagates.
    """

    logger.debug("Exiting HTTPClient context manager")

    self.close()
@@ -1,4 +1,6 @@
1
- """Upload and request limits for Perplexity WebUI Scraper."""
1
+ """
2
+ Upload and request limits for Perplexity WebUI Scraper.
3
+ """
2
4
 
3
5
  from __future__ import annotations
4
6
 
@@ -7,11 +9,17 @@ from typing import Final
7
9
 
8
10
  # File Upload Limits
9
11
  MAX_FILES: Final[int] = 30
10
- """Maximum number of files that can be attached to a single prompt."""
12
+ """
13
+ Maximum number of files that can be attached to a single prompt.
14
+ """
11
15
 
12
16
  MAX_FILE_SIZE: Final[int] = 50 * 1024 * 1024 # 50 MB in bytes
13
- """Maximum file size in bytes."""
17
+ """
18
+ Maximum file size in bytes.
19
+ """
14
20
 
15
21
  # Request Limits
16
22
  DEFAULT_TIMEOUT: Final[int] = 30 * 60 # 30 minutes in seconds
17
- """Default request timeout in seconds"""
23
+ """
24
+ Default request timeout in seconds.
25
+ """