perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the respective public registry.
@@ -4,33 +4,22 @@ from importlib import metadata
4
4
 
5
5
  from .config import ClientConfig, ConversationConfig
6
6
  from .core import Conversation, Perplexity
7
- from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
8
- from .exceptions import (
9
- AuthenticationError,
10
- FileUploadError,
11
- FileValidationError,
12
- PerplexityError,
13
- RateLimitError,
14
- )
7
+ from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
15
8
  from .models import Model, Models
16
9
  from .types import Coordinates, Response, SearchResultItem
17
10
 
18
11
 
19
12
  __version__: str = metadata.version("perplexity-webui-scraper")
20
13
  __all__: list[str] = [
21
- "AuthenticationError",
22
14
  "CitationMode",
23
15
  "ClientConfig",
24
16
  "Conversation",
25
17
  "ConversationConfig",
26
18
  "Coordinates",
27
- "FileUploadError",
28
- "FileValidationError",
19
+ "LogLevel",
29
20
  "Model",
30
21
  "Models",
31
22
  "Perplexity",
32
- "PerplexityError",
33
- "RateLimitError",
34
23
  "Response",
35
24
  "SearchFocus",
36
25
  "SearchResultItem",
@@ -5,10 +5,12 @@ from __future__ import annotations
5
5
  from dataclasses import dataclass
6
6
  from typing import TYPE_CHECKING
7
7
 
8
- from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
8
+ from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
9
9
 
10
10
 
11
11
  if TYPE_CHECKING:
12
+ from pathlib import Path
13
+
12
14
  from .models import Model
13
15
  from .types import Coordinates
14
16
 
@@ -30,7 +32,30 @@ class ConversationConfig:
30
32
 
31
33
  @dataclass(frozen=True, slots=True)
32
34
  class ClientConfig:
33
- """HTTP client settings."""
35
+ """
36
+ HTTP client settings.
37
+
38
+ Attributes:
39
+ timeout: Request timeout in seconds.
40
+ impersonate: Browser to impersonate (e.g., "chrome", "edge", "safari").
41
+ max_retries: Maximum retry attempts for failed requests.
42
+ retry_base_delay: Initial delay in seconds before first retry.
43
+ retry_max_delay: Maximum delay between retries.
44
+ retry_jitter: Random jitter factor (0-1) to add to delays.
45
+ requests_per_second: Rate limit for requests (0 to disable).
46
+ rotate_fingerprint: Whether to rotate browser fingerprint on retries.
47
+ logging_level: Logging verbosity level. Default is DISABLED.
48
+ log_file: Optional file path for persistent logging. If set, logs go to file only.
49
+ If None, logs go to console. All logs are appended.
50
+ """
34
51
 
35
52
  timeout: int = 3600
36
53
  impersonate: str = "chrome"
54
+ max_retries: int = 3
55
+ retry_base_delay: float = 1.0
56
+ retry_max_delay: float = 60.0
57
+ retry_jitter: float = 0.5
58
+ requests_per_second: float = 0.5
59
+ rotate_fingerprint: bool = True
60
+ logging_level: LogLevel = LogLevel.DISABLED
61
+ log_file: str | Path | None = None
@@ -26,20 +26,25 @@ from .constants import (
26
26
  USE_SCHEMATIZED_API,
27
27
  )
28
28
  from .enums import CitationMode
29
- from .exceptions import FileUploadError, FileValidationError
29
+ from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
30
30
  from .http import HTTPClient
31
31
  from .limits import MAX_FILE_SIZE, MAX_FILES
32
+ from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
32
33
  from .models import Model, Models
33
34
  from .types import Response, SearchResultItem, _FileInfo
34
35
 
35
36
 
37
+ logger = get_logger(__name__)
38
+
39
+
36
40
  class Perplexity:
37
41
  """Web scraper for Perplexity AI conversations."""
38
42
 
39
43
  __slots__ = ("_http",)
40
44
 
41
45
  def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
42
- """Initialize web scraper with session token.
46
+ """
47
+ Initialize web scraper with session token.
43
48
 
44
49
  Args:
45
50
  session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,17 +58,71 @@ class Perplexity:
53
58
  raise ValueError("session_token cannot be empty")
54
59
 
55
60
  cfg = config or ClientConfig()
56
- self._http = HTTPClient(session_token, timeout=cfg.timeout, impersonate=cfg.impersonate)
61
+
62
+ # Configure logging based on config
63
+ configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
64
+
65
+ logger.info(
66
+ "Perplexity client initializing | "
67
+ f"session_token_length={len(session_token)} "
68
+ f"logging_level={cfg.logging_level.value} "
69
+ f"log_file={cfg.log_file}"
70
+ )
71
+ logger.debug(
72
+ "Client configuration | "
73
+ f"timeout={cfg.timeout}s "
74
+ f"impersonate={cfg.impersonate} "
75
+ f"max_retries={cfg.max_retries} "
76
+ f"retry_base_delay={cfg.retry_base_delay}s "
77
+ f"retry_max_delay={cfg.retry_max_delay}s "
78
+ f"retry_jitter={cfg.retry_jitter} "
79
+ f"requests_per_second={cfg.requests_per_second} "
80
+ f"rotate_fingerprint={cfg.rotate_fingerprint}"
81
+ )
82
+
83
+ self._http = HTTPClient(
84
+ session_token,
85
+ timeout=cfg.timeout,
86
+ impersonate=cfg.impersonate,
87
+ max_retries=cfg.max_retries,
88
+ retry_base_delay=cfg.retry_base_delay,
89
+ retry_max_delay=cfg.retry_max_delay,
90
+ retry_jitter=cfg.retry_jitter,
91
+ requests_per_second=cfg.requests_per_second,
92
+ rotate_fingerprint=cfg.rotate_fingerprint,
93
+ )
94
+
95
+ logger.info("Perplexity client initialized successfully")
57
96
 
58
97
  def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
59
98
  """Create a new conversation."""
60
99
 
61
- return Conversation(self._http, config or ConversationConfig())
100
+ cfg = config or ConversationConfig()
101
+ logger.debug(
102
+ "Creating conversation | "
103
+ f"model={cfg.model} "
104
+ f"citation_mode={cfg.citation_mode} "
105
+ f"save_to_library={cfg.save_to_library} "
106
+ f"search_focus={cfg.search_focus} "
107
+ f"language={cfg.language}"
108
+ )
109
+
110
+ conversation = Conversation(self._http, cfg)
111
+
112
+ log_conversation_created(
113
+ f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
114
+ f"search_focus={cfg.search_focus}, language={cfg.language}"
115
+ )
116
+ logger.info("Conversation created successfully")
117
+
118
+ return conversation
62
119
 
63
120
  def close(self) -> None:
64
121
  """Close the client."""
65
122
 
123
+ logger.debug("Closing Perplexity client")
66
124
  self._http.close()
125
+ logger.info("Perplexity client closed")
67
126
 
68
127
  def __enter__(self) -> Perplexity:
69
128
  return self
@@ -90,6 +149,13 @@ class Conversation:
90
149
  )
91
150
 
92
151
  def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
152
+ logger.debug(
153
+ "Conversation.__init__ | "
154
+ f"model={config.model} "
155
+ f"citation_mode={config.citation_mode} "
156
+ f"save_to_library={config.save_to_library} "
157
+ f"search_focus={config.search_focus}"
158
+ )
93
159
  self._http = http
94
160
  self._config = config
95
161
  self._citation_mode = CitationMode.DEFAULT
@@ -101,6 +167,7 @@ class Conversation:
101
167
  self._search_results: list[SearchResultItem] = []
102
168
  self._raw_data: dict[str, Any] = {}
103
169
  self._stream_generator: Generator[Response, None, None] | None = None
170
+ logger.debug("Conversation initialized with empty state")
104
171
 
105
172
  @property
106
173
  def answer(self) -> str | None:
@@ -142,11 +209,29 @@ class Conversation:
142
209
  ) -> Conversation:
143
210
  """Ask a question. Returns self for method chaining or streaming iteration."""
144
211
 
212
+ logger.info(
213
+ "Conversation.ask called | "
214
+ f"query_length={len(query)} "
215
+ f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
216
+ f"model={model} "
217
+ f"files_count={len(files) if files else 0} "
218
+ f"citation_mode={citation_mode} "
219
+ f"stream={stream}"
220
+ )
221
+
145
222
  effective_model = model or self._config.model or Models.BEST
146
223
  effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
147
224
  self._citation_mode = effective_citation
225
+
226
+ logger.debug(
227
+ f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
228
+ )
229
+
230
+ log_query_sent(query, str(effective_model), bool(files))
148
231
  self._execute(query, effective_model, files, stream=stream)
149
232
 
233
+ logger.debug("Query execution completed")
234
+
150
235
  return self
151
236
 
152
237
  def _execute(
@@ -158,22 +243,49 @@ class Conversation:
158
243
  ) -> None:
159
244
  """Execute a query."""
160
245
 
246
+ logger.debug(
247
+ f"Executing query | "
248
+ f"query_length={len(query)} "
249
+ f"model={model} "
250
+ f"files_count={len(files) if files else 0} "
251
+ f"stream={stream} "
252
+ f"is_followup={self._backend_uuid is not None}"
253
+ )
254
+
161
255
  self._reset_response_state()
256
+ logger.debug("Response state reset")
162
257
 
163
258
  # Upload files
164
259
  file_urls: list[str] = []
165
260
 
166
261
  if files:
262
+ logger.debug(f"Validating {len(files)} files")
167
263
  validated = self._validate_files(files)
264
+ logger.debug(f"Validated {len(validated)} files, uploading...")
168
265
  file_urls = [self._upload_file(f) for f in validated]
266
+ logger.debug(f"Uploaded {len(file_urls)} files successfully")
169
267
 
170
268
  payload = self._build_payload(query, model, file_urls)
269
+ logger.debug(
270
+ f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
271
+ )
272
+
273
+ logger.debug("Initializing search session")
171
274
  self._http.init_search(query)
172
275
 
173
276
  if stream:
277
+ logger.debug("Starting streaming mode")
174
278
  self._stream_generator = self._stream(payload)
175
279
  else:
280
+ logger.debug("Starting complete mode (non-streaming)")
176
281
  self._complete(payload)
282
+ logger.debug(
283
+ f"Query completed | "
284
+ f"title={self._title} "
285
+ f"answer_length={len(self._answer) if self._answer else 0} "
286
+ f"chunks_count={len(self._chunks)} "
287
+ f"search_results_count={len(self._search_results)}"
288
+ )
177
289
 
178
290
  def _reset_response_state(self) -> None:
179
291
  self._title = None
@@ -237,8 +349,8 @@ class Conversation:
237
349
  is_image=mimetype.startswith("image/"),
238
350
  )
239
351
  )
240
- except FileValidationError:
241
- raise
352
+ except FileValidationError as error:
353
+ raise error
242
354
  except (FileNotFoundError, PermissionError) as error:
243
355
  raise FileValidationError(file_path, f"Cannot access file: {error}") from error
244
356
  except OSError as error:
@@ -356,12 +468,17 @@ class Conversation:
356
468
  return None
357
469
 
358
470
  def _process_data(self, data: dict[str, Any]) -> None:
471
+ """Process SSE data chunk and update conversation state."""
472
+
359
473
  if self._backend_uuid is None and "backend_uuid" in data:
360
474
  self._backend_uuid = data["backend_uuid"]
361
475
 
362
476
  if self._read_write_token is None and "read_write_token" in data:
363
477
  self._read_write_token = data["read_write_token"]
364
478
 
479
+ if self._title is None and "thread_title" in data:
480
+ self._title = data["thread_title"]
481
+
365
482
  if "blocks" in data:
366
483
  for block in data["blocks"]:
367
484
  if block.get("intended_usage") == "web_results":
@@ -385,7 +502,15 @@ class Conversation:
385
502
 
386
503
  if isinstance(json_data, list):
387
504
  for item in json_data:
388
- if item.get("step_type") == "FINAL":
505
+ step_type = item.get("step_type")
506
+
507
+ # Handle Research mode clarifying questions
508
+ if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
509
+ questions = self._extract_clarifying_questions(item)
510
+
511
+ raise ResearchClarifyingQuestionsError(questions)
512
+
513
+ if step_type == "FINAL":
389
514
  raw_content = item.get("content", {})
390
515
  answer_content = raw_content.get("answer")
391
516
 
@@ -400,7 +525,39 @@ class Conversation:
400
525
  elif isinstance(json_data, dict):
401
526
  self._update_state(data.get("thread_title"), json_data)
402
527
  else:
403
- raise ValueError("Unexpected JSON structure in 'text' field")
528
+ raise ResponseParsingError(
529
+ "Unexpected JSON structure in 'text' field",
530
+ raw_data=str(json_data),
531
+ )
532
+
533
+ def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
534
+ """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
535
+
536
+ questions: list[str] = []
537
+ content = item.get("content", {})
538
+
539
+ # Try different possible structures for questions
540
+ if isinstance(content, dict):
541
+ if "questions" in content:
542
+ raw_questions = content["questions"]
543
+
544
+ if isinstance(raw_questions, list):
545
+ questions = [str(q) for q in raw_questions if q]
546
+ elif "clarifying_questions" in content:
547
+ raw_questions = content["clarifying_questions"]
548
+
549
+ if isinstance(raw_questions, list):
550
+ questions = [str(q) for q in raw_questions if q]
551
+ elif not questions:
552
+ for value in content.values():
553
+ if isinstance(value, str) and "?" in value:
554
+ questions.append(value)
555
+ elif isinstance(content, list):
556
+ questions = [str(q) for q in content if q]
557
+ elif isinstance(content, str):
558
+ questions = [content]
559
+
560
+ return questions
404
561
 
405
562
  def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
406
563
  self._title = title
@@ -426,7 +583,7 @@ class Conversation:
426
583
  chunks = answer_data.get("chunks", [])
427
584
 
428
585
  if chunks:
429
- self._chunks = chunks
586
+ self._chunks = [self._format_citations(chunk) for chunk in chunks]
430
587
 
431
588
  self._raw_data = answer_data
432
589
 
@@ -6,7 +6,8 @@ from enum import Enum
6
6
 
7
7
 
8
8
  class CitationMode(str, Enum):
9
- """Citation formatting modes for response text.
9
+ """
10
+ Citation formatting modes for response text.
10
11
 
11
12
  Controls how citation markers (e.g., [1], [2]) are formatted in the response.
12
13
  """
@@ -22,7 +23,8 @@ class CitationMode(str, Enum):
22
23
 
23
24
 
24
25
  class SearchFocus(str, Enum):
25
- """Search focus types that control the type of search performed.
26
+ """
27
+ Search focus types that control the type of search performed.
26
28
 
27
29
  Determines whether to search the web or focus on writing tasks.
28
30
  """
@@ -35,7 +37,8 @@ class SearchFocus(str, Enum):
35
37
 
36
38
 
37
39
  class SourceFocus(str, Enum):
38
- """Source focus types that control which sources to prioritize.
40
+ """
41
+ Source focus types that control which sources to prioritize.
39
42
 
40
43
  Can be combined (e.g., [SourceFocus.WEB, SourceFocus.ACADEMIC]) for multi-source searches.
41
44
  """
@@ -54,7 +57,8 @@ class SourceFocus(str, Enum):
54
57
 
55
58
 
56
59
  class TimeRange(str, Enum):
57
- """Time range filters for search results.
60
+ """
61
+ Time range filters for search results.
58
62
 
59
63
  Controls how recent the sources should be.
60
64
  """
@@ -73,3 +77,29 @@ class TimeRange(str, Enum):
73
77
 
74
78
  LAST_YEAR = "YEAR"
75
79
  """Include sources from the last 365 days."""
80
+
81
+
82
+ class LogLevel(str, Enum):
83
+ """
84
+ Logging level configuration.
85
+
86
+ Controls the verbosity of logging output. DISABLED is the default.
87
+ """
88
+
89
+ DISABLED = "DISABLED"
90
+ """Completely disable all logging output. This is the default."""
91
+
92
+ DEBUG = "DEBUG"
93
+ """Show all messages including internal debug information."""
94
+
95
+ INFO = "INFO"
96
+ """Show informational messages, warnings, and errors."""
97
+
98
+ WARNING = "WARNING"
99
+ """Show only warnings and errors."""
100
+
101
+ ERROR = "ERROR"
102
+ """Show only error messages."""
103
+
104
+ CRITICAL = "CRITICAL"
105
+ """Show only critical/fatal errors."""
@@ -3,6 +3,19 @@
3
3
  from __future__ import annotations
4
4
 
5
5
 
6
+ __all__: list[str] = [
7
+ "AuthenticationError",
8
+ "CloudflareBlockError",
9
+ "FileUploadError",
10
+ "FileValidationError",
11
+ "PerplexityError",
12
+ "RateLimitError",
13
+ "ResearchClarifyingQuestionsError",
14
+ "ResponseParsingError",
15
+ "StreamingError",
16
+ ]
17
+
18
+
6
19
  class PerplexityError(Exception):
7
20
  """Base exception for all Perplexity-related errors."""
8
21
 
@@ -34,6 +47,25 @@ class RateLimitError(PerplexityError):
34
47
  )
35
48
 
36
49
 
50
+ class CloudflareBlockError(PerplexityError):
51
+ """
52
+ Raised when Cloudflare blocks the request with a challenge page.
53
+
54
+ This typically means the request triggered Cloudflare's bot detection.
55
+ The client will automatically retry with fingerprint rotation, but if
56
+ this exception is raised, all retry attempts have failed.
57
+ """
58
+
59
+ def __init__(self, message: str | None = None) -> None:
60
+ super().__init__(
61
+ message
62
+ or "Cloudflare challenge detected. The request was blocked by Cloudflare's "
63
+ "bot protection. Try waiting a few minutes before retrying, or obtain a "
64
+ "fresh session token.",
65
+ status_code=403,
66
+ )
67
+
68
+
37
69
  class FileUploadError(PerplexityError):
38
70
  """Raised when file upload fails."""
39
71
 
@@ -48,3 +80,45 @@ class FileValidationError(PerplexityError):
48
80
  def __init__(self, file_path: str, reason: str) -> None:
49
81
  self.file_path = file_path
50
82
  super().__init__(f"File validation failed for '{file_path}': {reason}")
83
+
84
+
85
+ class ResearchClarifyingQuestionsError(PerplexityError):
86
+ """
87
+ Raised when Research mode requires clarifying questions.
88
+
89
+ This library does not support programmatic interaction with clarifying questions.
90
+ Consider rephrasing your query to be more specific.
91
+
92
+ Attributes:
93
+ questions: List of clarifying questions from the API.
94
+ """
95
+
96
+ def __init__(self, questions: list[str]) -> None:
97
+ self.questions = questions
98
+ questions_text = "\n".join(f" - {q}" for q in questions) if questions else " (no questions provided)"
99
+
100
+ super().__init__(
101
+ f"Research mode is asking clarifying questions:\n{questions_text}\n\n"
102
+ "Programmatic interaction with clarifying questions is not supported. "
103
+ "Please rephrase your query to be more specific."
104
+ )
105
+
106
+
107
+ class ResponseParsingError(PerplexityError):
108
+ """
109
+ Raised when the API response cannot be parsed.
110
+
111
+ Attributes:
112
+ raw_data: The raw data that failed to parse.
113
+ """
114
+
115
+ def __init__(self, message: str, raw_data: str | None = None) -> None:
116
+ self.raw_data = raw_data
117
+ super().__init__(f"Failed to parse API response: {message}")
118
+
119
+
120
+ class StreamingError(PerplexityError):
121
+ """Raised when an error occurs during streaming."""
122
+
123
+ def __init__(self, message: str) -> None:
124
+ super().__init__(f"Streaming error: {message}")