perplexity-webui-scraper 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ from pathlib import Path
8
8
  from typing import TYPE_CHECKING, Any
9
9
  from uuid import uuid4
10
10
 
11
- from orjson import loads
11
+ from orjson import JSONDecodeError, loads
12
12
 
13
13
 
14
14
  if TYPE_CHECKING:
@@ -26,20 +26,25 @@ from .constants import (
26
26
  USE_SCHEMATIZED_API,
27
27
  )
28
28
  from .enums import CitationMode
29
- from .exceptions import FileUploadError, FileValidationError
29
+ from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
30
30
  from .http import HTTPClient
31
31
  from .limits import MAX_FILE_SIZE, MAX_FILES
32
+ from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
32
33
  from .models import Model, Models
33
34
  from .types import Response, SearchResultItem, _FileInfo
34
35
 
35
36
 
37
+ logger = get_logger(__name__)
38
+
39
+
36
40
  class Perplexity:
37
41
  """Web scraper for Perplexity AI conversations."""
38
42
 
39
43
  __slots__ = ("_http",)
40
44
 
41
45
  def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
42
- """Initialize web scraper with session token.
46
+ """
47
+ Initialize web scraper with session token.
43
48
 
44
49
  Args:
45
50
  session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,15 +58,71 @@ class Perplexity:
53
58
  raise ValueError("session_token cannot be empty")
54
59
 
55
60
  cfg = config or ClientConfig()
56
- self._http = HTTPClient(session_token, timeout=cfg.timeout, impersonate=cfg.impersonate)
61
+
62
+ # Configure logging based on config
63
+ configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
64
+
65
+ logger.info(
66
+ "Perplexity client initializing | "
67
+ f"session_token_length={len(session_token)} "
68
+ f"logging_level={cfg.logging_level.value} "
69
+ f"log_file={cfg.log_file}"
70
+ )
71
+ logger.debug(
72
+ "Client configuration | "
73
+ f"timeout={cfg.timeout}s "
74
+ f"impersonate={cfg.impersonate} "
75
+ f"max_retries={cfg.max_retries} "
76
+ f"retry_base_delay={cfg.retry_base_delay}s "
77
+ f"retry_max_delay={cfg.retry_max_delay}s "
78
+ f"retry_jitter={cfg.retry_jitter} "
79
+ f"requests_per_second={cfg.requests_per_second} "
80
+ f"rotate_fingerprint={cfg.rotate_fingerprint}"
81
+ )
82
+
83
+ self._http = HTTPClient(
84
+ session_token,
85
+ timeout=cfg.timeout,
86
+ impersonate=cfg.impersonate,
87
+ max_retries=cfg.max_retries,
88
+ retry_base_delay=cfg.retry_base_delay,
89
+ retry_max_delay=cfg.retry_max_delay,
90
+ retry_jitter=cfg.retry_jitter,
91
+ requests_per_second=cfg.requests_per_second,
92
+ rotate_fingerprint=cfg.rotate_fingerprint,
93
+ )
94
+
95
+ logger.info("Perplexity client initialized successfully")
57
96
 
58
97
  def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
59
98
  """Create a new conversation."""
60
- return Conversation(self._http, config or ConversationConfig())
99
+
100
+ cfg = config or ConversationConfig()
101
+ logger.debug(
102
+ "Creating conversation | "
103
+ f"model={cfg.model} "
104
+ f"citation_mode={cfg.citation_mode} "
105
+ f"save_to_library={cfg.save_to_library} "
106
+ f"search_focus={cfg.search_focus} "
107
+ f"language={cfg.language}"
108
+ )
109
+
110
+ conversation = Conversation(self._http, cfg)
111
+
112
+ log_conversation_created(
113
+ f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
114
+ f"search_focus={cfg.search_focus}, language={cfg.language}"
115
+ )
116
+ logger.info("Conversation created successfully")
117
+
118
+ return conversation
61
119
 
62
120
  def close(self) -> None:
63
121
  """Close the client."""
122
+
123
+ logger.debug("Closing Perplexity client")
64
124
  self._http.close()
125
+ logger.info("Perplexity client closed")
65
126
 
66
127
  def __enter__(self) -> Perplexity:
67
128
  return self
@@ -88,6 +149,13 @@ class Conversation:
88
149
  )
89
150
 
90
151
  def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
152
+ logger.debug(
153
+ "Conversation.__init__ | "
154
+ f"model={config.model} "
155
+ f"citation_mode={config.citation_mode} "
156
+ f"save_to_library={config.save_to_library} "
157
+ f"search_focus={config.search_focus}"
158
+ )
91
159
  self._http = http
92
160
  self._config = config
93
161
  self._citation_mode = CitationMode.DEFAULT
@@ -99,70 +167,125 @@ class Conversation:
99
167
  self._search_results: list[SearchResultItem] = []
100
168
  self._raw_data: dict[str, Any] = {}
101
169
  self._stream_generator: Generator[Response, None, None] | None = None
170
+ logger.debug("Conversation initialized with empty state")
102
171
 
103
172
  @property
104
173
  def answer(self) -> str | None:
105
174
  """Last response text."""
175
+
106
176
  return self._answer
107
177
 
108
178
  @property
109
179
  def title(self) -> str | None:
110
180
  """Conversation title."""
181
+
111
182
  return self._title
112
183
 
113
184
  @property
114
185
  def search_results(self) -> list[SearchResultItem]:
115
186
  """Search results from last response."""
187
+
116
188
  return self._search_results
117
189
 
118
190
  @property
119
191
  def uuid(self) -> str | None:
120
192
  """Conversation UUID."""
193
+
121
194
  return self._backend_uuid
122
195
 
123
196
  def __iter__(self) -> Generator[Response, None, None]:
124
197
  if self._stream_generator is not None:
125
198
  yield from self._stream_generator
199
+
126
200
  self._stream_generator = None
127
201
 
128
202
  def ask(
129
203
  self,
130
204
  query: str,
131
205
  model: Model | None = None,
132
- files: list[str | PathLike[str]] | None = None,
206
+ files: list[str | PathLike] | None = None,
133
207
  citation_mode: CitationMode | None = None,
134
208
  stream: bool = False,
135
209
  ) -> Conversation:
136
210
  """Ask a question. Returns self for method chaining or streaming iteration."""
211
+
212
+ logger.info(
213
+ "Conversation.ask called | "
214
+ f"query_length={len(query)} "
215
+ f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
216
+ f"model={model} "
217
+ f"files_count={len(files) if files else 0} "
218
+ f"citation_mode={citation_mode} "
219
+ f"stream={stream}"
220
+ )
221
+
137
222
  effective_model = model or self._config.model or Models.BEST
138
223
  effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
139
224
  self._citation_mode = effective_citation
225
+
226
+ logger.debug(
227
+ f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
228
+ )
229
+
230
+ log_query_sent(query, str(effective_model), bool(files))
140
231
  self._execute(query, effective_model, files, stream=stream)
232
+
233
+ logger.debug("Query execution completed")
234
+
141
235
  return self
142
236
 
143
237
  def _execute(
144
238
  self,
145
239
  query: str,
146
240
  model: Model,
147
- files: list[str | PathLike[str]] | None,
241
+ files: list[str | PathLike] | None,
148
242
  stream: bool = False,
149
243
  ) -> None:
150
244
  """Execute a query."""
245
+
246
+ logger.debug(
247
+ f"Executing query | "
248
+ f"query_length={len(query)} "
249
+ f"model={model} "
250
+ f"files_count={len(files) if files else 0} "
251
+ f"stream={stream} "
252
+ f"is_followup={self._backend_uuid is not None}"
253
+ )
254
+
151
255
  self._reset_response_state()
256
+ logger.debug("Response state reset")
152
257
 
153
258
  # Upload files
154
259
  file_urls: list[str] = []
260
+
155
261
  if files:
262
+ logger.debug(f"Validating {len(files)} files")
156
263
  validated = self._validate_files(files)
264
+ logger.debug(f"Validated {len(validated)} files, uploading...")
157
265
  file_urls = [self._upload_file(f) for f in validated]
266
+ logger.debug(f"Uploaded {len(file_urls)} files successfully")
158
267
 
159
268
  payload = self._build_payload(query, model, file_urls)
269
+ logger.debug(
270
+ f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
271
+ )
272
+
273
+ logger.debug("Initializing search session")
160
274
  self._http.init_search(query)
161
275
 
162
276
  if stream:
277
+ logger.debug("Starting streaming mode")
163
278
  self._stream_generator = self._stream(payload)
164
279
  else:
280
+ logger.debug("Starting complete mode (non-streaming)")
165
281
  self._complete(payload)
282
+ logger.debug(
283
+ f"Query completed | "
284
+ f"title={self._title} "
285
+ f"answer_length={len(self._answer) if self._answer else 0} "
286
+ f"chunks_count={len(self._chunks)} "
287
+ f"search_results_count={len(self._search_results)}"
288
+ )
166
289
 
167
290
  def _reset_response_state(self) -> None:
168
291
  self._title = None
@@ -172,15 +295,17 @@ class Conversation:
172
295
  self._raw_data = {}
173
296
  self._stream_generator = None
174
297
 
175
- def _validate_files(self, files: list[str | PathLike[str]] | None) -> list[_FileInfo]:
298
+ def _validate_files(self, files: list[str | PathLike] | None) -> list[_FileInfo]:
176
299
  if not files:
177
300
  return []
178
301
 
179
302
  seen: set[str] = set()
180
303
  file_list: list[Path] = []
304
+
181
305
  for item in files:
182
306
  if item and isinstance(item, (str, PathLike)):
183
307
  path = Path(item).resolve()
308
+
184
309
  if path.as_posix() not in seen:
185
310
  seen.add(path.as_posix())
186
311
  file_list.append(path)
@@ -203,11 +328,13 @@ class Conversation:
203
328
  raise FileValidationError(file_path, "Path is not a file")
204
329
 
205
330
  file_size = path.stat().st_size
331
+
206
332
  if file_size > MAX_FILE_SIZE:
207
333
  raise FileValidationError(
208
334
  file_path,
209
335
  f"File exceeds 50MB limit: {file_size / (1024 * 1024):.1f}MB",
210
336
  )
337
+
211
338
  if file_size == 0:
212
339
  raise FileValidationError(file_path, "File is empty")
213
340
 
@@ -222,12 +349,12 @@ class Conversation:
222
349
  is_image=mimetype.startswith("image/"),
223
350
  )
224
351
  )
225
- except FileValidationError:
226
- raise
227
- except (FileNotFoundError, PermissionError) as e:
228
- raise FileValidationError(file_path, f"Cannot access file: {e}") from e
229
- except OSError as e:
230
- raise FileValidationError(file_path, f"File system error: {e}") from e
352
+ except FileValidationError as error:
353
+ raise error
354
+ except (FileNotFoundError, PermissionError) as error:
355
+ raise FileValidationError(file_path, f"Cannot access file: {error}") from error
356
+ except OSError as error:
357
+ raise FileValidationError(file_path, f"File system error: {error}") from error
231
358
 
232
359
  return result
233
360
 
@@ -255,8 +382,8 @@ class Conversation:
255
382
  raise FileUploadError(file_info.path, "No upload URL returned")
256
383
 
257
384
  return upload_url
258
- except FileUploadError:
259
- raise
385
+ except FileUploadError as error:
386
+ raise error
260
387
  except Exception as e:
261
388
  raise FileUploadError(file_info.path, str(e)) from e
262
389
 
@@ -301,6 +428,7 @@ class Conversation:
301
428
  if self._backend_uuid is not None:
302
429
  params["last_backend_uuid"] = self._backend_uuid
303
430
  params["query_source"] = "followup"
431
+
304
432
  if self._read_write_token:
305
433
  params["read_write_token"] = self._read_write_token
306
434
 
@@ -312,6 +440,7 @@ class Conversation:
312
440
 
313
441
  def replacer(m: Match[str]) -> str:
314
442
  num = m.group(1)
443
+
315
444
  if not num.isdigit():
316
445
  return m.group(0)
317
446
 
@@ -319,8 +448,10 @@ class Conversation:
319
448
  return ""
320
449
 
321
450
  idx = int(num) - 1
451
+
322
452
  if 0 <= idx < len(self._search_results):
323
453
  url = self._search_results[idx].url or ""
454
+
324
455
  if self._citation_mode == CitationMode.MARKDOWN and url:
325
456
  return f"[{num}]({url})"
326
457
 
@@ -330,26 +461,56 @@ class Conversation:
330
461
 
331
462
  def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
332
463
  prefix = b"data: " if isinstance(line, bytes) else "data: "
464
+
333
465
  if (isinstance(line, bytes) and line.startswith(prefix)) or (isinstance(line, str) and line.startswith(prefix)):
334
466
  return loads(line[6:])
467
+
335
468
  return None
336
469
 
337
470
  def _process_data(self, data: dict[str, Any]) -> None:
471
+ """Process SSE data chunk and update conversation state."""
472
+
338
473
  if self._backend_uuid is None and "backend_uuid" in data:
339
474
  self._backend_uuid = data["backend_uuid"]
340
475
 
341
476
  if self._read_write_token is None and "read_write_token" in data:
342
477
  self._read_write_token = data["read_write_token"]
343
478
 
344
- if "text" not in data:
345
- return
479
+ if self._title is None and "thread_title" in data:
480
+ self._title = data["thread_title"]
481
+
482
+ if "blocks" in data:
483
+ for block in data["blocks"]:
484
+ if block.get("intended_usage") == "web_results":
485
+ diff = block.get("diff_block", {})
486
+
487
+ for patch in diff.get("patches", []):
488
+ if patch.get("op") == "replace" and patch.get("path") == "/web_results":
489
+ pass
490
+
491
+ if "text" not in data and "blocks" not in data:
492
+ return None
493
+
494
+ try:
495
+ json_data = loads(data["text"])
496
+ except KeyError as e:
497
+ raise ValueError("Missing 'text' field in data") from e
498
+ except JSONDecodeError as e:
499
+ raise ValueError("Invalid JSON in 'text' field") from e
346
500
 
347
- json_data = loads(data["text"])
348
501
  answer_data: dict[str, Any] = {}
349
502
 
350
503
  if isinstance(json_data, list):
351
504
  for item in json_data:
352
- if item.get("step_type") == "FINAL":
505
+ step_type = item.get("step_type")
506
+
507
+ # Handle Research mode clarifying questions
508
+ if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
509
+ questions = self._extract_clarifying_questions(item)
510
+
511
+ raise ResearchClarifyingQuestionsError(questions)
512
+
513
+ if step_type == "FINAL":
353
514
  raw_content = item.get("content", {})
354
515
  answer_content = raw_content.get("answer")
355
516
 
@@ -359,14 +520,50 @@ class Conversation:
359
520
  answer_data = raw_content
360
521
 
361
522
  self._update_state(data.get("thread_title"), answer_data)
523
+
362
524
  break
363
525
  elif isinstance(json_data, dict):
364
526
  self._update_state(data.get("thread_title"), json_data)
527
+ else:
528
+ raise ResponseParsingError(
529
+ "Unexpected JSON structure in 'text' field",
530
+ raw_data=str(json_data),
531
+ )
532
+
533
+ def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
534
+ """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
535
+
536
+ questions: list[str] = []
537
+ content = item.get("content", {})
538
+
539
+ # Try different possible structures for questions
540
+ if isinstance(content, dict):
541
+ if "questions" in content:
542
+ raw_questions = content["questions"]
543
+
544
+ if isinstance(raw_questions, list):
545
+ questions = [str(q) for q in raw_questions if q]
546
+ elif "clarifying_questions" in content:
547
+ raw_questions = content["clarifying_questions"]
548
+
549
+ if isinstance(raw_questions, list):
550
+ questions = [str(q) for q in raw_questions if q]
551
+ elif not questions:
552
+ for value in content.values():
553
+ if isinstance(value, str) and "?" in value:
554
+ questions.append(value)
555
+ elif isinstance(content, list):
556
+ questions = [str(q) for q in content if q]
557
+ elif isinstance(content, str):
558
+ questions = [content]
559
+
560
+ return questions
365
561
 
366
562
  def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
367
563
  self._title = title
368
564
 
369
565
  web_results = answer_data.get("web_results", [])
566
+
370
567
  if web_results:
371
568
  self._search_results = [
372
569
  SearchResultItem(
@@ -379,12 +576,14 @@ class Conversation:
379
576
  ]
380
577
 
381
578
  answer_text = answer_data.get("answer")
579
+
382
580
  if answer_text is not None:
383
581
  self._answer = self._format_citations(answer_text)
384
582
 
385
583
  chunks = answer_data.get("chunks", [])
584
+
386
585
  if chunks:
387
- self._chunks = chunks
586
+ self._chunks = [self._format_citations(chunk) for chunk in chunks]
388
587
 
389
588
  self._raw_data = answer_data
390
589
 
@@ -402,16 +601,21 @@ class Conversation:
402
601
  def _complete(self, payload: dict[str, Any]) -> None:
403
602
  for line in self._http.stream_ask(payload):
404
603
  data = self._parse_line(line)
604
+
405
605
  if data:
406
606
  self._process_data(data)
607
+
407
608
  if data.get("final"):
408
609
  break
409
610
 
410
611
  def _stream(self, payload: dict[str, Any]) -> Generator[Response, None, None]:
411
612
  for line in self._http.stream_ask(payload):
412
613
  data = self._parse_line(line)
614
+
413
615
  if data:
414
616
  self._process_data(data)
617
+
415
618
  yield self._build_response()
619
+
416
620
  if data.get("final"):
417
621
  break
@@ -6,7 +6,8 @@ from enum import Enum
6
6
 
7
7
 
8
8
  class CitationMode(str, Enum):
9
- """Citation formatting modes for response text.
9
+ """
10
+ Citation formatting modes for response text.
10
11
 
11
12
  Controls how citation markers (e.g., [1], [2]) are formatted in the response.
12
13
  """
@@ -22,7 +23,8 @@ class CitationMode(str, Enum):
22
23
 
23
24
 
24
25
  class SearchFocus(str, Enum):
25
- """Search focus types that control the type of search performed.
26
+ """
27
+ Search focus types that control the type of search performed.
26
28
 
27
29
  Determines whether to search the web or focus on writing tasks.
28
30
  """
@@ -35,7 +37,8 @@ class SearchFocus(str, Enum):
35
37
 
36
38
 
37
39
  class SourceFocus(str, Enum):
38
- """Source focus types that control which sources to prioritize.
40
+ """
41
+ Source focus types that control which sources to prioritize.
39
42
 
40
43
  Can be combined (e.g., [SourceFocus.WEB, SourceFocus.ACADEMIC]) for multi-source searches.
41
44
  """
@@ -54,7 +57,8 @@ class SourceFocus(str, Enum):
54
57
 
55
58
 
56
59
  class TimeRange(str, Enum):
57
- """Time range filters for search results.
60
+ """
61
+ Time range filters for search results.
58
62
 
59
63
  Controls how recent the sources should be.
60
64
  """
@@ -73,3 +77,29 @@ class TimeRange(str, Enum):
73
77
 
74
78
  LAST_YEAR = "YEAR"
75
79
  """Include sources from the last 365 days."""
80
+
81
+
82
class LogLevel(str, Enum):
    """
    Verbosity setting for the package's logging output.

    Every member's value is its own upper-case name, and because the enum
    mixes in ``str`` each member also compares equal to that string.
    DISABLED is the default.
    """

    DISABLED = "DISABLED"
    """Turn logging off entirely; nothing is emitted. This is the default."""

    DEBUG = "DEBUG"
    """Emit every message, internal debug information included."""

    INFO = "INFO"
    """Emit informational messages as well as warnings and errors."""

    WARNING = "WARNING"
    """Emit warnings and errors only."""

    ERROR = "ERROR"
    """Emit error messages only."""

    CRITICAL = "CRITICAL"
    """Emit critical/fatal errors only."""
@@ -3,6 +3,19 @@
3
3
  from __future__ import annotations
4
4
 
5
5
 
6
+ __all__: list[str] = [
7
+ "AuthenticationError",
8
+ "CloudflareBlockError",
9
+ "FileUploadError",
10
+ "FileValidationError",
11
+ "PerplexityError",
12
+ "RateLimitError",
13
+ "ResearchClarifyingQuestionsError",
14
+ "ResponseParsingError",
15
+ "StreamingError",
16
+ ]
17
+
18
+
6
19
  class PerplexityError(Exception):
7
20
  """Base exception for all Perplexity-related errors."""
8
21
 
@@ -34,6 +47,25 @@ class RateLimitError(PerplexityError):
34
47
  )
35
48
 
36
49
 
50
class CloudflareBlockError(PerplexityError):
    """
    Raised when Cloudflare answers a request with a challenge page.

    This usually means the request tripped Cloudflare's bot detection. The
    client retries automatically with fingerprint rotation, so seeing this
    exception means every retry attempt has already failed.
    """

    def __init__(self, message: str | None = None) -> None:
        # A missing or empty message falls back to the built-in explanation.
        if not message:
            message = (
                "Cloudflare challenge detected. The request was blocked by Cloudflare's "
                "bot protection. Try waiting a few minutes before retrying, or obtain a "
                "fresh session token."
            )

        # The error is always reported to the base class with status code 403.
        super().__init__(message, status_code=403)
67
+
68
+
37
69
  class FileUploadError(PerplexityError):
38
70
  """Raised when file upload fails."""
39
71
 
@@ -48,3 +80,45 @@ class FileValidationError(PerplexityError):
48
80
  def __init__(self, file_path: str, reason: str) -> None:
49
81
  self.file_path = file_path
50
82
  super().__init__(f"File validation failed for '{file_path}': {reason}")
83
+
84
+
85
class ResearchClarifyingQuestionsError(PerplexityError):
    """
    Raised when Research mode responds with clarifying questions.

    Answering clarifying questions programmatically is not supported by this
    library; a more specific query is the suggested way around it.

    Attributes:
        questions: The clarifying questions, exactly as passed to the constructor.
    """

    def __init__(self, questions: list[str]) -> None:
        self.questions = questions

        # One line per question, or a placeholder when the list is empty.
        if questions:
            questions_text = "\n".join([f" - {q}" for q in questions])
        else:
            questions_text = " (no questions provided)"

        full_message = (
            f"Research mode is asking clarifying questions:\n{questions_text}\n\n"
            "Programmatic interaction with clarifying questions is not supported. "
            "Please rephrase your query to be more specific."
        )
        super().__init__(full_message)
105
+
106
+
107
class ResponseParsingError(PerplexityError):
    """
    Raised when an API response cannot be parsed.

    Attributes:
        raw_data: The unparseable payload as text, or None when not supplied.
    """

    def __init__(self, message: str, raw_data: str | None = None) -> None:
        # Keep the offending payload on the instance for later inspection.
        self.raw_data = raw_data

        description = f"Failed to parse API response: {message}"
        super().__init__(description)
118
+
119
+
120
class StreamingError(PerplexityError):
    """Raised when something goes wrong while streaming a response."""

    def __init__(self, message: str) -> None:
        # Prefix the caller's message so the error's origin is visible.
        description = f"Streaming error: {message}"
        super().__init__(description)