perplexity-webui-scraper 0.3.7__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,4 @@
1
- """
2
- Core client implementation.
3
- """
1
+ """Core client implementation."""
4
2
 
5
3
  from __future__ import annotations
6
4
 
@@ -33,7 +31,7 @@ from .enums import CitationMode
33
31
  from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
34
32
  from .http import HTTPClient
35
33
  from .limits import MAX_FILE_SIZE, MAX_FILES
36
- from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
34
+ from .logging import configure_logging, get_logger
37
35
  from .models import Model, Models
38
36
  from .types import Response, SearchResultItem, _FileInfo
39
37
 
@@ -47,43 +45,14 @@ class Perplexity:
47
45
  __slots__ = ("_http",)
48
46
 
49
47
  def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
50
- """
51
- Initialize web scraper with session token.
52
-
53
- Args:
54
- session_token: Perplexity session cookie (__Secure-next-auth.session-token).
55
- config: Optional HTTP client configuration.
56
-
57
- Raises:
58
- ValueError: If session_token is empty or whitespace.
59
- """
48
+ """Initialize with session token."""
60
49
 
61
50
  if not session_token or not session_token.strip():
62
51
  raise ValueError("session_token cannot be empty")
63
52
 
64
53
  cfg = config or ClientConfig()
65
-
66
- # Configure logging based on config
67
54
  configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
68
55
 
69
- logger.info(
70
- "Perplexity client initializing | "
71
- f"session_token_length={len(session_token)} "
72
- f"logging_level={cfg.logging_level.value} "
73
- f"log_file={cfg.log_file}"
74
- )
75
- logger.debug(
76
- "Client configuration | "
77
- f"timeout={cfg.timeout}s "
78
- f"impersonate={cfg.impersonate} "
79
- f"max_retries={cfg.max_retries} "
80
- f"retry_base_delay={cfg.retry_base_delay}s "
81
- f"retry_max_delay={cfg.retry_max_delay}s "
82
- f"retry_jitter={cfg.retry_jitter} "
83
- f"requests_per_second={cfg.requests_per_second} "
84
- f"rotate_fingerprint={cfg.rotate_fingerprint}"
85
- )
86
-
87
56
  self._http = HTTPClient(
88
57
  session_token,
89
58
  timeout=cfg.timeout,
@@ -96,37 +65,17 @@ class Perplexity:
96
65
  rotate_fingerprint=cfg.rotate_fingerprint,
97
66
  )
98
67
 
99
- logger.info("Perplexity client initialized successfully")
68
+ logger.info("Perplexity client initialized")
100
69
 
101
70
  def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
102
71
  """Create a new conversation."""
103
72
 
104
- cfg = config or ConversationConfig()
105
- logger.debug(
106
- "Creating conversation | "
107
- f"model={cfg.model} "
108
- f"citation_mode={cfg.citation_mode} "
109
- f"save_to_library={cfg.save_to_library} "
110
- f"search_focus={cfg.search_focus} "
111
- f"language={cfg.language}"
112
- )
113
-
114
- conversation = Conversation(self._http, cfg)
115
-
116
- log_conversation_created(
117
- f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
118
- f"search_focus={cfg.search_focus}, language={cfg.language}"
119
- )
120
- logger.info("Conversation created successfully")
121
-
122
- return conversation
73
+ return Conversation(self._http, config or ConversationConfig())
123
74
 
124
75
  def close(self) -> None:
125
76
  """Close the client."""
126
77
 
127
- logger.debug("Closing Perplexity client")
128
78
  self._http.close()
129
- logger.info("Perplexity client closed")
130
79
 
131
80
  def __enter__(self) -> Perplexity:
132
81
  return self
@@ -153,13 +102,6 @@ class Conversation:
153
102
  )
154
103
 
155
104
  def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
156
- logger.debug(
157
- "Conversation.__init__ | "
158
- f"model={config.model} "
159
- f"citation_mode={config.citation_mode} "
160
- f"save_to_library={config.save_to_library} "
161
- f"search_focus={config.search_focus}"
162
- )
163
105
  self._http = http
164
106
  self._config = config
165
107
  self._citation_mode = CitationMode.DEFAULT
@@ -171,7 +113,6 @@ class Conversation:
171
113
  self._search_results: list[SearchResultItem] = []
172
114
  self._raw_data: dict[str, Any] = {}
173
115
  self._stream_generator: Generator[Response, None, None] | None = None
174
- logger.debug("Conversation initialized with empty state")
175
116
 
176
117
  @property
177
118
  def answer(self) -> str | None:
@@ -200,7 +141,6 @@ class Conversation:
200
141
  def __iter__(self) -> Generator[Response, None, None]:
201
142
  if self._stream_generator is not None:
202
143
  yield from self._stream_generator
203
-
204
144
  self._stream_generator = None
205
145
 
206
146
  def ask(
@@ -213,29 +153,11 @@ class Conversation:
213
153
  ) -> Conversation:
214
154
  """Ask a question. Returns self for method chaining or streaming iteration."""
215
155
 
216
- logger.info(
217
- "Conversation.ask called | "
218
- f"query_length={len(query)} "
219
- f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
220
- f"model={model} "
221
- f"files_count={len(files) if files else 0} "
222
- f"citation_mode={citation_mode} "
223
- f"stream={stream}"
224
- )
225
-
226
156
  effective_model = model or self._config.model or Models.BEST
227
157
  effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
228
158
  self._citation_mode = effective_citation
229
159
 
230
- logger.debug(
231
- f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
232
- )
233
-
234
- log_query_sent(query, str(effective_model), bool(files))
235
160
  self._execute(query, effective_model, files, stream=stream)
236
-
237
- logger.debug("Query execution completed")
238
-
239
161
  return self
240
162
 
241
163
  def _execute(
@@ -247,49 +169,20 @@ class Conversation:
247
169
  ) -> None:
248
170
  """Execute a query."""
249
171
 
250
- logger.debug(
251
- f"Executing query | "
252
- f"query_length={len(query)} "
253
- f"model={model} "
254
- f"files_count={len(files) if files else 0} "
255
- f"stream={stream} "
256
- f"is_followup={self._backend_uuid is not None}"
257
- )
258
-
259
172
  self._reset_response_state()
260
- logger.debug("Response state reset")
261
173
 
262
- # Upload files
263
174
  file_urls: list[str] = []
264
-
265
175
  if files:
266
- logger.debug(f"Validating {len(files)} files")
267
176
  validated = self._validate_files(files)
268
- logger.debug(f"Validated {len(validated)} files, uploading...")
269
177
  file_urls = [self._upload_file(f) for f in validated]
270
- logger.debug(f"Uploaded {len(file_urls)} files successfully")
271
178
 
272
179
  payload = self._build_payload(query, model, file_urls)
273
- logger.debug(
274
- f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
275
- )
276
-
277
- logger.debug("Initializing search session")
278
180
  self._http.init_search(query)
279
181
 
280
182
  if stream:
281
- logger.debug("Starting streaming mode")
282
183
  self._stream_generator = self._stream(payload)
283
184
  else:
284
- logger.debug("Starting complete mode (non-streaming)")
285
185
  self._complete(payload)
286
- logger.debug(
287
- f"Query completed | "
288
- f"title={self._title} "
289
- f"answer_length={len(self._answer) if self._answer else 0} "
290
- f"chunks_count={len(self._chunks)} "
291
- f"search_results_count={len(self._search_results)}"
292
- )
293
186
 
294
187
  def _reset_response_state(self) -> None:
295
188
  self._title = None
@@ -309,7 +202,6 @@ class Conversation:
309
202
  for item in files:
310
203
  if item and isinstance(item, (str, PathLike)):
311
204
  path = Path(item).resolve()
312
-
313
205
  if path.as_posix() not in seen:
314
206
  seen.add(path.as_posix())
315
207
  file_list.append(path)
@@ -338,7 +230,6 @@ class Conversation:
338
230
  file_path,
339
231
  f"File exceeds 50MB limit: {file_size / (1024 * 1024):.1f}MB",
340
232
  )
341
-
342
233
  if file_size == 0:
343
234
  raise FileValidationError(file_path, "File is empty")
344
235
 
@@ -388,18 +279,13 @@ class Conversation:
388
279
 
389
280
  if not s3_object_url:
390
281
  raise FileUploadError(file_info.path, "No upload URL returned")
391
-
392
282
  if not s3_bucket_url or not fields:
393
283
  raise FileUploadError(file_info.path, "Missing S3 upload credentials")
394
284
 
395
- # Upload the file to S3 using presigned POST
396
285
  file_path = Path(file_info.path)
397
-
398
286
  with file_path.open("rb") as f:
399
287
  file_content = f.read()
400
288
 
401
- # Build multipart form data using CurlMime
402
- # For S3 presigned POST, form fields must come before the file
403
289
  mime = CurlMime()
404
290
 
405
291
  for field_name, field_value in fields.items():
@@ -412,7 +298,6 @@ class Conversation:
412
298
  data=file_content,
413
299
  )
414
300
 
415
- # S3 requires a clean session
416
301
  with Session() as s3_session:
417
302
  upload_response = s3_session.post(s3_bucket_url, multipart=mime)
418
303
 
@@ -459,7 +344,7 @@ class Conversation:
459
344
  "model_preference": model.identifier,
460
345
  "mode": model.mode,
461
346
  "search_focus": cfg.search_focus.value,
462
- "search_recency_filter": cfg.time_range.value if cfg.time_range.value else None,
347
+ "search_recency_filter": cfg.time_range.value or None,
463
348
  "is_incognito": not cfg.save_to_library,
464
349
  "use_schematized_api": USE_SCHEMATIZED_API,
465
350
  "local_search_enabled": cfg.coordinates is not None,
@@ -471,7 +356,6 @@ class Conversation:
471
356
  if self._backend_uuid is not None:
472
357
  params["last_backend_uuid"] = self._backend_uuid
473
358
  params["query_source"] = "followup"
474
-
475
359
  if self._read_write_token:
476
360
  params["read_write_token"] = self._read_write_token
477
361
 
@@ -483,7 +367,6 @@ class Conversation:
483
367
 
484
368
  def replacer(m: Match[str]) -> str:
485
369
  num = m.group(1)
486
-
487
370
  if not num.isdigit():
488
371
  return m.group(0)
489
372
 
@@ -491,10 +374,8 @@ class Conversation:
491
374
  return ""
492
375
 
493
376
  idx = int(num) - 1
494
-
495
377
  if 0 <= idx < len(self._search_results):
496
378
  url = self._search_results[idx].url or ""
497
-
498
379
  if self._citation_mode == CitationMode.MARKDOWN and url:
499
380
  return f"[{num}]({url})"
500
381
 
@@ -505,7 +386,6 @@ class Conversation:
505
386
  def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
506
387
  if isinstance(line, bytes) and line.startswith(b"data: "):
507
388
  return loads(line[6:])
508
-
509
389
  if isinstance(line, str) and line.startswith("data: "):
510
390
  return loads(line[6:])
511
391
 
@@ -514,24 +394,15 @@ class Conversation:
514
394
  def _process_data(self, data: dict[str, Any]) -> None:
515
395
  """Process SSE data chunk and update conversation state."""
516
396
 
517
- if self._backend_uuid is None and "backend_uuid" in data:
397
+ if "backend_uuid" in data:
518
398
  self._backend_uuid = data["backend_uuid"]
519
399
 
520
- if self._read_write_token is None and "read_write_token" in data:
400
+ if "read_write_token" in data:
521
401
  self._read_write_token = data["read_write_token"]
522
402
 
523
- if self._title is None and "thread_title" in data:
403
+ if data.get("thread_title"):
524
404
  self._title = data["thread_title"]
525
405
 
526
- if "blocks" in data:
527
- for block in data["blocks"]:
528
- if block.get("intended_usage") == "web_results":
529
- diff = block.get("diff_block", {})
530
-
531
- for patch in diff.get("patches", []):
532
- if patch.get("op") == "replace" and patch.get("path") == "/web_results":
533
- pass
534
-
535
406
  if "text" not in data and "blocks" not in data:
536
407
  return None
537
408
 
@@ -548,10 +419,8 @@ class Conversation:
548
419
  for item in json_data:
549
420
  step_type = item.get("step_type")
550
421
 
551
- # Handle Research mode clarifying questions
552
422
  if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
553
423
  questions = self._extract_clarifying_questions(item)
554
-
555
424
  raise ResearchClarifyingQuestionsError(questions)
556
425
 
557
426
  if step_type == "FINAL":
@@ -563,11 +432,14 @@ class Conversation:
563
432
  else:
564
433
  answer_data = raw_content
565
434
 
566
- self._update_state(data.get("thread_title"), answer_data)
567
-
435
+ title = data.get("thread_title") or answer_data.get("thread_title")
436
+ self._update_state(title, answer_data)
568
437
  break
438
+
569
439
  elif isinstance(json_data, dict):
570
- self._update_state(data.get("thread_title"), json_data)
440
+ title = data.get("thread_title") or json_data.get("thread_title")
441
+ self._update_state(title, json_data)
442
+
571
443
  else:
572
444
  raise ResponseParsingError(
573
445
  "Unexpected JSON structure in 'text' field",
@@ -580,34 +452,33 @@ class Conversation:
580
452
  questions: list[str] = []
581
453
  content = item.get("content", {})
582
454
 
583
- # Try different possible structures for questions
584
455
  if isinstance(content, dict):
585
456
  if "questions" in content:
586
457
  raw_questions = content["questions"]
587
-
588
458
  if isinstance(raw_questions, list):
589
459
  questions = [str(q) for q in raw_questions if q]
590
460
  elif "clarifying_questions" in content:
591
461
  raw_questions = content["clarifying_questions"]
592
-
593
462
  if isinstance(raw_questions, list):
594
463
  questions = [str(q) for q in raw_questions if q]
595
464
  elif not questions:
596
465
  for value in content.values():
597
466
  if isinstance(value, str) and "?" in value:
598
467
  questions.append(value)
468
+
599
469
  elif isinstance(content, list):
600
470
  questions = [str(q) for q in content if q]
471
+
601
472
  elif isinstance(content, str):
602
473
  questions = [content]
603
474
 
604
475
  return questions
605
476
 
606
477
  def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
607
- self._title = title
478
+ if title is not None:
479
+ self._title = title
608
480
 
609
481
  web_results = answer_data.get("web_results", [])
610
-
611
482
  if web_results:
612
483
  self._search_results = [
613
484
  SearchResultItem(
@@ -620,12 +491,10 @@ class Conversation:
620
491
  ]
621
492
 
622
493
  answer_text = answer_data.get("answer")
623
-
624
494
  if answer_text is not None:
625
495
  self._answer = self._format_citations(answer_text)
626
496
 
627
497
  chunks = answer_data.get("chunks", [])
628
-
629
498
  if chunks:
630
499
  formatted = [self._format_citations(chunk) for chunk in chunks if chunk is not None]
631
500
  self._chunks = [c for c in formatted if c is not None]
@@ -646,21 +515,16 @@ class Conversation:
646
515
  def _complete(self, payload: dict[str, Any]) -> None:
647
516
  for line in self._http.stream_ask(payload):
648
517
  data = self._parse_line(line)
649
-
650
518
  if data:
651
519
  self._process_data(data)
652
-
653
520
  if data.get("final"):
654
521
  break
655
522
 
656
523
  def _stream(self, payload: dict[str, Any]) -> Generator[Response, None, None]:
657
524
  for line in self._http.stream_ask(payload):
658
525
  data = self._parse_line(line)
659
-
660
526
  if data:
661
527
  self._process_data(data)
662
-
663
528
  yield self._build_response()
664
-
665
529
  if data.get("final"):
666
530
  break
@@ -1,6 +1,4 @@
1
- """
2
- Enums for Perplexity WebUI Scraper configuration options.
3
- """
1
+ """Enums for configuration options."""
4
2
 
5
3
  from __future__ import annotations
6
4
 
@@ -8,140 +6,80 @@ from enum import Enum
8
6
 
9
7
 
10
8
  class CitationMode(str, Enum):
11
- """
12
- Citation formatting modes for response text.
13
-
14
- Controls how citation markers (e.g., [1], [2]) are formatted in the response.
15
- """
9
+ """Citation formatting modes for response text."""
16
10
 
17
11
  DEFAULT = "default"
18
- """
19
- Keep original Perplexity citation format (e.g., 'This is a citation[1]').
20
- """
12
+ """Keep original format (e.g., 'text[1]')."""
21
13
 
22
14
  MARKDOWN = "markdown"
23
- """
24
- Convert citations to markdown links (e.g., 'This is a citation[1](https://example.com)').
25
- """
15
+ """Convert to markdown links (e.g., 'text[1](url)')."""
26
16
 
27
17
  CLEAN = "clean"
28
- """
29
- Remove all citation markers (e.g., 'This is a citation').
30
- """
18
+ """Remove all citation markers."""
31
19
 
32
20
 
33
21
  class SearchFocus(str, Enum):
34
- """
35
- Search focus types that control the type of search performed.
36
-
37
- Determines whether to search the web or focus on writing tasks.
38
- """
22
+ """Search focus types."""
39
23
 
40
24
  WEB = "internet"
41
- """
42
- Search the web for information. Best for factual queries and research.
43
- """
25
+ """Search the web for information."""
44
26
 
45
27
  WRITING = "writing"
46
- """
47
- Focus on writing tasks. Best for creative writing, editing, and text generation.
48
- """
28
+ """Focus on writing tasks."""
49
29
 
50
30
 
51
31
  class SourceFocus(str, Enum):
52
- """
53
- Source focus types that control which sources to prioritize.
54
-
55
- Can be combined (e.g., [SourceFocus.WEB, SourceFocus.ACADEMIC]) for multi-source searches.
56
- """
32
+ """Source focus types for search prioritization."""
57
33
 
58
34
  WEB = "web"
59
- """
60
- Search across the entire internet. General web search.
61
- """
35
+ """General web search."""
62
36
 
63
37
  ACADEMIC = "scholar"
64
- """
65
- Search academic papers and scholarly articles (Google Scholar, etc.).
66
- """
38
+ """Academic papers and scholarly articles."""
67
39
 
68
40
  SOCIAL = "social"
69
- """
70
- Search social media for discussions and opinions (Reddit, Twitter, etc.).
71
- """
41
+ """Social media (Reddit, Twitter, etc.)."""
72
42
 
73
43
  FINANCE = "edgar"
74
- """
75
- Search SEC EDGAR filings for financial and corporate documents.
76
- """
44
+ """SEC EDGAR filings."""
77
45
 
78
46
 
79
47
  class TimeRange(str, Enum):
80
- """
81
- Time range filters for search results.
82
-
83
- Controls how recent the sources should be.
84
- """
48
+ """Time range filters for search results."""
85
49
 
86
50
  ALL = ""
87
- """
88
- Include sources from all time. No time restriction.
89
- """
51
+ """No time restriction."""
90
52
 
91
53
  TODAY = "DAY"
92
- """
93
- Include only sources from today (last 24 hours).
94
- """
54
+ """Last 24 hours."""
95
55
 
96
56
  LAST_WEEK = "WEEK"
97
- """
98
- Include sources from the last 7 days.
99
- """
57
+ """Last 7 days."""
100
58
 
101
59
  LAST_MONTH = "MONTH"
102
- """
103
- Include sources from the last 30 days.
104
- """
60
+ """Last 30 days."""
105
61
 
106
62
  LAST_YEAR = "YEAR"
107
- """
108
- Include sources from the last 365 days.
109
- """
63
+ """Last 365 days."""
110
64
 
111
65
 
112
66
  class LogLevel(str, Enum):
113
- """
114
- Logging level configuration.
115
-
116
- Controls the verbosity of logging output. DISABLED is the default.
117
- """
67
+ """Logging level configuration."""
118
68
 
119
69
  DISABLED = "DISABLED"
120
- """
121
- Completely disable all logging output. This is the default.
122
- """
70
+ """Disable all logging (default)."""
123
71
 
124
72
  DEBUG = "DEBUG"
125
- """
126
- Show all messages including internal debug information.
127
- """
73
+ """Show all messages including debug info."""
128
74
 
129
75
  INFO = "INFO"
130
- """
131
- Show informational messages, warnings, and errors.
132
- """
76
+ """Show info, warnings, and errors."""
133
77
 
134
78
  WARNING = "WARNING"
135
- """
136
- Show only warnings and errors.
137
- """
79
+ """Show warnings and errors only."""
138
80
 
139
81
  ERROR = "ERROR"
140
- """
141
- Show only error messages.
142
- """
82
+ """Show errors only."""
143
83
 
144
84
  CRITICAL = "CRITICAL"
145
- """
146
- Show only critical/fatal errors.
147
- """
85
+ """Show critical/fatal errors only."""