PyPI - perplexity-webui-scraper - Versions diffs - 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl - Mend

perplexity-webui-scraper 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

perplexity_webui_scraper/__init__.py +4 -15
perplexity_webui_scraper/cli/get_perplexity_session_token.py +216 -0
perplexity_webui_scraper/config.py +29 -4
perplexity_webui_scraper/constants.py +9 -35
perplexity_webui_scraper/core.py +225 -21
perplexity_webui_scraper/enums.py +34 -4
perplexity_webui_scraper/exceptions.py +74 -0
perplexity_webui_scraper/http.py +370 -36
perplexity_webui_scraper/limits.py +2 -5
perplexity_webui_scraper/logging.py +256 -0
perplexity_webui_scraper/mcp/__init__.py +18 -0
perplexity_webui_scraper/mcp/__main__.py +9 -0
perplexity_webui_scraper/mcp/server.py +181 -0
perplexity_webui_scraper/models.py +34 -19
perplexity_webui_scraper/resilience.py +179 -0
perplexity_webui_scraper-0.3.5.dist-info/METADATA +304 -0
perplexity_webui_scraper-0.3.5.dist-info/RECORD +21 -0
{perplexity_webui_scraper-0.3.3.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/WHEEL +1 -1
perplexity_webui_scraper-0.3.5.dist-info/entry_points.txt +4 -0
perplexity_webui_scraper-0.3.3.dist-info/METADATA +0 -166
perplexity_webui_scraper-0.3.3.dist-info/RECORD +0 -14

perplexity_webui_scraper/core.py CHANGED Viewed

@@ -8,7 +8,7 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from uuid import uuid4
-from orjson import loads
+from orjson import JSONDecodeError, loads
 if TYPE_CHECKING:
@@ -26,20 +26,25 @@ from .constants import (
     USE_SCHEMATIZED_API,
 )
 from .enums import CitationMode
-from .exceptions import FileUploadError, FileValidationError
+from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
 from .http import HTTPClient
 from .limits import MAX_FILE_SIZE, MAX_FILES
+from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
 from .models import Model, Models
 from .types import Response, SearchResultItem, _FileInfo
+logger = get_logger(__name__)
 class Perplexity:
     """Web scraper for Perplexity AI conversations."""
     __slots__ = ("_http",)
     def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
-        """Initialize web scraper with session token.
+        """
+        Initialize web scraper with session token.
         Args:
             session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,15 +58,71 @@ class Perplexity:
             raise ValueError("session_token cannot be empty")
         cfg = config or ClientConfig()
-        self._http = HTTPClient(session_token, timeout=cfg.timeout, impersonate=cfg.impersonate)
+        # Configure logging based on config
+        configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
+        logger.info(
+            "Perplexity client initializing | "
+            f"session_token_length={len(session_token)} "
+            f"logging_level={cfg.logging_level.value} "
+            f"log_file={cfg.log_file}"
+        )
+        logger.debug(
+            "Client configuration | "
+            f"timeout={cfg.timeout}s "
+            f"impersonate={cfg.impersonate} "
+            f"max_retries={cfg.max_retries} "
+            f"retry_base_delay={cfg.retry_base_delay}s "
+            f"retry_max_delay={cfg.retry_max_delay}s "
+            f"retry_jitter={cfg.retry_jitter} "
+            f"requests_per_second={cfg.requests_per_second} "
+            f"rotate_fingerprint={cfg.rotate_fingerprint}"
+        )
+        self._http = HTTPClient(
+            session_token,
+            timeout=cfg.timeout,
+            impersonate=cfg.impersonate,
+            max_retries=cfg.max_retries,
+            retry_base_delay=cfg.retry_base_delay,
+            retry_max_delay=cfg.retry_max_delay,
+            retry_jitter=cfg.retry_jitter,
+            requests_per_second=cfg.requests_per_second,
+            rotate_fingerprint=cfg.rotate_fingerprint,
+        )
+        logger.info("Perplexity client initialized successfully")
     def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
         """Create a new conversation."""
-        return Conversation(self._http, config or ConversationConfig())
+        cfg = config or ConversationConfig()
+        logger.debug(
+            "Creating conversation | "
+            f"model={cfg.model} "
+            f"citation_mode={cfg.citation_mode} "
+            f"save_to_library={cfg.save_to_library} "
+            f"search_focus={cfg.search_focus} "
+            f"language={cfg.language}"
+        )
+        conversation = Conversation(self._http, cfg)
+        log_conversation_created(
+            f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
+            f"search_focus={cfg.search_focus}, language={cfg.language}"
+        )
+        logger.info("Conversation created successfully")
+        return conversation
     def close(self) -> None:
         """Close the client."""
+        logger.debug("Closing Perplexity client")
         self._http.close()
+        logger.info("Perplexity client closed")
     def __enter__(self) -> Perplexity:
         return self
@@ -88,6 +149,13 @@ class Conversation:
     )
     def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
+        logger.debug(
+            "Conversation.__init__ | "
+            f"model={config.model} "
+            f"citation_mode={config.citation_mode} "
+            f"save_to_library={config.save_to_library} "
+            f"search_focus={config.search_focus}"
+        )
         self._http = http
         self._config = config
         self._citation_mode = CitationMode.DEFAULT
@@ -99,70 +167,125 @@ class Conversation:
         self._search_results: list[SearchResultItem] = []
         self._raw_data: dict[str, Any] = {}
         self._stream_generator: Generator[Response, None, None] | None = None
+        logger.debug("Conversation initialized with empty state")
     @property
     def answer(self) -> str | None:
         """Last response text."""
         return self._answer
     @property
     def title(self) -> str | None:
         """Conversation title."""
         return self._title
     @property
     def search_results(self) -> list[SearchResultItem]:
         """Search results from last response."""
         return self._search_results
     @property
     def uuid(self) -> str | None:
         """Conversation UUID."""
         return self._backend_uuid
     def __iter__(self) -> Generator[Response, None, None]:
         if self._stream_generator is not None:
             yield from self._stream_generator
             self._stream_generator = None
     def ask(
         self,
         query: str,
         model: Model | None = None,
-        files: list[str | PathLike[str]] | None = None,
+        files: list[str | PathLike] | None = None,
         citation_mode: CitationMode | None = None,
         stream: bool = False,
     ) -> Conversation:
         """Ask a question. Returns self for method chaining or streaming iteration."""
+        logger.info(
+            "Conversation.ask called | "
+            f"query_length={len(query)} "
+            f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"citation_mode={citation_mode} "
+            f"stream={stream}"
+        )
         effective_model = model or self._config.model or Models.BEST
         effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
         self._citation_mode = effective_citation
+        logger.debug(
+            f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
+        )
+        log_query_sent(query, str(effective_model), bool(files))
         self._execute(query, effective_model, files, stream=stream)
+        logger.debug("Query execution completed")
         return self
     def _execute(
         self,
         query: str,
         model: Model,
-        files: list[str | PathLike[str]] | None,
+        files: list[str | PathLike] | None,
         stream: bool = False,
     ) -> None:
         """Execute a query."""
+        logger.debug(
+            f"Executing query | "
+            f"query_length={len(query)} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"stream={stream} "
+            f"is_followup={self._backend_uuid is not None}"
+        )
         self._reset_response_state()
+        logger.debug("Response state reset")
         # Upload files
         file_urls: list[str] = []
         if files:
+            logger.debug(f"Validating {len(files)} files")
             validated = self._validate_files(files)
+            logger.debug(f"Validated {len(validated)} files, uploading...")
             file_urls = [self._upload_file(f) for f in validated]
+            logger.debug(f"Uploaded {len(file_urls)} files successfully")
         payload = self._build_payload(query, model, file_urls)
+        logger.debug(
+            f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
+        )
+        logger.debug("Initializing search session")
         self._http.init_search(query)
         if stream:
+            logger.debug("Starting streaming mode")
             self._stream_generator = self._stream(payload)
         else:
+            logger.debug("Starting complete mode (non-streaming)")
             self._complete(payload)
+            logger.debug(
+                f"Query completed | "
+                f"title={self._title} "
+                f"answer_length={len(self._answer) if self._answer else 0} "
+                f"chunks_count={len(self._chunks)} "
+                f"search_results_count={len(self._search_results)}"
+            )
     def _reset_response_state(self) -> None:
         self._title = None
@@ -172,15 +295,17 @@ class Conversation:
         self._raw_data = {}
         self._stream_generator = None
-    def _validate_files(self, files: list[str | PathLike[str]] | None) -> list[_FileInfo]:
+    def _validate_files(self, files: list[str | PathLike] | None) -> list[_FileInfo]:
         if not files:
             return []
         seen: set[str] = set()
         file_list: list[Path] = []
         for item in files:
             if item and isinstance(item, (str, PathLike)):
                 path = Path(item).resolve()
                 if path.as_posix() not in seen:
                     seen.add(path.as_posix())
                     file_list.append(path)
@@ -203,11 +328,13 @@ class Conversation:
                     raise FileValidationError(file_path, "Path is not a file")
                 file_size = path.stat().st_size
                 if file_size > MAX_FILE_SIZE:
                     raise FileValidationError(
                         file_path,
                         f"File exceeds 50MB limit: {file_size / (1024 * 1024):.1f}MB",
                     )
                 if file_size == 0:
                     raise FileValidationError(file_path, "File is empty")
@@ -222,12 +349,12 @@ class Conversation:
                         is_image=mimetype.startswith("image/"),
                     )
                 )
-            except FileValidationError:
-                raise
-            except (FileNotFoundError, PermissionError) as e:
-                raise FileValidationError(file_path, f"Cannot access file: {e}") from e
-            except OSError as e:
-                raise FileValidationError(file_path, f"File system error: {e}") from e
+            except FileValidationError as error:
+                raise error
+            except (FileNotFoundError, PermissionError) as error:
+                raise FileValidationError(file_path, f"Cannot access file: {error}") from error
+            except OSError as error:
+                raise FileValidationError(file_path, f"File system error: {error}") from error
         return result
@@ -255,8 +382,8 @@ class Conversation:
                 raise FileUploadError(file_info.path, "No upload URL returned")
             return upload_url
-        except FileUploadError:
-            raise
+        except FileUploadError as error:
+            raise error
         except Exception as e:
             raise FileUploadError(file_info.path, str(e)) from e
@@ -301,6 +428,7 @@ class Conversation:
         if self._backend_uuid is not None:
             params["last_backend_uuid"] = self._backend_uuid
             params["query_source"] = "followup"
             if self._read_write_token:
                 params["read_write_token"] = self._read_write_token
@@ -312,6 +440,7 @@ class Conversation:
         def replacer(m: Match[str]) -> str:
             num = m.group(1)
             if not num.isdigit():
                 return m.group(0)
@@ -319,8 +448,10 @@ class Conversation:
                 return ""
             idx = int(num) - 1
             if 0 <= idx < len(self._search_results):
                 url = self._search_results[idx].url or ""
                 if self._citation_mode == CitationMode.MARKDOWN and url:
                     return f"[{num}]({url})"
@@ -330,26 +461,56 @@ class Conversation:
     def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
         prefix = b"data: " if isinstance(line, bytes) else "data: "
         if (isinstance(line, bytes) and line.startswith(prefix)) or (isinstance(line, str) and line.startswith(prefix)):
             return loads(line[6:])
         return None
     def _process_data(self, data: dict[str, Any]) -> None:
+        """Process SSE data chunk and update conversation state."""
         if self._backend_uuid is None and "backend_uuid" in data:
             self._backend_uuid = data["backend_uuid"]
         if self._read_write_token is None and "read_write_token" in data:
             self._read_write_token = data["read_write_token"]
-        if "text" not in data:
-            return
+        if self._title is None and "thread_title" in data:
+            self._title = data["thread_title"]
+        if "blocks" in data:
+            for block in data["blocks"]:
+                if block.get("intended_usage") == "web_results":
+                    diff = block.get("diff_block", {})
+                    for patch in diff.get("patches", []):
+                        if patch.get("op") == "replace" and patch.get("path") == "/web_results":
+                            pass
+        if "text" not in data and "blocks" not in data:
+            return None
+        try:
+            json_data = loads(data["text"])
+        except KeyError as e:
+            raise ValueError("Missing 'text' field in data") from e
+        except JSONDecodeError as e:
+            raise ValueError("Invalid JSON in 'text' field") from e
-        json_data = loads(data["text"])
         answer_data: dict[str, Any] = {}
         if isinstance(json_data, list):
             for item in json_data:
-                if item.get("step_type") == "FINAL":
+                step_type = item.get("step_type")
+                # Handle Research mode clarifying questions
+                if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
+                    questions = self._extract_clarifying_questions(item)
+                    raise ResearchClarifyingQuestionsError(questions)
+                if step_type == "FINAL":
                     raw_content = item.get("content", {})
                     answer_content = raw_content.get("answer")
@@ -359,14 +520,50 @@ class Conversation:
                         answer_data = raw_content
                     self._update_state(data.get("thread_title"), answer_data)
                     break
         elif isinstance(json_data, dict):
             self._update_state(data.get("thread_title"), json_data)
+        else:
+            raise ResponseParsingError(
+                "Unexpected JSON structure in 'text' field",
+                raw_data=str(json_data),
+            )
+    def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
+        """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
+        questions: list[str] = []
+        content = item.get("content", {})
+        # Try different possible structures for questions
+        if isinstance(content, dict):
+            if "questions" in content:
+                raw_questions = content["questions"]
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif "clarifying_questions" in content:
+                raw_questions = content["clarifying_questions"]
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif not questions:
+                for value in content.values():
+                    if isinstance(value, str) and "?" in value:
+                        questions.append(value)
+        elif isinstance(content, list):
+            questions = [str(q) for q in content if q]
+        elif isinstance(content, str):
+            questions = [content]
+        return questions
     def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
         self._title = title
         web_results = answer_data.get("web_results", [])
         if web_results:
             self._search_results = [
                 SearchResultItem(
@@ -379,12 +576,14 @@ class Conversation:
             ]
         answer_text = answer_data.get("answer")
         if answer_text is not None:
             self._answer = self._format_citations(answer_text)
         chunks = answer_data.get("chunks", [])
         if chunks:
-            self._chunks = chunks
+            self._chunks = [self._format_citations(chunk) for chunk in chunks]
         self._raw_data = answer_data
@@ -402,16 +601,21 @@ class Conversation:
     def _complete(self, payload: dict[str, Any]) -> None:
         for line in self._http.stream_ask(payload):
             data = self._parse_line(line)
             if data:
                 self._process_data(data)
                 if data.get("final"):
                     break
     def _stream(self, payload: dict[str, Any]) -> Generator[Response, None, None]:
         for line in self._http.stream_ask(payload):
             data = self._parse_line(line)
             if data:
                 self._process_data(data)
                 yield self._build_response()
                 if data.get("final"):
                     break

perplexity_webui_scraper/enums.py CHANGED Viewed

@@ -6,7 +6,8 @@ from enum import Enum
 class CitationMode(str, Enum):
-    """Citation formatting modes for response text.
+    """
+    Citation formatting modes for response text.
     Controls how citation markers (e.g., [1], [2]) are formatted in the response.
     """
@@ -22,7 +23,8 @@ class CitationMode(str, Enum):
 class SearchFocus(str, Enum):
-    """Search focus types that control the type of search performed.
+    """
+    Search focus types that control the type of search performed.
     Determines whether to search the web or focus on writing tasks.
     """
@@ -35,7 +37,8 @@ class SearchFocus(str, Enum):
 class SourceFocus(str, Enum):
-    """Source focus types that control which sources to prioritize.
+    """
+    Source focus types that control which sources to prioritize.
     Can be combined (e.g., [SourceFocus.WEB, SourceFocus.ACADEMIC]) for multi-source searches.
     """
@@ -54,7 +57,8 @@ class SourceFocus(str, Enum):
 class TimeRange(str, Enum):
-    """Time range filters for search results.
+    """
+    Time range filters for search results.
     Controls how recent the sources should be.
     """
@@ -73,3 +77,29 @@ class TimeRange(str, Enum):
     LAST_YEAR = "YEAR"
     """Include sources from the last 365 days."""
+class LogLevel(str, Enum):
+    """
+    Logging level configuration.
+    Controls the verbosity of logging output. DISABLED is the default.
+    """
+    DISABLED = "DISABLED"
+    """Completely disable all logging output. This is the default."""
+    DEBUG = "DEBUG"
+    """Show all messages including internal debug information."""
+    INFO = "INFO"
+    """Show informational messages, warnings, and errors."""
+    WARNING = "WARNING"
+    """Show only warnings and errors."""
+    ERROR = "ERROR"
+    """Show only error messages."""
+    CRITICAL = "CRITICAL"
+    """Show only critical/fatal errors."""

perplexity_webui_scraper/exceptions.py CHANGED Viewed

@@ -3,6 +3,19 @@
 from __future__ import annotations
+__all__: list[str] = [
+    "AuthenticationError",
+    "CloudflareBlockError",
+    "FileUploadError",
+    "FileValidationError",
+    "PerplexityError",
+    "RateLimitError",
+    "ResearchClarifyingQuestionsError",
+    "ResponseParsingError",
+    "StreamingError",
+]
 class PerplexityError(Exception):
     """Base exception for all Perplexity-related errors."""
@@ -34,6 +47,25 @@ class RateLimitError(PerplexityError):
         )
+class CloudflareBlockError(PerplexityError):
+    """
+    Raised when Cloudflare blocks the request with a challenge page.
+    This typically means the request triggered Cloudflare's bot detection.
+    The client will automatically retry with fingerprint rotation, but if
+    this exception is raised, all retry attempts have failed.
+    """
+    def __init__(self, message: str | None = None) -> None:
+        super().__init__(
+            message
+            or "Cloudflare challenge detected. The request was blocked by Cloudflare's "
+            "bot protection. Try waiting a few minutes before retrying, or obtain a "
+            "fresh session token.",
+            status_code=403,
+        )
 class FileUploadError(PerplexityError):
     """Raised when file upload fails."""
@@ -48,3 +80,45 @@ class FileValidationError(PerplexityError):
     def __init__(self, file_path: str, reason: str) -> None:
         self.file_path = file_path
         super().__init__(f"File validation failed for '{file_path}': {reason}")
+class ResearchClarifyingQuestionsError(PerplexityError):
+    """
+    Raised when Research mode requires clarifying questions.
+    This library does not support programmatic interaction with clarifying questions.
+    Consider rephrasing your query to be more specific.
+    Attributes:
+        questions: List of clarifying questions from the API.
+    """
+    def __init__(self, questions: list[str]) -> None:
+        self.questions = questions
+        questions_text = "\n".join(f"  - {q}" for q in questions) if questions else "  (no questions provided)"
+        super().__init__(
+            f"Research mode is asking clarifying questions:\n{questions_text}\n\n"
+            "Programmatic interaction with clarifying questions is not supported. "
+            "Please rephrase your query to be more specific."
+        )
+class ResponseParsingError(PerplexityError):
+    """
+    Raised when the API response cannot be parsed.
+    Attributes:
+        raw_data: The raw data that failed to parse.
+    """
+    def __init__(self, message: str, raw_data: str | None = None) -> None:
+        self.raw_data = raw_data
+        super().__init__(f"Failed to parse API response: {message}")
+class StreamingError(PerplexityError):
+    """Raised when an error occurs during streaming."""
+    def __init__(self, message: str) -> None:
+        super().__init__(f"Streaming error: {message}")

perplexity-webui-scraper 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl

perplexity-webui-scraper 0.3.3__py3-none-any.whl → 0.3.5__py3-none-any.whl