PyPI - perplexity-webui-scraper - Versions diffs - 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl - Mend

perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

perplexity_webui_scraper/__init__.py +2 -13
perplexity_webui_scraper/config.py +27 -2
perplexity_webui_scraper/core.py +166 -9
perplexity_webui_scraper/enums.py +34 -4
perplexity_webui_scraper/exceptions.py +74 -0
perplexity_webui_scraper/http.py +368 -37
perplexity_webui_scraper/logging.py +256 -0
perplexity_webui_scraper/mcp/__init__.py +18 -0
perplexity_webui_scraper/mcp/__main__.py +9 -0
perplexity_webui_scraper/mcp/server.py +181 -0
perplexity_webui_scraper/resilience.py +179 -0
{perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/METADATA +98 -8
perplexity_webui_scraper-0.3.5.dist-info/RECORD +21 -0
{perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/entry_points.txt +1 -0
perplexity_webui_scraper-0.3.4.dist-info/RECORD +0 -16
{perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.5.dist-info}/WHEEL +0 -0

perplexity_webui_scraper/__init__.py CHANGED Viewed

@@ -4,33 +4,22 @@ from importlib import metadata
 from .config import ClientConfig, ConversationConfig
 from .core import Conversation, Perplexity
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
-from .exceptions import (
-    AuthenticationError,
-    FileUploadError,
-    FileValidationError,
-    PerplexityError,
-    RateLimitError,
-)
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 from .models import Model, Models
 from .types import Coordinates, Response, SearchResultItem
 __version__: str = metadata.version("perplexity-webui-scraper")
 __all__: list[str] = [
-    "AuthenticationError",
     "CitationMode",
     "ClientConfig",
     "Conversation",
     "ConversationConfig",
     "Coordinates",
-    "FileUploadError",
-    "FileValidationError",
+    "LogLevel",
     "Model",
     "Models",
     "Perplexity",
-    "PerplexityError",
-    "RateLimitError",
     "Response",
     "SearchFocus",
     "SearchResultItem",

perplexity_webui_scraper/config.py CHANGED Viewed

@@ -5,10 +5,12 @@ from __future__ import annotations
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 if TYPE_CHECKING:
+    from pathlib import Path
     from .models import Model
     from .types import Coordinates
@@ -30,7 +32,30 @@ class ConversationConfig:
 @dataclass(frozen=True, slots=True)
 class ClientConfig:
-    """HTTP client settings."""
+    """
+    HTTP client settings.
+    Attributes:
+        timeout: Request timeout in seconds.
+        impersonate: Browser to impersonate (e.g., "chrome", "edge", "safari").
+        max_retries: Maximum retry attempts for failed requests.
+        retry_base_delay: Initial delay in seconds before first retry.
+        retry_max_delay: Maximum delay between retries.
+        retry_jitter: Random jitter factor (0-1) to add to delays.
+        requests_per_second: Rate limit for requests (0 to disable).
+        rotate_fingerprint: Whether to rotate browser fingerprint on retries.
+        logging_level: Logging verbosity level. Default is DISABLED.
+        log_file: Optional file path for persistent logging. If set, logs go to file only.
+                  If None, logs go to console. All logs are appended.
+    """
     timeout: int = 3600
     impersonate: str = "chrome"
+    max_retries: int = 3
+    retry_base_delay: float = 1.0
+    retry_max_delay: float = 60.0
+    retry_jitter: float = 0.5
+    requests_per_second: float = 0.5
+    rotate_fingerprint: bool = True
+    logging_level: LogLevel = LogLevel.DISABLED
+    log_file: str | Path | None = None

perplexity_webui_scraper/core.py CHANGED Viewed

@@ -26,20 +26,25 @@ from .constants import (
     USE_SCHEMATIZED_API,
 )
 from .enums import CitationMode
-from .exceptions import FileUploadError, FileValidationError
+from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
 from .http import HTTPClient
 from .limits import MAX_FILE_SIZE, MAX_FILES
+from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
 from .models import Model, Models
 from .types import Response, SearchResultItem, _FileInfo
+logger = get_logger(__name__)
 class Perplexity:
     """Web scraper for Perplexity AI conversations."""
     __slots__ = ("_http",)
     def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
-        """Initialize web scraper with session token.
+        """
+        Initialize web scraper with session token.
         Args:
             session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,17 +58,71 @@ class Perplexity:
             raise ValueError("session_token cannot be empty")
         cfg = config or ClientConfig()
-        self._http = HTTPClient(session_token, timeout=cfg.timeout, impersonate=cfg.impersonate)
+        # Configure logging based on config
+        configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
+        logger.info(
+            "Perplexity client initializing | "
+            f"session_token_length={len(session_token)} "
+            f"logging_level={cfg.logging_level.value} "
+            f"log_file={cfg.log_file}"
+        )
+        logger.debug(
+            "Client configuration | "
+            f"timeout={cfg.timeout}s "
+            f"impersonate={cfg.impersonate} "
+            f"max_retries={cfg.max_retries} "
+            f"retry_base_delay={cfg.retry_base_delay}s "
+            f"retry_max_delay={cfg.retry_max_delay}s "
+            f"retry_jitter={cfg.retry_jitter} "
+            f"requests_per_second={cfg.requests_per_second} "
+            f"rotate_fingerprint={cfg.rotate_fingerprint}"
+        )
+        self._http = HTTPClient(
+            session_token,
+            timeout=cfg.timeout,
+            impersonate=cfg.impersonate,
+            max_retries=cfg.max_retries,
+            retry_base_delay=cfg.retry_base_delay,
+            retry_max_delay=cfg.retry_max_delay,
+            retry_jitter=cfg.retry_jitter,
+            requests_per_second=cfg.requests_per_second,
+            rotate_fingerprint=cfg.rotate_fingerprint,
+        )
+        logger.info("Perplexity client initialized successfully")
     def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
         """Create a new conversation."""
-        return Conversation(self._http, config or ConversationConfig())
+        cfg = config or ConversationConfig()
+        logger.debug(
+            "Creating conversation | "
+            f"model={cfg.model} "
+            f"citation_mode={cfg.citation_mode} "
+            f"save_to_library={cfg.save_to_library} "
+            f"search_focus={cfg.search_focus} "
+            f"language={cfg.language}"
+        )
+        conversation = Conversation(self._http, cfg)
+        log_conversation_created(
+            f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
+            f"search_focus={cfg.search_focus}, language={cfg.language}"
+        )
+        logger.info("Conversation created successfully")
+        return conversation
     def close(self) -> None:
         """Close the client."""
+        logger.debug("Closing Perplexity client")
         self._http.close()
+        logger.info("Perplexity client closed")
     def __enter__(self) -> Perplexity:
         return self
@@ -90,6 +149,13 @@ class Conversation:
     )
     def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
+        logger.debug(
+            "Conversation.__init__ | "
+            f"model={config.model} "
+            f"citation_mode={config.citation_mode} "
+            f"save_to_library={config.save_to_library} "
+            f"search_focus={config.search_focus}"
+        )
         self._http = http
         self._config = config
         self._citation_mode = CitationMode.DEFAULT
@@ -101,6 +167,7 @@ class Conversation:
         self._search_results: list[SearchResultItem] = []
         self._raw_data: dict[str, Any] = {}
         self._stream_generator: Generator[Response, None, None] | None = None
+        logger.debug("Conversation initialized with empty state")
     @property
     def answer(self) -> str | None:
@@ -142,11 +209,29 @@ class Conversation:
     ) -> Conversation:
         """Ask a question. Returns self for method chaining or streaming iteration."""
+        logger.info(
+            "Conversation.ask called | "
+            f"query_length={len(query)} "
+            f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"citation_mode={citation_mode} "
+            f"stream={stream}"
+        )
         effective_model = model or self._config.model or Models.BEST
         effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
         self._citation_mode = effective_citation
+        logger.debug(
+            f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
+        )
+        log_query_sent(query, str(effective_model), bool(files))
         self._execute(query, effective_model, files, stream=stream)
+        logger.debug("Query execution completed")
         return self
     def _execute(
@@ -158,22 +243,49 @@ class Conversation:
     ) -> None:
         """Execute a query."""
+        logger.debug(
+            f"Executing query | "
+            f"query_length={len(query)} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"stream={stream} "
+            f"is_followup={self._backend_uuid is not None}"
+        )
         self._reset_response_state()
+        logger.debug("Response state reset")
         # Upload files
         file_urls: list[str] = []
         if files:
+            logger.debug(f"Validating {len(files)} files")
             validated = self._validate_files(files)
+            logger.debug(f"Validated {len(validated)} files, uploading...")
             file_urls = [self._upload_file(f) for f in validated]
+            logger.debug(f"Uploaded {len(file_urls)} files successfully")
         payload = self._build_payload(query, model, file_urls)
+        logger.debug(
+            f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
+        )
+        logger.debug("Initializing search session")
         self._http.init_search(query)
         if stream:
+            logger.debug("Starting streaming mode")
             self._stream_generator = self._stream(payload)
         else:
+            logger.debug("Starting complete mode (non-streaming)")
             self._complete(payload)
+            logger.debug(
+                f"Query completed | "
+                f"title={self._title} "
+                f"answer_length={len(self._answer) if self._answer else 0} "
+                f"chunks_count={len(self._chunks)} "
+                f"search_results_count={len(self._search_results)}"
+            )
     def _reset_response_state(self) -> None:
         self._title = None
@@ -237,8 +349,8 @@ class Conversation:
                         is_image=mimetype.startswith("image/"),
                     )
                 )
-            except FileValidationError:
-                raise
+            except FileValidationError as error:
+                raise error
             except (FileNotFoundError, PermissionError) as error:
                 raise FileValidationError(file_path, f"Cannot access file: {error}") from error
             except OSError as error:
@@ -356,12 +468,17 @@ class Conversation:
         return None
     def _process_data(self, data: dict[str, Any]) -> None:
+        """Process SSE data chunk and update conversation state."""
         if self._backend_uuid is None and "backend_uuid" in data:
             self._backend_uuid = data["backend_uuid"]
         if self._read_write_token is None and "read_write_token" in data:
             self._read_write_token = data["read_write_token"]
+        if self._title is None and "thread_title" in data:
+            self._title = data["thread_title"]
         if "blocks" in data:
             for block in data["blocks"]:
                 if block.get("intended_usage") == "web_results":
@@ -385,7 +502,15 @@ class Conversation:
         if isinstance(json_data, list):
             for item in json_data:
-                if item.get("step_type") == "FINAL":
+                step_type = item.get("step_type")
+                # Handle Research mode clarifying questions
+                if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
+                    questions = self._extract_clarifying_questions(item)
+                    raise ResearchClarifyingQuestionsError(questions)
+                if step_type == "FINAL":
                     raw_content = item.get("content", {})
                     answer_content = raw_content.get("answer")
@@ -400,7 +525,39 @@ class Conversation:
         elif isinstance(json_data, dict):
             self._update_state(data.get("thread_title"), json_data)
         else:
-            raise ValueError("Unexpected JSON structure in 'text' field")
+            raise ResponseParsingError(
+                "Unexpected JSON structure in 'text' field",
+                raw_data=str(json_data),
+            )
+    def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
+        """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
+        questions: list[str] = []
+        content = item.get("content", {})
+        # Try different possible structures for questions
+        if isinstance(content, dict):
+            if "questions" in content:
+                raw_questions = content["questions"]
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif "clarifying_questions" in content:
+                raw_questions = content["clarifying_questions"]
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif not questions:
+                for value in content.values():
+                    if isinstance(value, str) and "?" in value:
+                        questions.append(value)
+        elif isinstance(content, list):
+            questions = [str(q) for q in content if q]
+        elif isinstance(content, str):
+            questions = [content]
+        return questions
     def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
         self._title = title
@@ -426,7 +583,7 @@ class Conversation:
         chunks = answer_data.get("chunks", [])
         if chunks:
-            self._chunks = chunks
+            self._chunks = [self._format_citations(chunk) for chunk in chunks]
         self._raw_data = answer_data

perplexity_webui_scraper/enums.py CHANGED Viewed

@@ -6,7 +6,8 @@ from enum import Enum
 class CitationMode(str, Enum):
-    """Citation formatting modes for response text.
+    """
+    Citation formatting modes for response text.
     Controls how citation markers (e.g., [1], [2]) are formatted in the response.
     """
@@ -22,7 +23,8 @@ class CitationMode(str, Enum):
 class SearchFocus(str, Enum):
-    """Search focus types that control the type of search performed.
+    """
+    Search focus types that control the type of search performed.
     Determines whether to search the web or focus on writing tasks.
     """
@@ -35,7 +37,8 @@ class SearchFocus(str, Enum):
 class SourceFocus(str, Enum):
-    """Source focus types that control which sources to prioritize.
+    """
+    Source focus types that control which sources to prioritize.
     Can be combined (e.g., [SourceFocus.WEB, SourceFocus.ACADEMIC]) for multi-source searches.
     """
@@ -54,7 +57,8 @@ class SourceFocus(str, Enum):
 class TimeRange(str, Enum):
-    """Time range filters for search results.
+    """
+    Time range filters for search results.
     Controls how recent the sources should be.
     """
@@ -73,3 +77,29 @@ class TimeRange(str, Enum):
     LAST_YEAR = "YEAR"
     """Include sources from the last 365 days."""
+class LogLevel(str, Enum):
+    """
+    Logging level configuration.
+    Controls the verbosity of logging output. DISABLED is the default.
+    """
+    DISABLED = "DISABLED"
+    """Completely disable all logging output. This is the default."""
+    DEBUG = "DEBUG"
+    """Show all messages including internal debug information."""
+    INFO = "INFO"
+    """Show informational messages, warnings, and errors."""
+    WARNING = "WARNING"
+    """Show only warnings and errors."""
+    ERROR = "ERROR"
+    """Show only error messages."""
+    CRITICAL = "CRITICAL"
+    """Show only critical/fatal errors."""

perplexity_webui_scraper/exceptions.py CHANGED Viewed

@@ -3,6 +3,19 @@
 from __future__ import annotations
+__all__: list[str] = [
+    "AuthenticationError",
+    "CloudflareBlockError",
+    "FileUploadError",
+    "FileValidationError",
+    "PerplexityError",
+    "RateLimitError",
+    "ResearchClarifyingQuestionsError",
+    "ResponseParsingError",
+    "StreamingError",
+]
 class PerplexityError(Exception):
     """Base exception for all Perplexity-related errors."""
@@ -34,6 +47,25 @@ class RateLimitError(PerplexityError):
         )
+class CloudflareBlockError(PerplexityError):
+    """
+    Raised when Cloudflare blocks the request with a challenge page.
+    This typically means the request triggered Cloudflare's bot detection.
+    The client will automatically retry with fingerprint rotation, but if
+    this exception is raised, all retry attempts have failed.
+    """
+    def __init__(self, message: str | None = None) -> None:
+        super().__init__(
+            message
+            or "Cloudflare challenge detected. The request was blocked by Cloudflare's "
+            "bot protection. Try waiting a few minutes before retrying, or obtain a "
+            "fresh session token.",
+            status_code=403,
+        )
 class FileUploadError(PerplexityError):
     """Raised when file upload fails."""
@@ -48,3 +80,45 @@ class FileValidationError(PerplexityError):
     def __init__(self, file_path: str, reason: str) -> None:
         self.file_path = file_path
         super().__init__(f"File validation failed for '{file_path}': {reason}")
+class ResearchClarifyingQuestionsError(PerplexityError):
+    """
+    Raised when Research mode requires clarifying questions.
+    This library does not support programmatic interaction with clarifying questions.
+    Consider rephrasing your query to be more specific.
+    Attributes:
+        questions: List of clarifying questions from the API.
+    """
+    def __init__(self, questions: list[str]) -> None:
+        self.questions = questions
+        questions_text = "\n".join(f"  - {q}" for q in questions) if questions else "  (no questions provided)"
+        super().__init__(
+            f"Research mode is asking clarifying questions:\n{questions_text}\n\n"
+            "Programmatic interaction with clarifying questions is not supported. "
+            "Please rephrase your query to be more specific."
+        )
+class ResponseParsingError(PerplexityError):
+    """
+    Raised when the API response cannot be parsed.
+    Attributes:
+        raw_data: The raw data that failed to parse.
+    """
+    def __init__(self, message: str, raw_data: str | None = None) -> None:
+        self.raw_data = raw_data
+        super().__init__(f"Failed to parse API response: {message}")
+class StreamingError(PerplexityError):
+    """Raised when an error occurs during streaming."""
+    def __init__(self, message: str) -> None:
+        super().__init__(f"Streaming error: {message}")

perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl

perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl