perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perplexity_webui_scraper/__init__.py +5 -14
- perplexity_webui_scraper/cli/get_perplexity_session_token.py +24 -8
- perplexity_webui_scraper/config.py +33 -4
- perplexity_webui_scraper/constants.py +30 -10
- perplexity_webui_scraper/core.py +223 -21
- perplexity_webui_scraper/enums.py +91 -19
- perplexity_webui_scraper/exceptions.py +77 -1
- perplexity_webui_scraper/http.py +374 -38
- perplexity_webui_scraper/limits.py +12 -4
- perplexity_webui_scraper/logging.py +278 -0
- perplexity_webui_scraper/mcp/__init__.py +20 -0
- perplexity_webui_scraper/mcp/__main__.py +11 -0
- perplexity_webui_scraper/mcp/server.py +166 -0
- perplexity_webui_scraper/models.py +55 -19
- perplexity_webui_scraper/resilience.py +181 -0
- perplexity_webui_scraper/types.py +15 -5
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/METADATA +97 -7
- perplexity_webui_scraper-0.3.6.dist-info/RECORD +21 -0
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/WHEEL +1 -1
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/entry_points.txt +1 -0
- perplexity_webui_scraper-0.3.4.dist-info/RECORD +0 -16
perplexity_webui_scraper/__init__.py
CHANGED

@@ -1,36 +1,27 @@
-"""
+"""
+Extract AI responses from Perplexity's web interface.
+"""
 
 from importlib import metadata
 
 from .config import ClientConfig, ConversationConfig
 from .core import Conversation, Perplexity
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
-from .exceptions import (
-    AuthenticationError,
-    FileUploadError,
-    FileValidationError,
-    PerplexityError,
-    RateLimitError,
-)
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 from .models import Model, Models
 from .types import Coordinates, Response, SearchResultItem
 
 
 __version__: str = metadata.version("perplexity-webui-scraper")
 __all__: list[str] = [
-    "AuthenticationError",
     "CitationMode",
     "ClientConfig",
     "Conversation",
     "ConversationConfig",
     "Coordinates",
-    "FileUploadError",
-    "FileValidationError",
+    "LogLevel",
     "Model",
     "Models",
     "Perplexity",
-    "PerplexityError",
-    "RateLimitError",
     "Response",
     "SearchFocus",
     "SearchResultItem",
perplexity_webui_scraper/cli/get_perplexity_session_token.py
CHANGED

@@ -1,4 +1,6 @@
-"""
+"""
+CLI utility for secure Perplexity authentication and session extraction.
+"""
 
 from __future__ import annotations
 
@@ -57,7 +59,9 @@ def update_env(token: str) -> bool:
 
 
 def _initialize_session() -> tuple[Session, str]:
-    """
+    """
+    Initialize session and obtain CSRF token.
+    """
 
     session = Session(impersonate="chrome", headers={"Referer": BASE_URL, "Origin": BASE_URL})
 
@@ -73,7 +77,9 @@ def _initialize_session() -> tuple[Session, str]:
 
 
 def _request_verification_code(session: Session, csrf: str, email: str) -> None:
-    """
+    """
+    Send verification code to user's email.
+    """
 
     with console.status("[bold green]Sending verification code...", spinner="dots"):
         r = session.post(
@@ -92,7 +98,9 @@ def _request_verification_code(session: Session, csrf: str, email: str) -> None:
 
 
 def _validate_and_get_redirect_url(session: Session, email: str, user_input: str) -> str:
-    """
+    """
+    Validate user input (OTP or magic link) and return redirect URL.
+    """
 
     with console.status("[bold green]Validating...", spinner="dots"):
         if user_input.startswith("http"):
@@ -120,7 +128,9 @@ def _validate_and_get_redirect_url(session: Session, email: str, user_input: str
 
 
 def _extract_session_token(session: Session, redirect_url: str) -> str:
-    """
+    """
+    Extract session token from cookies after authentication.
+    """
 
     session.get(redirect_url)
     token = session.cookies.get("__Secure-next-auth.session-token")
@@ -132,7 +142,9 @@ def _extract_session_token(session: Session, redirect_url: str) -> str:
 
 
 def _display_and_save_token(token: str) -> None:
-    """
+    """
+    Display token and optionally save to .env file.
+    """
 
     console.print("\n[bold green]✅ Token generated successfully![/bold green]")
     console.print(f"\n[bold white]Your session token:[/bold white]\n[green]{token}[/green]\n")
@@ -147,7 +159,9 @@ def _display_and_save_token(token: str) -> None:
 
 
 def _show_header() -> None:
-    """
+    """
+    Display welcome header.
+    """
 
     console.print(
         Panel(
@@ -161,7 +175,9 @@ def _show_header() -> None:
 
 
 def _show_exit_message() -> None:
-    """
+    """
+    Display security note and wait for user to exit.
+    """
 
     console.print("\n[bold yellow]⚠️ Security Note:[/bold yellow]")
     console.print("Press [bold white]ENTER[/bold white] to clear screen and exit.")
perplexity_webui_scraper/config.py
CHANGED

@@ -1,21 +1,27 @@
-"""
+"""
+Configuration classes.
+"""
 
 from __future__ import annotations
 
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 
 
 if TYPE_CHECKING:
+    from pathlib import Path
+
     from .models import Model
     from .types import Coordinates
 
 
 @dataclass(slots=True)
 class ConversationConfig:
-    """
+    """
+    Default settings for a conversation. Can be overridden per message.
+    """
 
     model: Model | None = None
     citation_mode: CitationMode = CitationMode.CLEAN
@@ -30,7 +36,30 @@ class ConversationConfig:
 
 @dataclass(frozen=True, slots=True)
 class ClientConfig:
-    """
+    """
+    HTTP client settings.
+
+    Attributes:
+        timeout: Request timeout in seconds.
+        impersonate: Browser to impersonate (e.g., "chrome", "edge", "safari").
+        max_retries: Maximum retry attempts for failed requests.
+        retry_base_delay: Initial delay in seconds before first retry.
+        retry_max_delay: Maximum delay between retries.
+        retry_jitter: Random jitter factor (0-1) to add to delays.
+        requests_per_second: Rate limit for requests (0 to disable).
+        rotate_fingerprint: Whether to rotate browser fingerprint on retries.
+        logging_level: Logging verbosity level. Default is DISABLED.
+        log_file: Optional file path for persistent logging. If set, logs go to file only.
+            If None, logs go to console. All logs are appended.
+    """
 
     timeout: int = 3600
     impersonate: str = "chrome"
+    max_retries: int = 3
+    retry_base_delay: float = 1.0
+    retry_max_delay: float = 60.0
+    retry_jitter: float = 0.5
+    requests_per_second: float = 0.5
+    rotate_fingerprint: bool = True
+    logging_level: LogLevel = LogLevel.DISABLED
+    log_file: str | Path | None = None
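For orientation, a minimal usage sketch of the ClientConfig fields introduced above (not taken from the package docs). The field names and defaults come from this diff; the concrete values are illustrative, and the LogLevel.DEBUG member is assumed from the logging additions (only LogLevel.DISABLED is shown here).

    from perplexity_webui_scraper import ClientConfig, LogLevel, Perplexity

    config = ClientConfig(
        timeout=600,
        max_retries=5,                # retry failed requests up to 5 times
        retry_base_delay=2.0,         # exponential backoff starting at 2 s
        retry_max_delay=30.0,
        requests_per_second=0.25,     # throttle to one request every 4 s (0 disables)
        rotate_fingerprint=True,
        logging_level=LogLevel.DEBUG,     # assumed member; the default is LogLevel.DISABLED
        log_file="perplexity_scraper.log",
    )

    client = Perplexity(session_token="<your __Secure-next-auth.session-token>", config=config)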
perplexity_webui_scraper/constants.py
CHANGED

@@ -1,4 +1,6 @@
-"""
+"""
+Constants and values for the Perplexity internal API and HTTP interactions.
+"""
 
 from __future__ import annotations
 
@@ -8,20 +10,30 @@ from typing import Final
 
 # API Configuration
 API_VERSION: Final[str] = "2.18"
-"""
+"""
+Current API version used by Perplexity WebUI.
+"""
 
 API_BASE_URL: Final[str] = "https://www.perplexity.ai"
-"""
+"""
+Base URL for all API requests.
+"""
 
 # API Endpoints
 ENDPOINT_ASK: Final[str] = "/rest/sse/perplexity_ask"
-"""
+"""
+SSE endpoint for sending prompts.
+"""
 
 ENDPOINT_SEARCH_INIT: Final[str] = "/search/new"
-"""
+"""
+Endpoint to initialize a search session.
+"""
 
 ENDPOINT_UPLOAD: Final[str] = "/rest/uploads/batch_create_upload_urls"
-"""
+"""
+Endpoint for file upload URL generation.
+"""
 
 # API Fixed Parameters
 SEND_BACK_TEXT: Final[bool] = True
@@ -33,10 +45,14 @@ False = API sends delta chunks only (accumulate mode).
 """
 
 USE_SCHEMATIZED_API: Final[bool] = False
-"""
+"""
+Whether to use the schematized API format.
+"""
 
 PROMPT_SOURCE: Final[str] = "user"
-"""
+"""
+Source identifier for prompts.
+"""
 
 # Regex Patterns (Pre-compiled for performance in streaming parsing)
 CITATION_PATTERN: Final[Pattern[str]] = compile(r"\[(\d{1,2})\]")
@@ -47,7 +63,9 @@ Uses word boundary to avoid matching things like [123].
 """
 
 JSON_OBJECT_PATTERN: Final[Pattern[str]] = compile(r"^\{.*\}$")
-"""
+"""
+Pattern to detect JSON object strings.
+"""
 
 # HTTP Headers
 DEFAULT_HEADERS: Final[dict[str, str]] = {
@@ -61,4 +79,6 @@ Referer and Origin are added dynamically based on BASE_URL.
 """
 
 SESSION_COOKIE_NAME: Final[str] = "__Secure-next-auth.session-token"
-"""
+"""
+Name of the session cookie used for authentication.
+"""
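As a quick illustration of the citation pattern documented above (this snippet is not part of the package), the pre-compiled regex matches bracketed one- or two-digit markers and can be used with sub() to strip or rewrite them; longer numbers such as [123] are left alone:

    from re import compile

    CITATION_PATTERN = compile(r"\[(\d{1,2})\]")  # same pattern as constants.py

    text = "The answer is 42.[1][2] See also [123]."
    print(CITATION_PATTERN.sub("", text))              # The answer is 42. See also [123].
    print(CITATION_PATTERN.sub(r"(source \1)", text))  # The answer is 42.(source 1)(source 2) See also [123].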
perplexity_webui_scraper/core.py
CHANGED

@@ -1,4 +1,6 @@
-"""
+"""
+Core client implementation.
+"""
 
 from __future__ import annotations
 
@@ -8,6 +10,8 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from uuid import uuid4
 
+from curl_cffi import CurlMime
+from curl_cffi.requests import Session
 from orjson import JSONDecodeError, loads
 
 
@@ -26,20 +30,25 @@ from .constants import (
     USE_SCHEMATIZED_API,
 )
 from .enums import CitationMode
-from .exceptions import FileUploadError, FileValidationError
+from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
 from .http import HTTPClient
 from .limits import MAX_FILE_SIZE, MAX_FILES
+from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
 from .models import Model, Models
 from .types import Response, SearchResultItem, _FileInfo
 
 
+logger = get_logger(__name__)
+
+
 class Perplexity:
     """Web scraper for Perplexity AI conversations."""
 
     __slots__ = ("_http",)
 
     def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
-        """
+        """
+        Initialize web scraper with session token.
 
         Args:
             session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,17 +62,71 @@ class Perplexity:
             raise ValueError("session_token cannot be empty")
 
         cfg = config or ClientConfig()
-
+
+        # Configure logging based on config
+        configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
+
+        logger.info(
+            "Perplexity client initializing | "
+            f"session_token_length={len(session_token)} "
+            f"logging_level={cfg.logging_level.value} "
+            f"log_file={cfg.log_file}"
+        )
+        logger.debug(
+            "Client configuration | "
+            f"timeout={cfg.timeout}s "
+            f"impersonate={cfg.impersonate} "
+            f"max_retries={cfg.max_retries} "
+            f"retry_base_delay={cfg.retry_base_delay}s "
+            f"retry_max_delay={cfg.retry_max_delay}s "
+            f"retry_jitter={cfg.retry_jitter} "
+            f"requests_per_second={cfg.requests_per_second} "
+            f"rotate_fingerprint={cfg.rotate_fingerprint}"
+        )
+
+        self._http = HTTPClient(
+            session_token,
+            timeout=cfg.timeout,
+            impersonate=cfg.impersonate,
+            max_retries=cfg.max_retries,
+            retry_base_delay=cfg.retry_base_delay,
+            retry_max_delay=cfg.retry_max_delay,
+            retry_jitter=cfg.retry_jitter,
+            requests_per_second=cfg.requests_per_second,
+            rotate_fingerprint=cfg.rotate_fingerprint,
+        )
+
+        logger.info("Perplexity client initialized successfully")
 
     def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
         """Create a new conversation."""
 
-
+        cfg = config or ConversationConfig()
+        logger.debug(
+            "Creating conversation | "
+            f"model={cfg.model} "
+            f"citation_mode={cfg.citation_mode} "
+            f"save_to_library={cfg.save_to_library} "
+            f"search_focus={cfg.search_focus} "
+            f"language={cfg.language}"
+        )
+
+        conversation = Conversation(self._http, cfg)
+
+        log_conversation_created(
+            f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
+            f"search_focus={cfg.search_focus}, language={cfg.language}"
+        )
+        logger.info("Conversation created successfully")
+
+        return conversation
 
     def close(self) -> None:
         """Close the client."""
 
+        logger.debug("Closing Perplexity client")
         self._http.close()
+        logger.info("Perplexity client closed")
 
     def __enter__(self) -> Perplexity:
         return self
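To make the lifecycle in this hunk concrete, a brief usage sketch (not from the package docs): create_conversation() and close() appear above, __enter__ returns self, and a matching __exit__ that calls close() is assumed for context-manager use. The token value is a placeholder.

    from perplexity_webui_scraper import ClientConfig, Perplexity

    with Perplexity("<session-token>", config=ClientConfig()) as client:
        conversation = client.create_conversation()
        # ... ask questions here; the underlying HTTP client is closed on exit.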
@@ -90,6 +153,13 @@ class Conversation:
     )
 
     def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
+        logger.debug(
+            "Conversation.__init__ | "
+            f"model={config.model} "
+            f"citation_mode={config.citation_mode} "
+            f"save_to_library={config.save_to_library} "
+            f"search_focus={config.search_focus}"
+        )
         self._http = http
         self._config = config
         self._citation_mode = CitationMode.DEFAULT
@@ -101,6 +171,7 @@ class Conversation:
         self._search_results: list[SearchResultItem] = []
         self._raw_data: dict[str, Any] = {}
         self._stream_generator: Generator[Response, None, None] | None = None
+        logger.debug("Conversation initialized with empty state")
 
     @property
     def answer(self) -> str | None:
@@ -142,11 +213,29 @@ class Conversation:
     ) -> Conversation:
         """Ask a question. Returns self for method chaining or streaming iteration."""
 
+        logger.info(
+            "Conversation.ask called | "
+            f"query_length={len(query)} "
+            f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"citation_mode={citation_mode} "
+            f"stream={stream}"
+        )
+
         effective_model = model or self._config.model or Models.BEST
         effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
         self._citation_mode = effective_citation
+
+        logger.debug(
+            f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
+        )
+
+        log_query_sent(query, str(effective_model), bool(files))
         self._execute(query, effective_model, files, stream=stream)
 
+        logger.debug("Query execution completed")
+
         return self
 
     def _execute(
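Continuing the earlier sketch, ask() returns the conversation itself, so the answer property (defined earlier in this file) can be read after a non-streaming call. The query text is illustrative; streaming via stream=True is only hinted at by the docstring above and is not demonstrated here.

    conversation = client.create_conversation()
    conversation.ask("Summarize the changes in Python 3.13", stream=False)

    print(conversation.answer)   # full answer text, or None if nothing was parsed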
@@ -158,22 +247,49 @@ class Conversation:
     ) -> None:
         """Execute a query."""
 
+        logger.debug(
+            f"Executing query | "
+            f"query_length={len(query)} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"stream={stream} "
+            f"is_followup={self._backend_uuid is not None}"
+        )
+
         self._reset_response_state()
+        logger.debug("Response state reset")
 
         # Upload files
         file_urls: list[str] = []
 
         if files:
+            logger.debug(f"Validating {len(files)} files")
             validated = self._validate_files(files)
+            logger.debug(f"Validated {len(validated)} files, uploading...")
             file_urls = [self._upload_file(f) for f in validated]
+            logger.debug(f"Uploaded {len(file_urls)} files successfully")
 
         payload = self._build_payload(query, model, file_urls)
+        logger.debug(
+            f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
+        )
+
+        logger.debug("Initializing search session")
         self._http.init_search(query)
 
         if stream:
+            logger.debug("Starting streaming mode")
             self._stream_generator = self._stream(payload)
         else:
+            logger.debug("Starting complete mode (non-streaming)")
             self._complete(payload)
+            logger.debug(
+                f"Query completed | "
+                f"title={self._title} "
+                f"answer_length={len(self._answer) if self._answer else 0} "
+                f"chunks_count={len(self._chunks)} "
+                f"search_results_count={len(self._search_results)}"
+            )
 
     def _reset_response_state(self) -> None:
         self._title = None
@@ -237,8 +353,8 @@ class Conversation:
                     is_image=mimetype.startswith("image/"),
                 )
             )
-        except FileValidationError:
-            raise
+        except FileValidationError as error:
+            raise error
         except (FileNotFoundError, PermissionError) as error:
             raise FileValidationError(file_path, f"Cannot access file: {error}") from error
         except OSError as error:
@@ -264,16 +380,55 @@ class Conversation:
         try:
             response = self._http.post(ENDPOINT_UPLOAD, json=json_data)
             response_data = response.json()
-
+            result = response_data.get("results", {}).get(file_uuid, {})
+
+            s3_bucket_url = result.get("s3_bucket_url")
+            s3_object_url = result.get("s3_object_url")
+            fields = result.get("fields", {})
 
-            if not
+            if not s3_object_url:
                 raise FileUploadError(file_info.path, "No upload URL returned")
 
-
+            if not s3_bucket_url or not fields:
+                raise FileUploadError(file_info.path, "Missing S3 upload credentials")
+
+            # Upload the file to S3 using presigned POST
+            file_path = Path(file_info.path)
+
+            with file_path.open("rb") as f:
+                file_content = f.read()
+
+            # Build multipart form data using CurlMime
+            # For S3 presigned POST, form fields must come before the file
+            mime = CurlMime()
+
+            for field_name, field_value in fields.items():
+                mime.addpart(name=field_name, data=field_value)
+
+            mime.addpart(
+                name="file",
+                content_type=file_info.mimetype,
+                filename=file_path.name,
+                data=file_content,
+            )
+
+            # S3 requires a clean session
+            with Session() as s3_session:
+                upload_response = s3_session.post(s3_bucket_url, multipart=mime)
+
+            mime.close()
+
+            if upload_response.status_code not in (200, 201, 204):
+                raise FileUploadError(
+                    file_info.path,
+                    f"S3 upload failed with status {upload_response.status_code}: {upload_response.text}",
+                )
+
+            return s3_object_url
         except FileUploadError as error:
             raise error
-        except Exception as
-            raise FileUploadError(file_info.path, str(
+        except Exception as error:
+            raise FileUploadError(file_info.path, str(error)) from error
 
     def _build_payload(
         self,
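Because the upload path above can now fail at several distinct points (missing presigned-POST fields, an S3 rejection, or local validation), a small defensive-usage sketch follows; it continues the conversation from the earlier example, and passing file paths as plain strings is an assumption based on the Path(file_info.path) handling in this hunk.

    from perplexity_webui_scraper.exceptions import FileUploadError, FileValidationError

    try:
        conversation.ask("What does the attached report conclude?", files=["./report.pdf"])
    except FileValidationError as error:
        print(f"Rejected before upload (size, type, or access): {error}")
    except FileUploadError as error:
        print(f"Presigned-URL request or S3 upload failed: {error}")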
@@ -348,20 +503,26 @@ class Conversation:
         return CITATION_PATTERN.sub(replacer, text)
 
     def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
-
+        if isinstance(line, bytes) and line.startswith(b"data: "):
+            return loads(line[6:])
 
-        if
+        if isinstance(line, str) and line.startswith("data: "):
             return loads(line[6:])
 
         return None
 
     def _process_data(self, data: dict[str, Any]) -> None:
+        """Process SSE data chunk and update conversation state."""
+
         if self._backend_uuid is None and "backend_uuid" in data:
             self._backend_uuid = data["backend_uuid"]
 
         if self._read_write_token is None and "read_write_token" in data:
             self._read_write_token = data["read_write_token"]
 
+        if self._title is None and "thread_title" in data:
+            self._title = data["thread_title"]
+
         if "blocks" in data:
             for block in data["blocks"]:
                 if block.get("intended_usage") == "web_results":
@@ -376,16 +537,24 @@ class Conversation:
 
         try:
             json_data = loads(data["text"])
-        except KeyError as
-            raise ValueError("Missing 'text' field in data") from
-        except JSONDecodeError as
-            raise ValueError("Invalid JSON in 'text' field") from
+        except KeyError as error:
+            raise ValueError("Missing 'text' field in data") from error
+        except JSONDecodeError as error:
+            raise ValueError("Invalid JSON in 'text' field") from error
 
         answer_data: dict[str, Any] = {}
 
         if isinstance(json_data, list):
             for item in json_data:
-
+                step_type = item.get("step_type")
+
+                # Handle Research mode clarifying questions
+                if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
+                    questions = self._extract_clarifying_questions(item)
+
+                    raise ResearchClarifyingQuestionsError(questions)
+
+                if step_type == "FINAL":
                     raw_content = item.get("content", {})
                     answer_content = raw_content.get("answer")
 
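Research-style prompts can now surface clarifying questions instead of a final answer, raised as ResearchClarifyingQuestionsError. A hedged handling sketch follows (continuing the earlier conversation); the exception is constructed with the extracted question list above, but how it exposes them, attribute versus message, lives in exceptions.py and is not shown in this diff.

    from perplexity_webui_scraper.exceptions import ResearchClarifyingQuestionsError

    try:
        conversation.ask("Prepare a deep research report on small modular reactors")
    except ResearchClarifyingQuestionsError as error:
        # Fall back to the string form; the question list may also be available
        # as an attribute on the exception (not shown in this diff).
        print(f"Perplexity wants clarification before researching: {error}")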
@@ -400,7 +569,39 @@ class Conversation:
         elif isinstance(json_data, dict):
             self._update_state(data.get("thread_title"), json_data)
         else:
-            raise
+            raise ResponseParsingError(
+                "Unexpected JSON structure in 'text' field",
+                raw_data=str(json_data),
+            )
+
+    def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
+        """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
+
+        questions: list[str] = []
+        content = item.get("content", {})
+
+        # Try different possible structures for questions
+        if isinstance(content, dict):
+            if "questions" in content:
+                raw_questions = content["questions"]
+
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif "clarifying_questions" in content:
+                raw_questions = content["clarifying_questions"]
+
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif not questions:
+                for value in content.values():
+                    if isinstance(value, str) and "?" in value:
+                        questions.append(value)
+        elif isinstance(content, list):
+            questions = [str(q) for q in content if q]
+        elif isinstance(content, str):
+            questions = [content]
+
+        return questions
 
     def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
         self._title = title
@@ -426,7 +627,8 @@ class Conversation:
         chunks = answer_data.get("chunks", [])
 
         if chunks:
-            self.
+            formatted = [self._format_citations(chunk) for chunk in chunks if chunk is not None]
+            self._chunks = [c for c in formatted if c is not None]
 
         self._raw_data = answer_data
 