PyPI - perplexity-webui-scraper - Versions diffs - 0.3.7__py3-none-any.whl → 0.4.0__py3-none-any.whl - Mend

perplexity-webui-scraper 0.3.7py3-none-any.whl → 0.4.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

perplexity_webui_scraper/__init__.py +24 -3
perplexity_webui_scraper/cli/get_perplexity_session_token.py +21 -53
perplexity_webui_scraper/config.py +12 -29
perplexity_webui_scraper/constants.py +13 -51
perplexity_webui_scraper/core.py +18 -154
perplexity_webui_scraper/enums.py +26 -88
perplexity_webui_scraper/exceptions.py +29 -50
perplexity_webui_scraper/http.py +39 -332
perplexity_webui_scraper/limits.py +6 -16
perplexity_webui_scraper/logging.py +23 -180
perplexity_webui_scraper/mcp/__init__.py +2 -8
perplexity_webui_scraper/mcp/__main__.py +1 -3
perplexity_webui_scraper/mcp/server.py +105 -82
perplexity_webui_scraper/models.py +27 -71
perplexity_webui_scraper/resilience.py +17 -100
perplexity_webui_scraper/types.py +18 -25
{perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.0.dist-info}/METADATA +120 -101
perplexity_webui_scraper-0.4.0.dist-info/RECORD +21 -0
{perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.0.dist-info}/WHEEL +1 -1
perplexity_webui_scraper-0.3.7.dist-info/RECORD +0 -21
{perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.0.dist-info}/entry_points.txt +0 -0

perplexity_webui_scraper/core.py CHANGED Viewed

@@ -1,6 +1,4 @@
-"""
-Core client implementation.
-"""
+"""Core client implementation."""
 from __future__ import annotations
@@ -33,7 +31,7 @@ from .enums import CitationMode
 from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
 from .http import HTTPClient
 from .limits import MAX_FILE_SIZE, MAX_FILES
-from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
+from .logging import configure_logging, get_logger
 from .models import Model, Models
 from .types import Response, SearchResultItem, _FileInfo
@@ -47,43 +45,14 @@ class Perplexity:
     __slots__ = ("_http",)
     def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
-        """
-        Initialize web scraper with session token.
-        Args:
-            session_token: Perplexity session cookie (__Secure-next-auth.session-token).
-            config: Optional HTTP client configuration.
-        Raises:
-            ValueError: If session_token is empty or whitespace.
-        """
+        """Initialize with session token."""
         if not session_token or not session_token.strip():
             raise ValueError("session_token cannot be empty")
         cfg = config or ClientConfig()
-        # Configure logging based on config
         configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
-        logger.info(
-            "Perplexity client initializing | "
-            f"session_token_length={len(session_token)} "
-            f"logging_level={cfg.logging_level.value} "
-            f"log_file={cfg.log_file}"
-        )
-        logger.debug(
-            "Client configuration | "
-            f"timeout={cfg.timeout}s "
-            f"impersonate={cfg.impersonate} "
-            f"max_retries={cfg.max_retries} "
-            f"retry_base_delay={cfg.retry_base_delay}s "
-            f"retry_max_delay={cfg.retry_max_delay}s "
-            f"retry_jitter={cfg.retry_jitter} "
-            f"requests_per_second={cfg.requests_per_second} "
-            f"rotate_fingerprint={cfg.rotate_fingerprint}"
-        )
         self._http = HTTPClient(
             session_token,
             timeout=cfg.timeout,
@@ -96,37 +65,17 @@ class Perplexity:
             rotate_fingerprint=cfg.rotate_fingerprint,
         )
-        logger.info("Perplexity client initialized successfully")
+        logger.info("Perplexity client initialized")
     def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
         """Create a new conversation."""
-        cfg = config or ConversationConfig()
-        logger.debug(
-            "Creating conversation | "
-            f"model={cfg.model} "
-            f"citation_mode={cfg.citation_mode} "
-            f"save_to_library={cfg.save_to_library} "
-            f"search_focus={cfg.search_focus} "
-            f"language={cfg.language}"
-        )
-        conversation = Conversation(self._http, cfg)
-        log_conversation_created(
-            f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
-            f"search_focus={cfg.search_focus}, language={cfg.language}"
-        )
-        logger.info("Conversation created successfully")
-        return conversation
+        return Conversation(self._http, config or ConversationConfig())
     def close(self) -> None:
         """Close the client."""
-        logger.debug("Closing Perplexity client")
         self._http.close()
-        logger.info("Perplexity client closed")
     def __enter__(self) -> Perplexity:
         return self
@@ -153,13 +102,6 @@ class Conversation:
     )
     def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
-        logger.debug(
-            "Conversation.__init__ | "
-            f"model={config.model} "
-            f"citation_mode={config.citation_mode} "
-            f"save_to_library={config.save_to_library} "
-            f"search_focus={config.search_focus}"
-        )
         self._http = http
         self._config = config
         self._citation_mode = CitationMode.DEFAULT
@@ -171,7 +113,6 @@ class Conversation:
         self._search_results: list[SearchResultItem] = []
         self._raw_data: dict[str, Any] = {}
         self._stream_generator: Generator[Response, None, None] | None = None
-        logger.debug("Conversation initialized with empty state")
     @property
     def answer(self) -> str | None:
@@ -200,7 +141,6 @@ class Conversation:
     def __iter__(self) -> Generator[Response, None, None]:
         if self._stream_generator is not None:
             yield from self._stream_generator
             self._stream_generator = None
     def ask(
@@ -213,29 +153,11 @@ class Conversation:
     ) -> Conversation:
         """Ask a question. Returns self for method chaining or streaming iteration."""
-        logger.info(
-            "Conversation.ask called | "
-            f"query_length={len(query)} "
-            f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
-            f"model={model} "
-            f"files_count={len(files) if files else 0} "
-            f"citation_mode={citation_mode} "
-            f"stream={stream}"
-        )
         effective_model = model or self._config.model or Models.BEST
         effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
         self._citation_mode = effective_citation
-        logger.debug(
-            f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
-        )
-        log_query_sent(query, str(effective_model), bool(files))
         self._execute(query, effective_model, files, stream=stream)
-        logger.debug("Query execution completed")
         return self
     def _execute(
@@ -247,49 +169,20 @@ class Conversation:
     ) -> None:
         """Execute a query."""
-        logger.debug(
-            f"Executing query | "
-            f"query_length={len(query)} "
-            f"model={model} "
-            f"files_count={len(files) if files else 0} "
-            f"stream={stream} "
-            f"is_followup={self._backend_uuid is not None}"
-        )
         self._reset_response_state()
-        logger.debug("Response state reset")
-        # Upload files
         file_urls: list[str] = []
         if files:
-            logger.debug(f"Validating {len(files)} files")
             validated = self._validate_files(files)
-            logger.debug(f"Validated {len(validated)} files, uploading...")
             file_urls = [self._upload_file(f) for f in validated]
-            logger.debug(f"Uploaded {len(file_urls)} files successfully")
         payload = self._build_payload(query, model, file_urls)
-        logger.debug(
-            f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
-        )
-        logger.debug("Initializing search session")
         self._http.init_search(query)
         if stream:
-            logger.debug("Starting streaming mode")
             self._stream_generator = self._stream(payload)
         else:
-            logger.debug("Starting complete mode (non-streaming)")
             self._complete(payload)
-            logger.debug(
-                f"Query completed | "
-                f"title={self._title} "
-                f"answer_length={len(self._answer) if self._answer else 0} "
-                f"chunks_count={len(self._chunks)} "
-                f"search_results_count={len(self._search_results)}"
-            )
     def _reset_response_state(self) -> None:
         self._title = None
@@ -309,7 +202,6 @@ class Conversation:
         for item in files:
             if item and isinstance(item, (str, PathLike)):
                 path = Path(item).resolve()
                 if path.as_posix() not in seen:
                     seen.add(path.as_posix())
                     file_list.append(path)
@@ -338,7 +230,6 @@ class Conversation:
                         file_path,
                         f"File exceeds 50MB limit: {file_size / (1024 * 1024):.1f}MB",
                     )
                 if file_size == 0:
                     raise FileValidationError(file_path, "File is empty")
@@ -388,18 +279,13 @@ class Conversation:
             if not s3_object_url:
                 raise FileUploadError(file_info.path, "No upload URL returned")
             if not s3_bucket_url or not fields:
                 raise FileUploadError(file_info.path, "Missing S3 upload credentials")
-            # Upload the file to S3 using presigned POST
             file_path = Path(file_info.path)
             with file_path.open("rb") as f:
                 file_content = f.read()
-            # Build multipart form data using CurlMime
-            # For S3 presigned POST, form fields must come before the file
             mime = CurlMime()
             for field_name, field_value in fields.items():
@@ -412,7 +298,6 @@ class Conversation:
                 data=file_content,
             )
-            # S3 requires a clean session
             with Session() as s3_session:
                 upload_response = s3_session.post(s3_bucket_url, multipart=mime)
@@ -471,7 +356,6 @@ class Conversation:
         if self._backend_uuid is not None:
             params["last_backend_uuid"] = self._backend_uuid
             params["query_source"] = "followup"
             if self._read_write_token:
                 params["read_write_token"] = self._read_write_token
@@ -483,7 +367,6 @@ class Conversation:
         def replacer(m: Match[str]) -> str:
             num = m.group(1)
             if not num.isdigit():
                 return m.group(0)
@@ -491,10 +374,8 @@ class Conversation:
                 return ""
             idx = int(num) - 1
             if 0 <= idx < len(self._search_results):
                 url = self._search_results[idx].url or ""
                 if self._citation_mode == CitationMode.MARKDOWN and url:
                     return f"[{num}]({url})"
@@ -505,7 +386,6 @@ class Conversation:
     def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
         if isinstance(line, bytes) and line.startswith(b"data: "):
             return loads(line[6:])
         if isinstance(line, str) and line.startswith("data: "):
             return loads(line[6:])
@@ -514,24 +394,15 @@ class Conversation:
     def _process_data(self, data: dict[str, Any]) -> None:
         """Process SSE data chunk and update conversation state."""
-        if self._backend_uuid is None and "backend_uuid" in data:
+        if "backend_uuid" in data:
             self._backend_uuid = data["backend_uuid"]
-        if self._read_write_token is None and "read_write_token" in data:
+        if "read_write_token" in data:
             self._read_write_token = data["read_write_token"]
-        if self._title is None and "thread_title" in data:
+        if data.get("thread_title"):
             self._title = data["thread_title"]
-        if "blocks" in data:
-            for block in data["blocks"]:
-                if block.get("intended_usage") == "web_results":
-                    diff = block.get("diff_block", {})
-                    for patch in diff.get("patches", []):
-                        if patch.get("op") == "replace" and patch.get("path") == "/web_results":
-                            pass
         if "text" not in data and "blocks" not in data:
             return None
@@ -548,10 +419,8 @@ class Conversation:
             for item in json_data:
                 step_type = item.get("step_type")
-                # Handle Research mode clarifying questions
                 if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
                     questions = self._extract_clarifying_questions(item)
                     raise ResearchClarifyingQuestionsError(questions)
                 if step_type == "FINAL":
@@ -563,11 +432,14 @@ class Conversation:
                     else:
                         answer_data = raw_content
-                    self._update_state(data.get("thread_title"), answer_data)
+                    title = data.get("thread_title") or answer_data.get("thread_title")
+                    self._update_state(title, answer_data)
                     break
         elif isinstance(json_data, dict):
-            self._update_state(data.get("thread_title"), json_data)
+            title = data.get("thread_title") or json_data.get("thread_title")
+            self._update_state(title, json_data)
         else:
             raise ResponseParsingError(
                 "Unexpected JSON structure in 'text' field",
@@ -580,34 +452,33 @@ class Conversation:
         questions: list[str] = []
         content = item.get("content", {})
-        # Try different possible structures for questions
         if isinstance(content, dict):
             if "questions" in content:
                 raw_questions = content["questions"]
                 if isinstance(raw_questions, list):
                     questions = [str(q) for q in raw_questions if q]
             elif "clarifying_questions" in content:
                 raw_questions = content["clarifying_questions"]
                 if isinstance(raw_questions, list):
                     questions = [str(q) for q in raw_questions if q]
             elif not questions:
                 for value in content.values():
                     if isinstance(value, str) and "?" in value:
                         questions.append(value)
         elif isinstance(content, list):
             questions = [str(q) for q in content if q]
         elif isinstance(content, str):
             questions = [content]
         return questions
     def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
-        self._title = title
+        if title is not None:
+            self._title = title
         web_results = answer_data.get("web_results", [])
         if web_results:
             self._search_results = [
                 SearchResultItem(
@@ -620,12 +491,10 @@ class Conversation:
             ]
         answer_text = answer_data.get("answer")
         if answer_text is not None:
             self._answer = self._format_citations(answer_text)
         chunks = answer_data.get("chunks", [])
         if chunks:
             formatted = [self._format_citations(chunk) for chunk in chunks if chunk is not None]
             self._chunks = [c for c in formatted if c is not None]
@@ -646,21 +515,16 @@ class Conversation:
     def _complete(self, payload: dict[str, Any]) -> None:
         for line in self._http.stream_ask(payload):
             data = self._parse_line(line)
             if data:
                 self._process_data(data)
                 if data.get("final"):
                     break
     def _stream(self, payload: dict[str, Any]) -> Generator[Response, None, None]:
         for line in self._http.stream_ask(payload):
             data = self._parse_line(line)
             if data:
                 self._process_data(data)
                 yield self._build_response()
                 if data.get("final"):
                     break

perplexity_webui_scraper/enums.py CHANGED Viewed

@@ -1,6 +1,4 @@
-"""
-Enums for Perplexity WebUI Scraper configuration options.
-"""
+"""Enums for configuration options."""
 from __future__ import annotations
@@ -8,140 +6,80 @@ from enum import Enum
 class CitationMode(str, Enum):
-    """
-    Citation formatting modes for response text.
-    Controls how citation markers (e.g., [1], [2]) are formatted in the response.
-    """
+    """Citation formatting modes for response text."""
     DEFAULT = "default"
-    """
-    Keep original Perplexity citation format (e.g., 'This is a citation[1]').
-    """
+    """Keep original format (e.g., 'text[1]')."""
     MARKDOWN = "markdown"
-    """
-    Convert citations to markdown links (e.g., 'This is a citation[1](https://example.com)').
-    """
+    """Convert to markdown links (e.g., 'text[1](url)')."""
     CLEAN = "clean"
-    """
-    Remove all citation markers (e.g., 'This is a citation').
-    """
+    """Remove all citation markers."""
 class SearchFocus(str, Enum):
-    """
-    Search focus types that control the type of search performed.
-    Determines whether to search the web or focus on writing tasks.
-    """
+    """Search focus types."""
     WEB = "internet"
-    """
-    Search the web for information. Best for factual queries and research.
-    """
+    """Search the web for information."""
     WRITING = "writing"
-    """
-    Focus on writing tasks. Best for creative writing, editing, and text generation.
-    """
+    """Focus on writing tasks."""
 class SourceFocus(str, Enum):
-    """
-    Source focus types that control which sources to prioritize.
-    Can be combined (e.g., [SourceFocus.WEB, SourceFocus.ACADEMIC]) for multi-source searches.
-    """
+    """Source focus types for search prioritization."""
     WEB = "web"
-    """
-    Search across the entire internet. General web search.
-    """
+    """General web search."""
     ACADEMIC = "scholar"
-    """
-    Search academic papers and scholarly articles (Google Scholar, etc.).
-    """
+    """Academic papers and scholarly articles."""
     SOCIAL = "social"
-    """
-    Search social media for discussions and opinions (Reddit, Twitter, etc.).
-    """
+    """Social media (Reddit, Twitter, etc.)."""
     FINANCE = "edgar"
-    """
-    Search SEC EDGAR filings for financial and corporate documents.
-    """
+    """SEC EDGAR filings."""
 class TimeRange(str, Enum):
-    """
-    Time range filters for search results.
-    Controls how recent the sources should be.
-    """
+    """Time range filters for search results."""
     ALL = ""
-    """
-    Include sources from all time. No time restriction.
-    """
+    """No time restriction."""
     TODAY = "DAY"
-    """
-    Include only sources from today (last 24 hours).
-    """
+    """Last 24 hours."""
     LAST_WEEK = "WEEK"
-    """
-    Include sources from the last 7 days.
-    """
+    """Last 7 days."""
     LAST_MONTH = "MONTH"
-    """
-    Include sources from the last 30 days.
-    """
+    """Last 30 days."""
     LAST_YEAR = "YEAR"
-    """
-    Include sources from the last 365 days.
-    """
+    """Last 365 days."""
 class LogLevel(str, Enum):
-    """
-    Logging level configuration.
-    Controls the verbosity of logging output. DISABLED is the default.
-    """
+    """Logging level configuration."""
     DISABLED = "DISABLED"
-    """
-    Completely disable all logging output. This is the default.
-    """
+    """Disable all logging (default)."""
     DEBUG = "DEBUG"
-    """
-    Show all messages including internal debug information.
-    """
+    """Show all messages including debug info."""
     INFO = "INFO"
-    """
-    Show informational messages, warnings, and errors.
-    """
+    """Show info, warnings, and errors."""
     WARNING = "WARNING"
-    """
-    Show only warnings and errors.
-    """
+    """Show warnings and errors only."""
     ERROR = "ERROR"
-    """
-    Show only error messages.
-    """
+    """Show errors only."""
     CRITICAL = "CRITICAL"
-    """
-    Show only critical/fatal errors.
-    """
+    """Show critical/fatal errors only."""

perplexity-webui-scraper 0.3.7__py3-none-any.whl → 0.4.0__py3-none-any.whl

perplexity-webui-scraper 0.3.7py3-none-any.whl → 0.4.0py3-none-any.whl