perplexity-webui-scraper 0.3.7__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perplexity_webui_scraper/__init__.py +24 -3
- perplexity_webui_scraper/cli/get_perplexity_session_token.py +21 -53
- perplexity_webui_scraper/config.py +12 -29
- perplexity_webui_scraper/constants.py +13 -51
- perplexity_webui_scraper/core.py +19 -155
- perplexity_webui_scraper/enums.py +26 -88
- perplexity_webui_scraper/exceptions.py +29 -50
- perplexity_webui_scraper/http.py +39 -332
- perplexity_webui_scraper/limits.py +6 -16
- perplexity_webui_scraper/logging.py +23 -180
- perplexity_webui_scraper/mcp/__init__.py +2 -8
- perplexity_webui_scraper/mcp/__main__.py +1 -3
- perplexity_webui_scraper/mcp/server.py +105 -82
- perplexity_webui_scraper/models.py +27 -71
- perplexity_webui_scraper/resilience.py +17 -100
- perplexity_webui_scraper/types.py +18 -25
- {perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.1.dist-info}/METADATA +121 -102
- perplexity_webui_scraper-0.4.1.dist-info/RECORD +21 -0
- {perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.1.dist-info}/WHEEL +1 -1
- perplexity_webui_scraper-0.3.7.dist-info/RECORD +0 -21
- {perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.1.dist-info}/entry_points.txt +0 -0
|
@@ -1,30 +1,51 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Extract AI responses from Perplexity's web interface.
|
|
3
|
-
"""
|
|
1
|
+
"""Extract AI responses from Perplexity's web interface."""
|
|
4
2
|
|
|
5
3
|
from importlib import metadata
|
|
6
4
|
|
|
7
5
|
from .config import ClientConfig, ConversationConfig
|
|
8
6
|
from .core import Conversation, Perplexity
|
|
9
7
|
from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
|
|
8
|
+
from .exceptions import (
|
|
9
|
+
AuthenticationError,
|
|
10
|
+
FileUploadError,
|
|
11
|
+
FileValidationError,
|
|
12
|
+
HTTPError,
|
|
13
|
+
PerplexityError,
|
|
14
|
+
RateLimitError,
|
|
15
|
+
ResearchClarifyingQuestionsError,
|
|
16
|
+
ResponseParsingError,
|
|
17
|
+
StreamingError,
|
|
18
|
+
)
|
|
10
19
|
from .models import Model, Models
|
|
11
20
|
from .types import Coordinates, Response, SearchResultItem
|
|
12
21
|
|
|
13
22
|
|
|
23
|
+
ConversationConfig.model_rebuild()
|
|
24
|
+
|
|
25
|
+
|
|
14
26
|
__version__: str = metadata.version("perplexity-webui-scraper")
|
|
15
27
|
__all__: list[str] = [
|
|
28
|
+
"AuthenticationError",
|
|
16
29
|
"CitationMode",
|
|
17
30
|
"ClientConfig",
|
|
18
31
|
"Conversation",
|
|
19
32
|
"ConversationConfig",
|
|
20
33
|
"Coordinates",
|
|
34
|
+
"FileUploadError",
|
|
35
|
+
"FileValidationError",
|
|
36
|
+
"HTTPError",
|
|
21
37
|
"LogLevel",
|
|
22
38
|
"Model",
|
|
23
39
|
"Models",
|
|
24
40
|
"Perplexity",
|
|
41
|
+
"PerplexityError",
|
|
42
|
+
"RateLimitError",
|
|
43
|
+
"ResearchClarifyingQuestionsError",
|
|
25
44
|
"Response",
|
|
45
|
+
"ResponseParsingError",
|
|
26
46
|
"SearchFocus",
|
|
27
47
|
"SearchResultItem",
|
|
28
48
|
"SourceFocus",
|
|
49
|
+
"StreamingError",
|
|
29
50
|
"TimeRange",
|
|
30
51
|
]
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
"""
|
|
2
|
-
CLI utility for secure Perplexity authentication and session extraction.
|
|
3
|
-
"""
|
|
1
|
+
"""CLI utility for secure Perplexity authentication and session extraction."""
|
|
4
2
|
|
|
5
3
|
from __future__ import annotations
|
|
6
4
|
|
|
@@ -9,26 +7,20 @@ from sys import exit
|
|
|
9
7
|
from typing import NoReturn
|
|
10
8
|
|
|
11
9
|
from curl_cffi.requests import Session
|
|
10
|
+
from orjson import loads
|
|
12
11
|
from rich.console import Console
|
|
13
12
|
from rich.panel import Panel
|
|
14
13
|
from rich.prompt import Confirm, Prompt
|
|
15
14
|
|
|
16
15
|
|
|
17
|
-
# Constants
|
|
18
16
|
BASE_URL: str = "https://www.perplexity.ai"
|
|
19
17
|
ENV_KEY: str = "PERPLEXITY_SESSION_TOKEN"
|
|
20
18
|
|
|
21
|
-
|
|
22
|
-
# Initialize console on stderr to ensure secure alternate screen usage
|
|
23
19
|
console = Console(stderr=True, soft_wrap=True)
|
|
24
20
|
|
|
25
21
|
|
|
26
22
|
def update_env(token: str) -> bool:
|
|
27
|
-
"""
|
|
28
|
-
Securely updates the .env file with the session token.
|
|
29
|
-
|
|
30
|
-
Preserves existing content and comments.
|
|
31
|
-
"""
|
|
23
|
+
"""Securely updates the .env file with the session token."""
|
|
32
24
|
|
|
33
25
|
path = Path(".env")
|
|
34
26
|
line_entry = f'{ENV_KEY}="{token}"'
|
|
@@ -48,26 +40,23 @@ def update_env(token: str) -> bool:
|
|
|
48
40
|
if not updated:
|
|
49
41
|
if new_lines and new_lines[-1] != "":
|
|
50
42
|
new_lines.append("")
|
|
51
|
-
|
|
52
43
|
new_lines.append(line_entry)
|
|
53
44
|
|
|
54
45
|
path.write_text("\n".join(new_lines) + "\n", encoding="utf-8")
|
|
55
|
-
|
|
56
46
|
return True
|
|
47
|
+
|
|
57
48
|
except Exception:
|
|
58
49
|
return False
|
|
59
50
|
|
|
60
51
|
|
|
61
52
|
def _initialize_session() -> tuple[Session, str]:
|
|
62
|
-
"""
|
|
63
|
-
Initialize session and obtain CSRF token.
|
|
64
|
-
"""
|
|
53
|
+
"""Initialize session and obtain CSRF token."""
|
|
65
54
|
|
|
66
55
|
session = Session(impersonate="chrome", headers={"Referer": BASE_URL, "Origin": BASE_URL})
|
|
67
56
|
|
|
68
57
|
with console.status("[bold green]Initializing secure connection...", spinner="dots"):
|
|
69
58
|
session.get(BASE_URL)
|
|
70
|
-
csrf_data = session.get(f"{BASE_URL}/api/auth/csrf").
|
|
59
|
+
csrf_data = loads(session.get(f"{BASE_URL}/api/auth/csrf").content)
|
|
71
60
|
csrf = csrf_data.get("csrfToken")
|
|
72
61
|
|
|
73
62
|
if not csrf:
|
|
@@ -77,12 +66,10 @@ def _initialize_session() -> tuple[Session, str]:
|
|
|
77
66
|
|
|
78
67
|
|
|
79
68
|
def _request_verification_code(session: Session, csrf: str, email: str) -> None:
|
|
80
|
-
"""
|
|
81
|
-
Send verification code to user's email.
|
|
82
|
-
"""
|
|
69
|
+
"""Send verification code to user's email."""
|
|
83
70
|
|
|
84
71
|
with console.status("[bold green]Sending verification code...", spinner="dots"):
|
|
85
|
-
|
|
72
|
+
response = session.post(
|
|
86
73
|
f"{BASE_URL}/api/auth/signin/email?version=2.18&source=default",
|
|
87
74
|
json={
|
|
88
75
|
"email": email,
|
|
@@ -93,20 +80,18 @@ def _request_verification_code(session: Session, csrf: str, email: str) -> None:
|
|
|
93
80
|
},
|
|
94
81
|
)
|
|
95
82
|
|
|
96
|
-
if
|
|
97
|
-
raise ValueError(f"Authentication request failed: {
|
|
83
|
+
if response.status_code != 200:
|
|
84
|
+
raise ValueError(f"Authentication request failed: {response.text}")
|
|
98
85
|
|
|
99
86
|
|
|
100
87
|
def _validate_and_get_redirect_url(session: Session, email: str, user_input: str) -> str:
|
|
101
|
-
"""
|
|
102
|
-
Validate user input (OTP or magic link) and return redirect URL.
|
|
103
|
-
"""
|
|
88
|
+
"""Validate user input (OTP or magic link) and return redirect URL."""
|
|
104
89
|
|
|
105
90
|
with console.status("[bold green]Validating...", spinner="dots"):
|
|
106
91
|
if user_input.startswith("http"):
|
|
107
92
|
return user_input
|
|
108
93
|
|
|
109
|
-
|
|
94
|
+
response_otp = session.post(
|
|
110
95
|
f"{BASE_URL}/api/auth/otp-redirect-link",
|
|
111
96
|
json={
|
|
112
97
|
"email": email,
|
|
@@ -116,10 +101,10 @@ def _validate_and_get_redirect_url(session: Session, email: str, user_input: str
|
|
|
116
101
|
},
|
|
117
102
|
)
|
|
118
103
|
|
|
119
|
-
if
|
|
104
|
+
if response_otp.status_code != 200:
|
|
120
105
|
raise ValueError("Invalid verification code.")
|
|
121
106
|
|
|
122
|
-
redirect_path =
|
|
107
|
+
redirect_path = loads(response_otp.content).get("redirect")
|
|
123
108
|
|
|
124
109
|
if not redirect_path:
|
|
125
110
|
raise ValueError("No redirect URL received.")
|
|
@@ -128,9 +113,7 @@ def _validate_and_get_redirect_url(session: Session, email: str, user_input: str
|
|
|
128
113
|
|
|
129
114
|
|
|
130
115
|
def _extract_session_token(session: Session, redirect_url: str) -> str:
|
|
131
|
-
"""
|
|
132
|
-
Extract session token from cookies after authentication.
|
|
133
|
-
"""
|
|
116
|
+
"""Extract session token from cookies after authentication."""
|
|
134
117
|
|
|
135
118
|
session.get(redirect_url)
|
|
136
119
|
token = session.cookies.get("__Secure-next-auth.session-token")
|
|
@@ -142,9 +125,7 @@ def _extract_session_token(session: Session, redirect_url: str) -> str:
|
|
|
142
125
|
|
|
143
126
|
|
|
144
127
|
def _display_and_save_token(token: str) -> None:
|
|
145
|
-
"""
|
|
146
|
-
Display token and optionally save to .env file.
|
|
147
|
-
"""
|
|
128
|
+
"""Display token and optionally save to .env file."""
|
|
148
129
|
|
|
149
130
|
console.print("\n[bold green]✅ Token generated successfully![/bold green]")
|
|
150
131
|
console.print(f"\n[bold white]Your session token:[/bold white]\n[green]{token}[/green]\n")
|
|
@@ -159,9 +140,7 @@ def _display_and_save_token(token: str) -> None:
|
|
|
159
140
|
|
|
160
141
|
|
|
161
142
|
def _show_header() -> None:
|
|
162
|
-
"""
|
|
163
|
-
Display welcome header.
|
|
164
|
-
"""
|
|
143
|
+
"""Display welcome header."""
|
|
165
144
|
|
|
166
145
|
console.print(
|
|
167
146
|
Panel(
|
|
@@ -175,9 +154,7 @@ def _show_header() -> None:
|
|
|
175
154
|
|
|
176
155
|
|
|
177
156
|
def _show_exit_message() -> None:
|
|
178
|
-
"""
|
|
179
|
-
Display security note and wait for user to exit.
|
|
180
|
-
"""
|
|
157
|
+
"""Display security note and wait for user to exit."""
|
|
181
158
|
|
|
182
159
|
console.print("\n[bold yellow]⚠️ Security Note:[/bold yellow]")
|
|
183
160
|
console.print("Press [bold white]ENTER[/bold white] to clear screen and exit.")
|
|
@@ -185,46 +162,37 @@ def _show_exit_message() -> None:
|
|
|
185
162
|
|
|
186
163
|
|
|
187
164
|
def get_token() -> NoReturn:
|
|
188
|
-
"""
|
|
189
|
-
Executes the authentication flow within an ephemeral terminal screen.
|
|
190
|
-
|
|
191
|
-
Handles CSRF, Email OTP/Link validation, and secure token display.
|
|
192
|
-
"""
|
|
165
|
+
"""Executes the authentication flow within an ephemeral terminal screen."""
|
|
193
166
|
|
|
194
167
|
with console.screen():
|
|
195
168
|
try:
|
|
196
169
|
_show_header()
|
|
197
170
|
|
|
198
|
-
# Step 1: Initialize session and get CSRF token
|
|
199
171
|
session, csrf = _initialize_session()
|
|
200
172
|
|
|
201
|
-
# Step 2: Get email and request verification code
|
|
202
173
|
console.print("\n[bold cyan]Step 1: Email Verification[/bold cyan]")
|
|
203
174
|
email = Prompt.ask(" Enter your Perplexity email", console=console)
|
|
204
175
|
_request_verification_code(session, csrf, email)
|
|
205
176
|
|
|
206
|
-
# Step 3: Get and validate user input (OTP or magic link)
|
|
207
177
|
console.print("\n[bold cyan]Step 2: Verification[/bold cyan]")
|
|
208
178
|
console.print(" Check your email for a [bold]6-digit code[/bold] or [bold]magic link[/bold].")
|
|
209
179
|
user_input = Prompt.ask(" Enter code or paste link", console=console).strip()
|
|
210
180
|
redirect_url = _validate_and_get_redirect_url(session, email, user_input)
|
|
211
181
|
|
|
212
|
-
# Step 4: Extract session token
|
|
213
182
|
token = _extract_session_token(session, redirect_url)
|
|
214
183
|
|
|
215
|
-
# Step 5: Display and optionally save token
|
|
216
184
|
_display_and_save_token(token)
|
|
217
185
|
|
|
218
|
-
# Step 6: Exit
|
|
219
186
|
_show_exit_message()
|
|
220
187
|
|
|
221
188
|
exit(0)
|
|
189
|
+
|
|
222
190
|
except KeyboardInterrupt:
|
|
223
191
|
exit(0)
|
|
192
|
+
|
|
224
193
|
except Exception as error:
|
|
225
194
|
console.print(f"\n[bold red]⛔ Error:[/bold red] {error}")
|
|
226
195
|
console.input("[dim]Press ENTER to exit...[/dim]")
|
|
227
|
-
|
|
228
196
|
exit(1)
|
|
229
197
|
|
|
230
198
|
|
|
@@ -1,27 +1,24 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Configuration classes.
|
|
3
|
-
"""
|
|
1
|
+
"""Configuration classes."""
|
|
4
2
|
|
|
5
3
|
from __future__ import annotations
|
|
6
4
|
|
|
7
|
-
from
|
|
5
|
+
from os import PathLike # noqa: TC003
|
|
8
6
|
from typing import TYPE_CHECKING
|
|
9
7
|
|
|
8
|
+
from pydantic import BaseModel, ConfigDict
|
|
9
|
+
|
|
10
10
|
from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
|
|
11
11
|
|
|
12
12
|
|
|
13
13
|
if TYPE_CHECKING:
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
|
|
16
14
|
from .models import Model
|
|
17
15
|
from .types import Coordinates
|
|
18
16
|
|
|
19
17
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
"""
|
|
18
|
+
class ConversationConfig(BaseModel):
|
|
19
|
+
"""Default settings for a conversation."""
|
|
20
|
+
|
|
21
|
+
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
25
22
|
|
|
26
23
|
model: Model | None = None
|
|
27
24
|
citation_mode: CitationMode = CitationMode.CLEAN
|
|
@@ -34,24 +31,10 @@ class ConversationConfig:
|
|
|
34
31
|
coordinates: Coordinates | None = None
|
|
35
32
|
|
|
36
33
|
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
"""
|
|
40
|
-
HTTP client settings.
|
|
34
|
+
class ClientConfig(BaseModel):
|
|
35
|
+
"""HTTP client settings."""
|
|
41
36
|
|
|
42
|
-
|
|
43
|
-
timeout: Request timeout in seconds.
|
|
44
|
-
impersonate: Browser to impersonate (e.g., "chrome", "edge", "safari").
|
|
45
|
-
max_retries: Maximum retry attempts for failed requests.
|
|
46
|
-
retry_base_delay: Initial delay in seconds before first retry.
|
|
47
|
-
retry_max_delay: Maximum delay between retries.
|
|
48
|
-
retry_jitter: Random jitter factor (0-1) to add to delays.
|
|
49
|
-
requests_per_second: Rate limit for requests (0 to disable).
|
|
50
|
-
rotate_fingerprint: Whether to rotate browser fingerprint on retries.
|
|
51
|
-
logging_level: Logging verbosity level. Default is DISABLED.
|
|
52
|
-
log_file: Optional file path for persistent logging. If set, logs go to file only.
|
|
53
|
-
If None, logs go to console. All logs are appended.
|
|
54
|
-
"""
|
|
37
|
+
model_config = ConfigDict(frozen=True)
|
|
55
38
|
|
|
56
39
|
timeout: int = 3600
|
|
57
40
|
impersonate: str = "chrome"
|
|
@@ -62,4 +45,4 @@ class ClientConfig:
|
|
|
62
45
|
requests_per_second: float = 0.5
|
|
63
46
|
rotate_fingerprint: bool = True
|
|
64
47
|
logging_level: LogLevel = LogLevel.DISABLED
|
|
65
|
-
log_file: str |
|
|
48
|
+
log_file: str | PathLike[str] | None = None
|
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Constants and values for the Perplexity internal API and HTTP interactions.
|
|
3
|
-
"""
|
|
1
|
+
"""Constants and values for the Perplexity internal API."""
|
|
4
2
|
|
|
5
3
|
from __future__ import annotations
|
|
6
4
|
|
|
@@ -8,77 +6,41 @@ from re import Pattern, compile
|
|
|
8
6
|
from typing import Final
|
|
9
7
|
|
|
10
8
|
|
|
11
|
-
# API Configuration
|
|
12
9
|
API_VERSION: Final[str] = "2.18"
|
|
13
|
-
"""
|
|
14
|
-
Current API version used by Perplexity WebUI.
|
|
15
|
-
"""
|
|
10
|
+
"""Current API version used by Perplexity WebUI."""
|
|
16
11
|
|
|
17
12
|
API_BASE_URL: Final[str] = "https://www.perplexity.ai"
|
|
18
|
-
"""
|
|
19
|
-
Base URL for all API requests.
|
|
20
|
-
"""
|
|
13
|
+
"""Base URL for all API requests."""
|
|
21
14
|
|
|
22
|
-
# API Endpoints
|
|
23
15
|
ENDPOINT_ASK: Final[str] = "/rest/sse/perplexity_ask"
|
|
24
|
-
"""
|
|
25
|
-
SSE endpoint for sending prompts.
|
|
26
|
-
"""
|
|
16
|
+
"""SSE endpoint for sending prompts."""
|
|
27
17
|
|
|
28
18
|
ENDPOINT_SEARCH_INIT: Final[str] = "/search/new"
|
|
29
|
-
"""
|
|
30
|
-
Endpoint to initialize a search session.
|
|
31
|
-
"""
|
|
19
|
+
"""Endpoint to initialize a search session."""
|
|
32
20
|
|
|
33
21
|
ENDPOINT_UPLOAD: Final[str] = "/rest/uploads/batch_create_upload_urls"
|
|
34
|
-
"""
|
|
35
|
-
Endpoint for file upload URL generation.
|
|
36
|
-
"""
|
|
22
|
+
"""Endpoint for file upload URL generation."""
|
|
37
23
|
|
|
38
|
-
# API Fixed Parameters
|
|
39
24
|
SEND_BACK_TEXT: Final[bool] = True
|
|
40
|
-
"""
|
|
41
|
-
Whether to receive full text in each streaming chunk.
|
|
42
|
-
|
|
43
|
-
True = API sends complete text each chunk (replace mode).
|
|
44
|
-
False = API sends delta chunks only (accumulate mode).
|
|
45
|
-
"""
|
|
25
|
+
"""Whether to receive full text in each streaming chunk (replace mode)."""
|
|
46
26
|
|
|
47
27
|
USE_SCHEMATIZED_API: Final[bool] = False
|
|
48
|
-
"""
|
|
49
|
-
Whether to use the schematized API format.
|
|
50
|
-
"""
|
|
28
|
+
"""Whether to use the schematized API format."""
|
|
51
29
|
|
|
52
30
|
PROMPT_SOURCE: Final[str] = "user"
|
|
53
|
-
"""
|
|
54
|
-
Source identifier for prompts.
|
|
55
|
-
"""
|
|
31
|
+
"""Source identifier for prompts."""
|
|
56
32
|
|
|
57
|
-
# Regex Patterns (Pre-compiled for performance in streaming parsing)
|
|
58
33
|
CITATION_PATTERN: Final[Pattern[str]] = compile(r"\[(\d{1,2})\]")
|
|
59
|
-
"""
|
|
60
|
-
Regex pattern for matching citation markers like [1], [2], etc.
|
|
61
|
-
|
|
62
|
-
Uses word boundary to avoid matching things like [123].
|
|
63
|
-
"""
|
|
34
|
+
"""Regex pattern for matching citation markers like [1], [2]."""
|
|
64
35
|
|
|
65
36
|
JSON_OBJECT_PATTERN: Final[Pattern[str]] = compile(r"^\{.*\}$")
|
|
66
|
-
"""
|
|
67
|
-
Pattern to detect JSON object strings.
|
|
68
|
-
"""
|
|
37
|
+
"""Pattern to detect JSON object strings."""
|
|
69
38
|
|
|
70
|
-
# HTTP Headers
|
|
71
39
|
DEFAULT_HEADERS: Final[dict[str, str]] = {
|
|
72
40
|
"Accept": "text/event-stream, application/json",
|
|
73
41
|
"Content-Type": "application/json",
|
|
74
42
|
}
|
|
75
|
-
"""
|
|
76
|
-
Default HTTP headers for API requests.
|
|
77
|
-
|
|
78
|
-
Referer and Origin are added dynamically based on BASE_URL.
|
|
79
|
-
"""
|
|
43
|
+
"""Default HTTP headers for API requests."""
|
|
80
44
|
|
|
81
45
|
SESSION_COOKIE_NAME: Final[str] = "__Secure-next-auth.session-token"
|
|
82
|
-
"""
|
|
83
|
-
Name of the session cookie used for authentication.
|
|
84
|
-
"""
|
|
46
|
+
"""Name of the session cookie used for authentication."""
|