perplexity-webui-scraper 0.3.7__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perplexity_webui_scraper/__init__.py +24 -3
- perplexity_webui_scraper/cli/get_perplexity_session_token.py +21 -53
- perplexity_webui_scraper/config.py +12 -29
- perplexity_webui_scraper/constants.py +13 -51
- perplexity_webui_scraper/core.py +18 -154
- perplexity_webui_scraper/enums.py +26 -88
- perplexity_webui_scraper/exceptions.py +29 -50
- perplexity_webui_scraper/http.py +39 -332
- perplexity_webui_scraper/limits.py +6 -16
- perplexity_webui_scraper/logging.py +23 -180
- perplexity_webui_scraper/mcp/__init__.py +2 -8
- perplexity_webui_scraper/mcp/__main__.py +1 -3
- perplexity_webui_scraper/mcp/server.py +105 -82
- perplexity_webui_scraper/models.py +27 -71
- perplexity_webui_scraper/resilience.py +17 -100
- perplexity_webui_scraper/types.py +18 -25
- {perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.0.dist-info}/METADATA +120 -101
- perplexity_webui_scraper-0.4.0.dist-info/RECORD +21 -0
- {perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.0.dist-info}/WHEEL +1 -1
- perplexity_webui_scraper-0.3.7.dist-info/RECORD +0 -21
- {perplexity_webui_scraper-0.3.7.dist-info → perplexity_webui_scraper-0.4.0.dist-info}/entry_points.txt +0 -0
|
@@ -1,6 +1,4 @@
|
|
|
1
|
-
"""
|
|
2
|
-
AI model definitions for Perplexity WebUI Scraper.
|
|
3
|
-
"""
|
|
1
|
+
"""AI model definitions."""
|
|
4
2
|
|
|
5
3
|
from __future__ import annotations
|
|
6
4
|
|
|
@@ -9,101 +7,59 @@ from dataclasses import dataclass
|
|
|
9
7
|
|
|
10
8
|
@dataclass(frozen=True, slots=True)
|
|
11
9
|
class Model:
|
|
12
|
-
"""
|
|
13
|
-
AI model configuration.
|
|
14
|
-
|
|
15
|
-
Attributes:
|
|
16
|
-
identifier: Model identifier used by the API.
|
|
17
|
-
mode: Model execution mode. Default: "copilot".
|
|
18
|
-
"""
|
|
10
|
+
"""AI model configuration."""
|
|
19
11
|
|
|
20
12
|
identifier: str
|
|
21
13
|
mode: str = "copilot"
|
|
22
14
|
|
|
23
15
|
|
|
24
16
|
class Models:
|
|
25
|
-
"""
|
|
26
|
-
Available AI models with their configurations.
|
|
27
|
-
|
|
28
|
-
All models use the "copilot" mode which enables web search.
|
|
29
|
-
"""
|
|
17
|
+
"""Available AI models (all use copilot mode with web search)."""
|
|
30
18
|
|
|
31
|
-
|
|
32
|
-
"""
|
|
33
|
-
Research - Fast and thorough for routine research.
|
|
34
|
-
"""
|
|
19
|
+
DEEP_RESEARCH = Model(identifier="pplx_alpha")
|
|
20
|
+
"""Deep Research - Create in-depth reports with more sources, charts, and advanced reasoning."""
|
|
35
21
|
|
|
36
|
-
|
|
37
|
-
"""
|
|
38
|
-
Labs - Multi-step tasks with advanced troubleshooting.
|
|
39
|
-
"""
|
|
22
|
+
CREATE_FILES_AND_APPS = Model(identifier="pplx_beta")
|
|
23
|
+
"""Create files and apps (previously known as Labs) - Turn your ideas into docs, slides, dashboards, and more."""
|
|
40
24
|
|
|
41
|
-
BEST = Model(identifier="
|
|
42
|
-
"""
|
|
43
|
-
Best - Automatically selects the most responsive model based on the query.
|
|
44
|
-
"""
|
|
25
|
+
BEST = Model(identifier="pplx_pro")
|
|
26
|
+
"""Best - Automatically selects the best model based on the query."""
|
|
45
27
|
|
|
46
28
|
SONAR = Model(identifier="experimental")
|
|
47
|
-
"""
|
|
48
|
-
Sonar - Perplexity's fast model.
|
|
49
|
-
"""
|
|
29
|
+
"""Sonar - Perplexity's latest model."""
|
|
50
30
|
|
|
51
31
|
GEMINI_3_FLASH = Model(identifier="gemini30flash")
|
|
52
|
-
"""
|
|
53
|
-
Gemini 3 Flash - Google's fast reasoning model.
|
|
54
|
-
"""
|
|
32
|
+
"""Gemini 3 Flash - Google's fast model."""
|
|
55
33
|
|
|
56
34
|
GEMINI_3_FLASH_THINKING = Model(identifier="gemini30flash_high")
|
|
57
|
-
"""
|
|
58
|
-
Gemini 3 Flash Thinking - Google's fast reasoning model with enhanced thinking.
|
|
59
|
-
"""
|
|
35
|
+
"""Gemini 3 Flash Thinking - Google's fast model (thinking)."""
|
|
60
36
|
|
|
61
|
-
|
|
62
|
-
"""
|
|
63
|
-
Gemini 3 Pro - Google's newest reasoning model.
|
|
64
|
-
"""
|
|
37
|
+
GEMINI_3_PRO_THINKING = Model(identifier="gemini30pro")
|
|
38
|
+
"""Gemini 3 Pro Thinking - Google's most advanced model (thinking)."""
|
|
65
39
|
|
|
66
40
|
GPT_52 = Model(identifier="gpt52")
|
|
67
|
-
"""
|
|
68
|
-
GPT-5.2 - OpenAI's latest model.
|
|
69
|
-
"""
|
|
41
|
+
"""GPT-5.2 - OpenAI's latest model."""
|
|
70
42
|
|
|
71
43
|
GPT_52_THINKING = Model(identifier="gpt52_thinking")
|
|
72
|
-
"""
|
|
73
|
-
GPT-5.2 Thinking - OpenAI's latest model with thinking.
|
|
74
|
-
"""
|
|
44
|
+
"""GPT-5.2 Thinking - OpenAI's latest model (thinking)."""
|
|
75
45
|
|
|
76
46
|
CLAUDE_45_SONNET = Model(identifier="claude45sonnet")
|
|
77
|
-
"""
|
|
78
|
-
Claude Sonnet 4.5 - Anthropic's newest advanced model.
|
|
79
|
-
"""
|
|
47
|
+
"""Claude Sonnet 4.5 - Anthropic's fast model."""
|
|
80
48
|
|
|
81
49
|
CLAUDE_45_SONNET_THINKING = Model(identifier="claude45sonnetthinking")
|
|
82
|
-
"""
|
|
83
|
-
Claude Sonnet 4.5 Thinking - Anthropic's newest reasoning model.
|
|
84
|
-
"""
|
|
50
|
+
"""Claude Sonnet 4.5 Thinking - Anthropic's fast model (thinking)."""
|
|
85
51
|
|
|
86
|
-
CLAUDE_45_OPUS = Model(identifier="claude45opus")
|
|
87
|
-
"""
|
|
88
|
-
Claude Opus 4.5 - Anthropic's Opus reasoning model.
|
|
89
|
-
"""
|
|
52
|
+
CLAUDE_45_OPUS = Model(identifier="claude45opus") # TODO: check correct identifier
|
|
53
|
+
"""Claude Opus 4.5 - Anthropic's Opus reasoning model."""
|
|
90
54
|
|
|
91
|
-
CLAUDE_45_OPUS_THINKING = Model(identifier="claude45opusthinking")
|
|
92
|
-
"""
|
|
93
|
-
Claude Opus 4.5 Thinking - Anthropic's Opus reasoning model with thinking.
|
|
94
|
-
"""
|
|
55
|
+
CLAUDE_45_OPUS_THINKING = Model(identifier="claude45opusthinking") # TODO: check correct identifier
|
|
56
|
+
"""Claude Opus 4.5 Thinking - Anthropic's Opus reasoning model (thinking)."""
|
|
95
57
|
|
|
96
58
|
GROK_41 = Model(identifier="grok41nonreasoning")
|
|
97
|
-
"""
|
|
98
|
-
Grok 4.1 - xAI's latest advanced model.
|
|
99
|
-
"""
|
|
59
|
+
"""Grok 4.1 - xAI's latest model."""
|
|
100
60
|
|
|
101
61
|
GROK_41_THINKING = Model(identifier="grok41reasoning")
|
|
102
|
-
"""
|
|
103
|
-
|
|
104
|
-
""
|
|
105
|
-
|
|
106
|
-
KIMI_K2_THINKING = Model(identifier="kimik2thinking")
|
|
107
|
-
"""
|
|
108
|
-
Kimi K2 Thinking - Moonshot AI's latest reasoning model.
|
|
109
|
-
"""
|
|
62
|
+
"""Grok 4.1 Thinking - xAI's latest model (thinking)."""
|
|
63
|
+
|
|
64
|
+
KIMI_K25_THINKING = Model(identifier="kimik25thinking")
|
|
65
|
+
"""Kimi K2.5 Thinking - Moonshot AI's latest model."""
|
|
@@ -1,29 +1,24 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Resilience utilities for HTTP requests.
|
|
3
|
-
|
|
4
|
-
Provides retry mechanisms, rate limiting, and Cloudflare bypass utilities
|
|
5
|
-
using the tenacity library for robust retry handling.
|
|
6
|
-
"""
|
|
1
|
+
"""Resilience utilities for HTTP requests."""
|
|
7
2
|
|
|
8
3
|
from __future__ import annotations
|
|
9
4
|
|
|
10
|
-
from collections.abc import Callable
|
|
11
5
|
from dataclasses import dataclass, field
|
|
12
|
-
import
|
|
6
|
+
from random import choice
|
|
13
7
|
from threading import Lock
|
|
14
|
-
import
|
|
15
|
-
from typing import TYPE_CHECKING,
|
|
8
|
+
from time import monotonic, sleep
|
|
9
|
+
from typing import TYPE_CHECKING, TypeVar
|
|
16
10
|
|
|
17
|
-
from tenacity import
|
|
11
|
+
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential_jitter
|
|
18
12
|
|
|
19
13
|
|
|
20
14
|
if TYPE_CHECKING:
|
|
21
15
|
from collections.abc import Callable
|
|
22
16
|
|
|
23
|
-
|
|
17
|
+
from tenacity import RetryCallState
|
|
24
18
|
|
|
25
19
|
|
|
26
|
-
|
|
20
|
+
T = TypeVar("T")
|
|
21
|
+
|
|
27
22
|
BROWSER_PROFILES: tuple[str, ...] = (
|
|
28
23
|
"chrome",
|
|
29
24
|
"chrome110",
|
|
@@ -41,29 +36,10 @@ BROWSER_PROFILES: tuple[str, ...] = (
|
|
|
41
36
|
"safari17_2_ios",
|
|
42
37
|
)
|
|
43
38
|
|
|
44
|
-
# Cloudflare challenge detection markers
|
|
45
|
-
CLOUDFLARE_MARKERS: tuple[str, ...] = (
|
|
46
|
-
"cf-ray",
|
|
47
|
-
"cf-mitigated",
|
|
48
|
-
"__cf_chl_",
|
|
49
|
-
"Checking your browser",
|
|
50
|
-
"Just a moment...",
|
|
51
|
-
"cloudflare",
|
|
52
|
-
"Enable JavaScript and cookies to continue",
|
|
53
|
-
"challenge-platform",
|
|
54
|
-
)
|
|
55
|
-
|
|
56
39
|
|
|
57
40
|
@dataclass(slots=True)
|
|
58
41
|
class RetryConfig:
|
|
59
|
-
"""Configuration for retry behavior.
|
|
60
|
-
|
|
61
|
-
Attributes:
|
|
62
|
-
max_retries: Maximum number of retry attempts.
|
|
63
|
-
base_delay: Initial delay in seconds before first retry.
|
|
64
|
-
max_delay: Maximum delay between retries.
|
|
65
|
-
jitter: Random jitter factor to add to delays (0-1).
|
|
66
|
-
"""
|
|
42
|
+
"""Configuration for retry behavior."""
|
|
67
43
|
|
|
68
44
|
max_retries: int = 3
|
|
69
45
|
base_delay: float = 1.0
|
|
@@ -73,23 +49,17 @@ class RetryConfig:
|
|
|
73
49
|
|
|
74
50
|
@dataclass
|
|
75
51
|
class RateLimiter:
|
|
76
|
-
"""Token bucket rate limiter
|
|
77
|
-
|
|
78
|
-
Attributes:
|
|
79
|
-
requests_per_second: Maximum requests allowed per second.
|
|
80
|
-
"""
|
|
52
|
+
"""Token bucket rate limiter."""
|
|
81
53
|
|
|
82
54
|
requests_per_second: float = 0.5
|
|
83
55
|
_last_request: float = field(default=0.0, init=False)
|
|
84
56
|
_lock: Lock = field(default_factory=Lock, init=False)
|
|
85
57
|
|
|
86
58
|
def acquire(self) -> None:
|
|
87
|
-
"""
|
|
88
|
-
Wait until a request can be made within rate limits.
|
|
89
|
-
"""
|
|
59
|
+
"""Wait until a request can be made within rate limits."""
|
|
90
60
|
|
|
91
61
|
with self._lock:
|
|
92
|
-
now =
|
|
62
|
+
now = monotonic()
|
|
93
63
|
min_interval = 1.0 / self.requests_per_second
|
|
94
64
|
|
|
95
65
|
if self._last_request > 0:
|
|
@@ -97,59 +67,15 @@ class RateLimiter:
|
|
|
97
67
|
wait_time = min_interval - elapsed
|
|
98
68
|
|
|
99
69
|
if wait_time > 0:
|
|
100
|
-
|
|
70
|
+
sleep(wait_time)
|
|
101
71
|
|
|
102
|
-
self._last_request =
|
|
72
|
+
self._last_request = monotonic()
|
|
103
73
|
|
|
104
74
|
|
|
105
75
|
def get_random_browser_profile() -> str:
|
|
106
|
-
"""Get a random browser profile for fingerprint rotation.
|
|
107
|
-
|
|
108
|
-
Returns:
|
|
109
|
-
A browser profile identifier compatible with curl_cffi.
|
|
110
|
-
"""
|
|
111
|
-
|
|
112
|
-
return random.choice(BROWSER_PROFILES)
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
def is_cloudflare_challenge(response_text: str, headers: dict[str, Any] | None = None) -> bool:
|
|
116
|
-
"""Detect if a response is a Cloudflare challenge page.
|
|
117
|
-
|
|
118
|
-
Args:
|
|
119
|
-
response_text: The response body text.
|
|
120
|
-
headers: Optional response headers.
|
|
76
|
+
"""Get a random browser profile for fingerprint rotation."""
|
|
121
77
|
|
|
122
|
-
|
|
123
|
-
True if Cloudflare challenge markers are detected.
|
|
124
|
-
"""
|
|
125
|
-
|
|
126
|
-
text_lower = response_text.lower()
|
|
127
|
-
|
|
128
|
-
for marker in CLOUDFLARE_MARKERS:
|
|
129
|
-
if marker.lower() in text_lower:
|
|
130
|
-
return True
|
|
131
|
-
|
|
132
|
-
if headers:
|
|
133
|
-
for key in headers:
|
|
134
|
-
key_lower = key.lower()
|
|
135
|
-
|
|
136
|
-
if "cf-" in key_lower or "cloudflare" in key_lower:
|
|
137
|
-
return True
|
|
138
|
-
|
|
139
|
-
return False
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
def is_cloudflare_status(status_code: int) -> bool:
|
|
143
|
-
"""Check if status code indicates a potential Cloudflare block.
|
|
144
|
-
|
|
145
|
-
Args:
|
|
146
|
-
status_code: HTTP status code.
|
|
147
|
-
|
|
148
|
-
Returns:
|
|
149
|
-
True if status code is commonly used by Cloudflare challenges.
|
|
150
|
-
"""
|
|
151
|
-
|
|
152
|
-
return status_code in (403, 503, 520, 521, 522, 523, 524, 525, 526)
|
|
78
|
+
return choice(BROWSER_PROFILES)
|
|
153
79
|
|
|
154
80
|
|
|
155
81
|
def create_retry_decorator(
|
|
@@ -157,16 +83,7 @@ def create_retry_decorator(
|
|
|
157
83
|
retryable_exceptions: tuple[type[Exception], ...],
|
|
158
84
|
on_retry: Callable[[RetryCallState], None] | None = None,
|
|
159
85
|
) -> Callable[[Callable[..., T]], Callable[..., T]]:
|
|
160
|
-
"""Create a tenacity retry decorator with the given configuration.
|
|
161
|
-
|
|
162
|
-
Args:
|
|
163
|
-
config: Retry configuration.
|
|
164
|
-
retryable_exceptions: Tuple of exception types to retry on.
|
|
165
|
-
on_retry: Optional callback to execute on each retry.
|
|
166
|
-
|
|
167
|
-
Returns:
|
|
168
|
-
A retry decorator configured with the given settings.
|
|
169
|
-
"""
|
|
86
|
+
"""Create a tenacity retry decorator with the given configuration."""
|
|
170
87
|
|
|
171
88
|
return retry(
|
|
172
89
|
stop=stop_after_attempt(config.max_retries + 1),
|
|
@@ -1,54 +1,47 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Response types and data models.
|
|
3
|
-
"""
|
|
1
|
+
"""Response types and data models."""
|
|
4
2
|
|
|
5
3
|
from __future__ import annotations
|
|
6
4
|
|
|
7
|
-
from dataclasses import dataclass
|
|
5
|
+
from dataclasses import dataclass
|
|
8
6
|
from typing import Any
|
|
9
7
|
|
|
8
|
+
from pydantic import BaseModel, ConfigDict
|
|
10
9
|
|
|
11
|
-
|
|
12
|
-
class Coordinates:
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
|
|
10
|
+
|
|
11
|
+
class Coordinates(BaseModel):
|
|
12
|
+
"""Geographic coordinates (lat/lng)."""
|
|
13
|
+
|
|
14
|
+
model_config = ConfigDict(frozen=True)
|
|
16
15
|
|
|
17
16
|
latitude: float
|
|
18
17
|
longitude: float
|
|
19
18
|
|
|
20
19
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
"""
|
|
20
|
+
class SearchResultItem(BaseModel):
|
|
21
|
+
"""A single search result."""
|
|
22
|
+
|
|
23
|
+
model_config = ConfigDict(frozen=True)
|
|
26
24
|
|
|
27
25
|
title: str | None = None
|
|
28
26
|
snippet: str | None = None
|
|
29
27
|
url: str | None = None
|
|
30
28
|
|
|
31
29
|
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
"""
|
|
35
|
-
Response from Perplexity AI.
|
|
36
|
-
"""
|
|
30
|
+
class Response(BaseModel):
|
|
31
|
+
"""Response from Perplexity AI."""
|
|
37
32
|
|
|
38
33
|
title: str | None = None
|
|
39
34
|
answer: str | None = None
|
|
40
|
-
chunks: list[str] =
|
|
35
|
+
chunks: list[str] = []
|
|
41
36
|
last_chunk: str | None = None
|
|
42
|
-
search_results: list[SearchResultItem] =
|
|
37
|
+
search_results: list[SearchResultItem] = []
|
|
43
38
|
conversation_uuid: str | None = None
|
|
44
|
-
raw_data: dict[str, Any] =
|
|
39
|
+
raw_data: dict[str, Any] = {}
|
|
45
40
|
|
|
46
41
|
|
|
47
42
|
@dataclass(frozen=True, slots=True)
|
|
48
43
|
class _FileInfo:
|
|
49
|
-
"""
|
|
50
|
-
Internal file info for uploads.
|
|
51
|
-
"""
|
|
44
|
+
"""Internal file info for uploads."""
|
|
52
45
|
|
|
53
46
|
path: str
|
|
54
47
|
size: int
|