perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- perplexity_webui_scraper/__init__.py +5 -14
- perplexity_webui_scraper/cli/get_perplexity_session_token.py +24 -8
- perplexity_webui_scraper/config.py +33 -4
- perplexity_webui_scraper/constants.py +30 -10
- perplexity_webui_scraper/core.py +223 -21
- perplexity_webui_scraper/enums.py +91 -19
- perplexity_webui_scraper/exceptions.py +77 -1
- perplexity_webui_scraper/http.py +374 -38
- perplexity_webui_scraper/limits.py +12 -4
- perplexity_webui_scraper/logging.py +278 -0
- perplexity_webui_scraper/mcp/__init__.py +20 -0
- perplexity_webui_scraper/mcp/__main__.py +11 -0
- perplexity_webui_scraper/mcp/server.py +166 -0
- perplexity_webui_scraper/models.py +55 -19
- perplexity_webui_scraper/resilience.py +181 -0
- perplexity_webui_scraper/types.py +15 -5
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/METADATA +97 -7
- perplexity_webui_scraper-0.3.6.dist-info/RECORD +21 -0
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/WHEEL +1 -1
- {perplexity_webui_scraper-0.3.4.dist-info → perplexity_webui_scraper-0.3.6.dist-info}/entry_points.txt +1 -0
- perplexity_webui_scraper-0.3.4.dist-info/RECORD +0 -16
perplexity_webui_scraper/__init__.py
CHANGED

@@ -1,36 +1,27 @@
-"""
+"""
+Extract AI responses from Perplexity's web interface.
+"""
 
 from importlib import metadata
 
 from .config import ClientConfig, ConversationConfig
 from .core import Conversation, Perplexity
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
-from .exceptions import (
-    AuthenticationError,
-    FileUploadError,
-    FileValidationError,
-    PerplexityError,
-    RateLimitError,
-)
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 from .models import Model, Models
 from .types import Coordinates, Response, SearchResultItem
 
 
 __version__: str = metadata.version("perplexity-webui-scraper")
 __all__: list[str] = [
-    "AuthenticationError",
     "CitationMode",
     "ClientConfig",
     "Conversation",
     "ConversationConfig",
     "Coordinates",
-    "FileUploadError",
-    "FileValidationError",
+    "LogLevel",
     "Model",
     "Models",
     "Perplexity",
-    "PerplexityError",
-    "RateLimitError",
     "Response",
     "SearchFocus",
     "SearchResultItem",
perplexity_webui_scraper/cli/get_perplexity_session_token.py
CHANGED

@@ -1,4 +1,6 @@
-"""
+"""
+CLI utility for secure Perplexity authentication and session extraction.
+"""
 
 from __future__ import annotations
 
@@ -57,7 +59,9 @@ def update_env(token: str) -> bool:
 
 
 def _initialize_session() -> tuple[Session, str]:
-    """
+    """
+    Initialize session and obtain CSRF token.
+    """
 
     session = Session(impersonate="chrome", headers={"Referer": BASE_URL, "Origin": BASE_URL})
 
@@ -73,7 +77,9 @@ def _initialize_session() -> tuple[Session, str]:
 
 
 def _request_verification_code(session: Session, csrf: str, email: str) -> None:
-    """
+    """
+    Send verification code to user's email.
+    """
 
     with console.status("[bold green]Sending verification code...", spinner="dots"):
         r = session.post(
@@ -92,7 +98,9 @@ def _request_verification_code(session: Session, csrf: str, email: str) -> None:
 
 
 def _validate_and_get_redirect_url(session: Session, email: str, user_input: str) -> str:
-    """
+    """
+    Validate user input (OTP or magic link) and return redirect URL.
+    """
 
     with console.status("[bold green]Validating...", spinner="dots"):
         if user_input.startswith("http"):
@@ -120,7 +128,9 @@ def _validate_and_get_redirect_url(session: Session, email: str, user_input: str
 
 
 def _extract_session_token(session: Session, redirect_url: str) -> str:
-    """
+    """
+    Extract session token from cookies after authentication.
+    """
 
     session.get(redirect_url)
     token = session.cookies.get("__Secure-next-auth.session-token")
@@ -132,7 +142,9 @@ def _extract_session_token(session: Session, redirect_url: str) -> str:
 
 
 def _display_and_save_token(token: str) -> None:
-    """
+    """
+    Display token and optionally save to .env file.
+    """
 
     console.print("\n[bold green]✅ Token generated successfully![/bold green]")
     console.print(f"\n[bold white]Your session token:[/bold white]\n[green]{token}[/green]\n")
@@ -147,7 +159,9 @@ def _display_and_save_token(token: str) -> None:
 
 
 def _show_header() -> None:
-    """
+    """
+    Display welcome header.
+    """
 
     console.print(
         Panel(
@@ -161,7 +175,9 @@ def _show_header() -> None:
 
 
 def _show_exit_message() -> None:
-    """
+    """
+    Display security note and wait for user to exit.
+    """
 
     console.print("\n[bold yellow]⚠️ Security Note:[/bold yellow]")
     console.print("Press [bold white]ENTER[/bold white] to clear screen and exit.")
perplexity_webui_scraper/config.py
CHANGED

@@ -1,21 +1,27 @@
-"""
+"""
+Configuration classes.
+"""
 
 from __future__ import annotations
 
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 
 
 if TYPE_CHECKING:
+    from pathlib import Path
+
     from .models import Model
     from .types import Coordinates
 
 
 @dataclass(slots=True)
 class ConversationConfig:
-    """
+    """
+    Default settings for a conversation. Can be overridden per message.
+    """
 
     model: Model | None = None
     citation_mode: CitationMode = CitationMode.CLEAN
@@ -30,7 +36,30 @@ class ConversationConfig:
 
 @dataclass(frozen=True, slots=True)
 class ClientConfig:
-    """
+    """
+    HTTP client settings.
+
+    Attributes:
+        timeout: Request timeout in seconds.
+        impersonate: Browser to impersonate (e.g., "chrome", "edge", "safari").
+        max_retries: Maximum retry attempts for failed requests.
+        retry_base_delay: Initial delay in seconds before first retry.
+        retry_max_delay: Maximum delay between retries.
+        retry_jitter: Random jitter factor (0-1) to add to delays.
+        requests_per_second: Rate limit for requests (0 to disable).
+        rotate_fingerprint: Whether to rotate browser fingerprint on retries.
+        logging_level: Logging verbosity level. Default is DISABLED.
+        log_file: Optional file path for persistent logging. If set, logs go to file only.
+            If None, logs go to console. All logs are appended.
+    """
 
     timeout: int = 3600
     impersonate: str = "chrome"
+    max_retries: int = 3
+    retry_base_delay: float = 1.0
+    retry_max_delay: float = 60.0
+    retry_jitter: float = 0.5
+    requests_per_second: float = 0.5
+    rotate_fingerprint: bool = True
+    logging_level: LogLevel = LogLevel.DISABLED
+    log_file: str | Path | None = None
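For orientation, a minimal usage sketch of the ClientConfig fields introduced above (not taken from the package docs). The field names and defaults come from this diff; the concrete values are illustrative, and the LogLevel.DEBUG member is assumed from the logging additions (only LogLevel.DISABLED is shown here).

    from perplexity_webui_scraper import ClientConfig, LogLevel, Perplexity

    config = ClientConfig(
        timeout=600,
        max_retries=5,                # retry failed requests up to 5 times
        retry_base_delay=2.0,         # exponential backoff starting at 2 s
        retry_max_delay=30.0,
        requests_per_second=0.25,     # throttle to one request every 4 s (0 disables)
        rotate_fingerprint=True,
        logging_level=LogLevel.DEBUG,     # assumed member; the default is LogLevel.DISABLED
        log_file="perplexity_scraper.log",
    )

    client = Perplexity(session_token="<your __Secure-next-auth.session-token>", config=config)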
perplexity_webui_scraper/constants.py
CHANGED

@@ -1,4 +1,6 @@
-"""
+"""
+Constants and values for the Perplexity internal API and HTTP interactions.
+"""
 
 from __future__ import annotations
 
@@ -8,20 +10,30 @@ from typing import Final
 
 # API Configuration
 API_VERSION: Final[str] = "2.18"
-"""
+"""
+Current API version used by Perplexity WebUI.
+"""
 
 API_BASE_URL: Final[str] = "https://www.perplexity.ai"
-"""
+"""
+Base URL for all API requests.
+"""
 
 # API Endpoints
 ENDPOINT_ASK: Final[str] = "/rest/sse/perplexity_ask"
-"""
+"""
+SSE endpoint for sending prompts.
+"""
 
 ENDPOINT_SEARCH_INIT: Final[str] = "/search/new"
-"""
+"""
+Endpoint to initialize a search session.
+"""
 
 ENDPOINT_UPLOAD: Final[str] = "/rest/uploads/batch_create_upload_urls"
-"""
+"""
+Endpoint for file upload URL generation.
+"""
 
 # API Fixed Parameters
 SEND_BACK_TEXT: Final[bool] = True
@@ -33,10 +45,14 @@ False = API sends delta chunks only (accumulate mode).
 """
 
 USE_SCHEMATIZED_API: Final[bool] = False
-"""
+"""
+Whether to use the schematized API format.
+"""
 
 PROMPT_SOURCE: Final[str] = "user"
-"""
+"""
+Source identifier for prompts.
+"""
 
 # Regex Patterns (Pre-compiled for performance in streaming parsing)
 CITATION_PATTERN: Final[Pattern[str]] = compile(r"\[(\d{1,2})\]")
@@ -47,7 +63,9 @@ Uses word boundary to avoid matching things like [123].
 """
 
 JSON_OBJECT_PATTERN: Final[Pattern[str]] = compile(r"^\{.*\}$")
-"""
+"""
+Pattern to detect JSON object strings.
+"""
 
 # HTTP Headers
 DEFAULT_HEADERS: Final[dict[str, str]] = {
@@ -61,4 +79,6 @@ Referer and Origin are added dynamically based on BASE_URL.
 """
 
 SESSION_COOKIE_NAME: Final[str] = "__Secure-next-auth.session-token"
-"""
+"""
+Name of the session cookie used for authentication.
+"""
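As a quick illustration of the citation pattern documented above (this snippet is not part of the package), the pre-compiled regex matches bracketed one- or two-digit markers and can be used with sub() to strip or rewrite them; longer numbers such as [123] are left alone:

    from re import compile

    CITATION_PATTERN = compile(r"\[(\d{1,2})\]")  # same pattern as constants.py

    text = "The answer is 42.[1][2] See also [123]."
    print(CITATION_PATTERN.sub("", text))              # The answer is 42. See also [123].
    print(CITATION_PATTERN.sub(r"(source \1)", text))  # The answer is 42.(source 1)(source 2) See also [123].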
perplexity_webui_scraper/core.py
CHANGED

@@ -1,4 +1,6 @@
-"""
+"""
+Core client implementation.
+"""
 
 from __future__ import annotations
 
@@ -8,6 +10,8 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from uuid import uuid4
 
+from curl_cffi import CurlMime
+from curl_cffi.requests import Session
 from orjson import JSONDecodeError, loads
 
 
@@ -26,20 +30,25 @@ from .constants import (
     USE_SCHEMATIZED_API,
 )
 from .enums import CitationMode
-from .exceptions import FileUploadError, FileValidationError
+from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
 from .http import HTTPClient
 from .limits import MAX_FILE_SIZE, MAX_FILES
+from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
 from .models import Model, Models
 from .types import Response, SearchResultItem, _FileInfo
 
 
+logger = get_logger(__name__)
+
+
 class Perplexity:
     """Web scraper for Perplexity AI conversations."""
 
     __slots__ = ("_http",)
 
     def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
-        """
+        """
+        Initialize web scraper with session token.
 
         Args:
             session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,17 +62,71 @@ class Perplexity:
             raise ValueError("session_token cannot be empty")
 
         cfg = config or ClientConfig()
-
+
+        # Configure logging based on config
+        configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
+
+        logger.info(
+            "Perplexity client initializing | "
+            f"session_token_length={len(session_token)} "
+            f"logging_level={cfg.logging_level.value} "
+            f"log_file={cfg.log_file}"
+        )
+        logger.debug(
+            "Client configuration | "
+            f"timeout={cfg.timeout}s "
+            f"impersonate={cfg.impersonate} "
+            f"max_retries={cfg.max_retries} "
+            f"retry_base_delay={cfg.retry_base_delay}s "
+            f"retry_max_delay={cfg.retry_max_delay}s "
+            f"retry_jitter={cfg.retry_jitter} "
+            f"requests_per_second={cfg.requests_per_second} "
+            f"rotate_fingerprint={cfg.rotate_fingerprint}"
+        )
+
+        self._http = HTTPClient(
+            session_token,
+            timeout=cfg.timeout,
+            impersonate=cfg.impersonate,
+            max_retries=cfg.max_retries,
+            retry_base_delay=cfg.retry_base_delay,
+            retry_max_delay=cfg.retry_max_delay,
+            retry_jitter=cfg.retry_jitter,
+            requests_per_second=cfg.requests_per_second,
+            rotate_fingerprint=cfg.rotate_fingerprint,
+        )
+
+        logger.info("Perplexity client initialized successfully")
 
     def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
         """Create a new conversation."""
 
-
+        cfg = config or ConversationConfig()
+        logger.debug(
+            "Creating conversation | "
+            f"model={cfg.model} "
+            f"citation_mode={cfg.citation_mode} "
+            f"save_to_library={cfg.save_to_library} "
+            f"search_focus={cfg.search_focus} "
+            f"language={cfg.language}"
+        )
+
+        conversation = Conversation(self._http, cfg)
+
+        log_conversation_created(
+            f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
+            f"search_focus={cfg.search_focus}, language={cfg.language}"
+        )
+        logger.info("Conversation created successfully")
+
+        return conversation
 
     def close(self) -> None:
         """Close the client."""
 
+        logger.debug("Closing Perplexity client")
         self._http.close()
+        logger.info("Perplexity client closed")
 
     def __enter__(self) -> Perplexity:
         return self
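To make the lifecycle in this hunk concrete, a brief usage sketch (not from the package docs): create_conversation() and close() appear above, __enter__ returns self, and a matching __exit__ that calls close() is assumed for context-manager use. The token value is a placeholder.

    from perplexity_webui_scraper import ClientConfig, Perplexity

    with Perplexity("<session-token>", config=ClientConfig()) as client:
        conversation = client.create_conversation()
        # ... ask questions here; the underlying HTTP client is closed on exit.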
@@ -90,6 +153,13 @@ class Conversation:
     )
 
     def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
+        logger.debug(
+            "Conversation.__init__ | "
+            f"model={config.model} "
+            f"citation_mode={config.citation_mode} "
+            f"save_to_library={config.save_to_library} "
+            f"search_focus={config.search_focus}"
+        )
         self._http = http
         self._config = config
         self._citation_mode = CitationMode.DEFAULT
@@ -101,6 +171,7 @@ class Conversation:
         self._search_results: list[SearchResultItem] = []
         self._raw_data: dict[str, Any] = {}
         self._stream_generator: Generator[Response, None, None] | None = None
+        logger.debug("Conversation initialized with empty state")
 
     @property
     def answer(self) -> str | None:
@@ -142,11 +213,29 @@ class Conversation:
     ) -> Conversation:
         """Ask a question. Returns self for method chaining or streaming iteration."""
 
+        logger.info(
+            "Conversation.ask called | "
+            f"query_length={len(query)} "
+            f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"citation_mode={citation_mode} "
+            f"stream={stream}"
+        )
+
         effective_model = model or self._config.model or Models.BEST
         effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
         self._citation_mode = effective_citation
+
+        logger.debug(
+            f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
+        )
+
+        log_query_sent(query, str(effective_model), bool(files))
         self._execute(query, effective_model, files, stream=stream)
 
+        logger.debug("Query execution completed")
+
         return self
 
     def _execute(
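Continuing the earlier sketch, ask() returns the conversation itself, so the answer property (defined earlier in this file) can be read after a non-streaming call. The query text is illustrative; streaming via stream=True is only hinted at by the docstring above and is not demonstrated here.

    conversation = client.create_conversation()
    conversation.ask("Summarize the changes in Python 3.13", stream=False)

    print(conversation.answer)   # full answer text, or None if nothing was parsed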
@@ -158,22 +247,49 @@ class Conversation:
     ) -> None:
         """Execute a query."""
 
+        logger.debug(
+            f"Executing query | "
+            f"query_length={len(query)} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"stream={stream} "
+            f"is_followup={self._backend_uuid is not None}"
+        )
+
         self._reset_response_state()
+        logger.debug("Response state reset")
 
         # Upload files
         file_urls: list[str] = []
 
         if files:
+            logger.debug(f"Validating {len(files)} files")
             validated = self._validate_files(files)
+            logger.debug(f"Validated {len(validated)} files, uploading...")
             file_urls = [self._upload_file(f) for f in validated]
+            logger.debug(f"Uploaded {len(file_urls)} files successfully")
 
         payload = self._build_payload(query, model, file_urls)
+        logger.debug(
+            f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
+        )
+
+        logger.debug("Initializing search session")
         self._http.init_search(query)
 
         if stream:
+            logger.debug("Starting streaming mode")
             self._stream_generator = self._stream(payload)
         else:
+            logger.debug("Starting complete mode (non-streaming)")
             self._complete(payload)
+            logger.debug(
+                f"Query completed | "
+                f"title={self._title} "
+                f"answer_length={len(self._answer) if self._answer else 0} "
+                f"chunks_count={len(self._chunks)} "
+                f"search_results_count={len(self._search_results)}"
+            )
 
     def _reset_response_state(self) -> None:
         self._title = None
@@ -237,8 +353,8 @@ class Conversation:
                     is_image=mimetype.startswith("image/"),
                 )
             )
-        except FileValidationError:
-            raise
+        except FileValidationError as error:
+            raise error
         except (FileNotFoundError, PermissionError) as error:
             raise FileValidationError(file_path, f"Cannot access file: {error}") from error
         except OSError as error:
@@ -264,16 +380,55 @@ class Conversation:
         try:
             response = self._http.post(ENDPOINT_UPLOAD, json=json_data)
             response_data = response.json()
-
+            result = response_data.get("results", {}).get(file_uuid, {})
+
+            s3_bucket_url = result.get("s3_bucket_url")
+            s3_object_url = result.get("s3_object_url")
+            fields = result.get("fields", {})
 
-            if not
+            if not s3_object_url:
                 raise FileUploadError(file_info.path, "No upload URL returned")
 
-
+            if not s3_bucket_url or not fields:
+                raise FileUploadError(file_info.path, "Missing S3 upload credentials")
+
+            # Upload the file to S3 using presigned POST
+            file_path = Path(file_info.path)
+
+            with file_path.open("rb") as f:
+                file_content = f.read()
+
+            # Build multipart form data using CurlMime
+            # For S3 presigned POST, form fields must come before the file
+            mime = CurlMime()
+
+            for field_name, field_value in fields.items():
+                mime.addpart(name=field_name, data=field_value)
+
+            mime.addpart(
+                name="file",
+                content_type=file_info.mimetype,
+                filename=file_path.name,
+                data=file_content,
+            )
+
+            # S3 requires a clean session
+            with Session() as s3_session:
+                upload_response = s3_session.post(s3_bucket_url, multipart=mime)
+
+            mime.close()
+
+            if upload_response.status_code not in (200, 201, 204):
+                raise FileUploadError(
+                    file_info.path,
+                    f"S3 upload failed with status {upload_response.status_code}: {upload_response.text}",
+                )
+
+            return s3_object_url
         except FileUploadError as error:
             raise error
-        except Exception as
-            raise FileUploadError(file_info.path, str(
+        except Exception as error:
+            raise FileUploadError(file_info.path, str(error)) from error
 
     def _build_payload(
         self,
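Because the upload path above can now fail at several distinct points (missing presigned-POST fields, an S3 rejection, or local validation), a small defensive-usage sketch follows; it continues the conversation from the earlier example, and passing file paths as plain strings is an assumption based on the Path(file_info.path) handling in this hunk.

    from perplexity_webui_scraper.exceptions import FileUploadError, FileValidationError

    try:
        conversation.ask("What does the attached report conclude?", files=["./report.pdf"])
    except FileValidationError as error:
        print(f"Rejected before upload (size, type, or access): {error}")
    except FileUploadError as error:
        print(f"Presigned-URL request or S3 upload failed: {error}")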
@@ -348,20 +503,26 @@ class Conversation:
         return CITATION_PATTERN.sub(replacer, text)
 
     def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
-
+        if isinstance(line, bytes) and line.startswith(b"data: "):
+            return loads(line[6:])
 
-        if
+        if isinstance(line, str) and line.startswith("data: "):
             return loads(line[6:])
 
         return None
 
     def _process_data(self, data: dict[str, Any]) -> None:
+        """Process SSE data chunk and update conversation state."""
+
         if self._backend_uuid is None and "backend_uuid" in data:
             self._backend_uuid = data["backend_uuid"]
 
         if self._read_write_token is None and "read_write_token" in data:
             self._read_write_token = data["read_write_token"]
 
+        if self._title is None and "thread_title" in data:
+            self._title = data["thread_title"]
+
         if "blocks" in data:
             for block in data["blocks"]:
                 if block.get("intended_usage") == "web_results":
@@ -376,16 +537,24 @@ class Conversation:
 
         try:
             json_data = loads(data["text"])
-        except KeyError as
-            raise ValueError("Missing 'text' field in data") from
-        except JSONDecodeError as
-            raise ValueError("Invalid JSON in 'text' field") from
+        except KeyError as error:
+            raise ValueError("Missing 'text' field in data") from error
+        except JSONDecodeError as error:
+            raise ValueError("Invalid JSON in 'text' field") from error
 
         answer_data: dict[str, Any] = {}
 
         if isinstance(json_data, list):
             for item in json_data:
-
+                step_type = item.get("step_type")
+
+                # Handle Research mode clarifying questions
+                if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
+                    questions = self._extract_clarifying_questions(item)
+
+                    raise ResearchClarifyingQuestionsError(questions)
+
+                if step_type == "FINAL":
                     raw_content = item.get("content", {})
                     answer_content = raw_content.get("answer")
 
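Research-style prompts can now surface clarifying questions instead of a final answer, raised as ResearchClarifyingQuestionsError. A hedged handling sketch follows (continuing the earlier conversation); the exception is constructed with the extracted question list above, but how it exposes them, attribute versus message, lives in exceptions.py and is not shown in this diff.

    from perplexity_webui_scraper.exceptions import ResearchClarifyingQuestionsError

    try:
        conversation.ask("Prepare a deep research report on small modular reactors")
    except ResearchClarifyingQuestionsError as error:
        # Fall back to the string form; the question list may also be available
        # as an attribute on the exception (not shown in this diff).
        print(f"Perplexity wants clarification before researching: {error}")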
@@ -400,7 +569,39 @@ class Conversation:
         elif isinstance(json_data, dict):
             self._update_state(data.get("thread_title"), json_data)
         else:
-            raise
+            raise ResponseParsingError(
+                "Unexpected JSON structure in 'text' field",
+                raw_data=str(json_data),
+            )
+
+    def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
+        """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
+
+        questions: list[str] = []
+        content = item.get("content", {})
+
+        # Try different possible structures for questions
+        if isinstance(content, dict):
+            if "questions" in content:
+                raw_questions = content["questions"]
+
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif "clarifying_questions" in content:
+                raw_questions = content["clarifying_questions"]
+
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif not questions:
+                for value in content.values():
+                    if isinstance(value, str) and "?" in value:
+                        questions.append(value)
+        elif isinstance(content, list):
+            questions = [str(q) for q in content if q]
+        elif isinstance(content, str):
+            questions = [content]
+
+        return questions
 
     def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
         self._title = title
@@ -426,7 +627,8 @@ class Conversation:
         chunks = answer_data.get("chunks", [])
 
         if chunks:
-            self.
+            formatted = [self._format_citations(chunk) for chunk in chunks if chunk is not None]
+            self._chunks = [c for c in formatted if c is not None]
 
         self._raw_data = answer_data
 