perplexity-webui-scraper 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,36 +1,27 @@
-"""Extract AI responses from Perplexity's web interface."""
+"""
+Extract AI responses from Perplexity's web interface.
+"""
 
 from importlib import metadata
 
 from .config import ClientConfig, ConversationConfig
 from .core import Conversation, Perplexity
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
-from .exceptions import (
-    AuthenticationError,
-    FileUploadError,
-    FileValidationError,
-    PerplexityError,
-    RateLimitError,
-)
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 from .models import Model, Models
 from .types import Coordinates, Response, SearchResultItem
 
 
 __version__: str = metadata.version("perplexity-webui-scraper")
 __all__: list[str] = [
-    "AuthenticationError",
     "CitationMode",
     "ClientConfig",
     "Conversation",
     "ConversationConfig",
     "Coordinates",
-    "FileUploadError",
-    "FileValidationError",
+    "LogLevel",
     "Model",
     "Models",
     "Perplexity",
-    "PerplexityError",
-    "RateLimitError",
    "Response",
    "SearchFocus",
    "SearchResultItem",
@@ -1,4 +1,6 @@
-"""CLI utility for secure Perplexity authentication and session extraction."""
+"""
+CLI utility for secure Perplexity authentication and session extraction.
+"""
 
 from __future__ import annotations
 
@@ -57,7 +59,9 @@ def update_env(token: str) -> bool:
 
 
 def _initialize_session() -> tuple[Session, str]:
-    """Initialize session and obtain CSRF token."""
+    """
+    Initialize session and obtain CSRF token.
+    """
 
     session = Session(impersonate="chrome", headers={"Referer": BASE_URL, "Origin": BASE_URL})
 
@@ -73,7 +77,9 @@ def _initialize_session() -> tuple[Session, str]:
 
 
 def _request_verification_code(session: Session, csrf: str, email: str) -> None:
-    """Send verification code to user's email."""
+    """
+    Send verification code to user's email.
+    """
 
     with console.status("[bold green]Sending verification code...", spinner="dots"):
         r = session.post(
@@ -92,7 +98,9 @@ def _request_verification_code(session: Session, csrf: str, email: str) -> None:
 
 
 def _validate_and_get_redirect_url(session: Session, email: str, user_input: str) -> str:
-    """Validate user input (OTP or magic link) and return redirect URL."""
+    """
+    Validate user input (OTP or magic link) and return redirect URL.
+    """
 
     with console.status("[bold green]Validating...", spinner="dots"):
         if user_input.startswith("http"):
@@ -120,7 +128,9 @@ def _validate_and_get_redirect_url(session: Session, email: str, user_input: str
 
 
 def _extract_session_token(session: Session, redirect_url: str) -> str:
-    """Extract session token from cookies after authentication."""
+    """
+    Extract session token from cookies after authentication.
+    """
 
     session.get(redirect_url)
     token = session.cookies.get("__Secure-next-auth.session-token")
@@ -132,7 +142,9 @@ def _extract_session_token(session: Session, redirect_url: str) -> str:
 
 
 def _display_and_save_token(token: str) -> None:
-    """Display token and optionally save to .env file."""
+    """
+    Display token and optionally save to .env file.
+    """
 
     console.print("\n[bold green]✅ Token generated successfully![/bold green]")
     console.print(f"\n[bold white]Your session token:[/bold white]\n[green]{token}[/green]\n")
@@ -147,7 +159,9 @@ def _display_and_save_token(token: str) -> None:
 
 
 def _show_header() -> None:
-    """Display welcome header."""
+    """
+    Display welcome header.
+    """
 
     console.print(
         Panel(
@@ -161,7 +175,9 @@ def _show_header() -> None:
 
 
 def _show_exit_message() -> None:
-    """Display security note and wait for user to exit."""
+    """
+    Display security note and wait for user to exit.
+    """
 
     console.print("\n[bold yellow]⚠️ Security Note:[/bold yellow]")
     console.print("Press [bold white]ENTER[/bold white] to clear screen and exit.")
@@ -1,21 +1,27 @@
-"""Configuration classes."""
+"""
+Configuration classes.
+"""
 
 from __future__ import annotations
 
 from dataclasses import dataclass
 from typing import TYPE_CHECKING
 
-from .enums import CitationMode, SearchFocus, SourceFocus, TimeRange
+from .enums import CitationMode, LogLevel, SearchFocus, SourceFocus, TimeRange
 
 
 if TYPE_CHECKING:
+    from pathlib import Path
+
     from .models import Model
     from .types import Coordinates
 
 
 @dataclass(slots=True)
 class ConversationConfig:
-    """Default settings for a conversation. Can be overridden per message."""
+    """
+    Default settings for a conversation. Can be overridden per message.
+    """
 
     model: Model | None = None
     citation_mode: CitationMode = CitationMode.CLEAN
@@ -30,7 +36,30 @@ class ConversationConfig:
 
 @dataclass(frozen=True, slots=True)
 class ClientConfig:
-    """HTTP client settings."""
+    """
+    HTTP client settings.
+
+    Attributes:
+        timeout: Request timeout in seconds.
+        impersonate: Browser to impersonate (e.g., "chrome", "edge", "safari").
+        max_retries: Maximum retry attempts for failed requests.
+        retry_base_delay: Initial delay in seconds before first retry.
+        retry_max_delay: Maximum delay between retries.
+        retry_jitter: Random jitter factor (0-1) to add to delays.
+        requests_per_second: Rate limit for requests (0 to disable).
+        rotate_fingerprint: Whether to rotate browser fingerprint on retries.
+        logging_level: Logging verbosity level. Default is DISABLED.
+        log_file: Optional file path for persistent logging. If set, logs go to file only.
+            If None, logs go to console. All logs are appended.
+    """
 
     timeout: int = 3600
     impersonate: str = "chrome"
+    max_retries: int = 3
+    retry_base_delay: float = 1.0
+    retry_max_delay: float = 60.0
+    retry_jitter: float = 0.5
+    requests_per_second: float = 0.5
+    rotate_fingerprint: bool = True
+    logging_level: LogLevel = LogLevel.DISABLED
+    log_file: str | Path | None = None
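
0.3.6 turns ClientConfig from a two-field dataclass into the knob set for retries, rate limiting, fingerprint rotation, and logging. Below is a usage sketch with the new fields; every field name comes from the hunk above, while `LogLevel.DEBUG` is an assumed member name (only `LogLevel.DISABLED` is visible in this diff).

```python
# Usage sketch for the expanded ClientConfig; LogLevel.DEBUG is an assumption,
# since only LogLevel.DISABLED appears in the diff.
from perplexity_webui_scraper import ClientConfig, LogLevel, Perplexity

config = ClientConfig(
    timeout=3600,
    impersonate="chrome",
    max_retries=5,             # retry failed requests up to five times
    retry_base_delay=2.0,      # first retry after roughly two seconds
    retry_max_delay=30.0,      # cap on the backoff delay
    retry_jitter=0.5,          # add up to 50% random jitter to each delay
    requests_per_second=0.25,  # at most one request every four seconds; 0 disables the limiter
    rotate_fingerprint=True,   # rotate the impersonated browser between retries
    logging_level=LogLevel.DEBUG,
    log_file="perplexity.log",  # when set, logs are appended to this file instead of the console
)

client = Perplexity(session_token="...", config=config)
```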
@@ -1,4 +1,6 @@
-"""Constants and values for the Perplexity internal API and HTTP interactions."""
+"""
+Constants and values for the Perplexity internal API and HTTP interactions.
+"""
 
 from __future__ import annotations
 
@@ -8,20 +10,30 @@ from typing import Final
 
 # API Configuration
 API_VERSION: Final[str] = "2.18"
-"""Current API version used by Perplexity WebUI."""
+"""
+Current API version used by Perplexity WebUI.
+"""
 
 API_BASE_URL: Final[str] = "https://www.perplexity.ai"
-"""Base URL for all API requests."""
+"""
+Base URL for all API requests.
+"""
 
 # API Endpoints
 ENDPOINT_ASK: Final[str] = "/rest/sse/perplexity_ask"
-"""SSE endpoint for sending prompts."""
+"""
+SSE endpoint for sending prompts.
+"""
 
 ENDPOINT_SEARCH_INIT: Final[str] = "/search/new"
-"""Endpoint to initialize a search session."""
+"""
+Endpoint to initialize a search session.
+"""
 
 ENDPOINT_UPLOAD: Final[str] = "/rest/uploads/batch_create_upload_urls"
-"""Endpoint for file upload URL generation."""
+"""
+Endpoint for file upload URL generation.
+"""
 
 # API Fixed Parameters
 SEND_BACK_TEXT: Final[bool] = True
@@ -33,10 +45,14 @@ False = API sends delta chunks only (accumulate mode).
 """
 
 USE_SCHEMATIZED_API: Final[bool] = False
-"""Whether to use the schematized API format."""
+"""
+Whether to use the schematized API format.
+"""
 
 PROMPT_SOURCE: Final[str] = "user"
-"""Source identifier for prompts."""
+"""
+Source identifier for prompts.
+"""
 
 # Regex Patterns (Pre-compiled for performance in streaming parsing)
 CITATION_PATTERN: Final[Pattern[str]] = compile(r"\[(\d{1,2})\]")
@@ -47,7 +63,9 @@ Uses word boundary to avoid matching things like [123].
 """
 
 JSON_OBJECT_PATTERN: Final[Pattern[str]] = compile(r"^\{.*\}$")
-"""Pattern to detect JSON object strings."""
+"""
+Pattern to detect JSON object strings.
+"""
 
 # HTTP Headers
 DEFAULT_HEADERS: Final[dict[str, str]] = {
@@ -61,4 +79,6 @@ Referer and Origin are added dynamically based on BASE_URL.
 """
 
 SESSION_COOKIE_NAME: Final[str] = "__Secure-next-auth.session-token"
-"""Name of the session cookie used for authentication."""
+"""
+Name of the session cookie used for authentication.
+"""
@@ -1,4 +1,6 @@
-"""Core client implementation."""
+"""
+Core client implementation.
+"""
 
 from __future__ import annotations
 
@@ -8,6 +10,8 @@ from pathlib import Path
 from typing import TYPE_CHECKING, Any
 from uuid import uuid4
 
+from curl_cffi import CurlMime
+from curl_cffi.requests import Session
 from orjson import JSONDecodeError, loads
 
 
@@ -26,20 +30,25 @@ from .constants import (
     USE_SCHEMATIZED_API,
 )
 from .enums import CitationMode
-from .exceptions import FileUploadError, FileValidationError
+from .exceptions import FileUploadError, FileValidationError, ResearchClarifyingQuestionsError, ResponseParsingError
 from .http import HTTPClient
 from .limits import MAX_FILE_SIZE, MAX_FILES
+from .logging import configure_logging, get_logger, log_conversation_created, log_query_sent
 from .models import Model, Models
 from .types import Response, SearchResultItem, _FileInfo
 
 
+logger = get_logger(__name__)
+
+
 class Perplexity:
     """Web scraper for Perplexity AI conversations."""
 
     __slots__ = ("_http",)
 
     def __init__(self, session_token: str, config: ClientConfig | None = None) -> None:
-        """Initialize web scraper with session token.
+        """
+        Initialize web scraper with session token.
 
         Args:
             session_token: Perplexity session cookie (__Secure-next-auth.session-token).
@@ -53,17 +62,71 @@ class Perplexity:
             raise ValueError("session_token cannot be empty")
 
         cfg = config or ClientConfig()
-        self._http = HTTPClient(session_token, timeout=cfg.timeout, impersonate=cfg.impersonate)
+
+        # Configure logging based on config
+        configure_logging(level=cfg.logging_level, log_file=cfg.log_file)
+
+        logger.info(
+            "Perplexity client initializing | "
+            f"session_token_length={len(session_token)} "
+            f"logging_level={cfg.logging_level.value} "
+            f"log_file={cfg.log_file}"
+        )
+        logger.debug(
+            "Client configuration | "
+            f"timeout={cfg.timeout}s "
+            f"impersonate={cfg.impersonate} "
+            f"max_retries={cfg.max_retries} "
+            f"retry_base_delay={cfg.retry_base_delay}s "
+            f"retry_max_delay={cfg.retry_max_delay}s "
+            f"retry_jitter={cfg.retry_jitter} "
+            f"requests_per_second={cfg.requests_per_second} "
+            f"rotate_fingerprint={cfg.rotate_fingerprint}"
+        )
+
+        self._http = HTTPClient(
+            session_token,
+            timeout=cfg.timeout,
+            impersonate=cfg.impersonate,
+            max_retries=cfg.max_retries,
+            retry_base_delay=cfg.retry_base_delay,
+            retry_max_delay=cfg.retry_max_delay,
+            retry_jitter=cfg.retry_jitter,
+            requests_per_second=cfg.requests_per_second,
+            rotate_fingerprint=cfg.rotate_fingerprint,
+        )
+
+        logger.info("Perplexity client initialized successfully")
 
     def create_conversation(self, config: ConversationConfig | None = None) -> Conversation:
         """Create a new conversation."""
 
-        return Conversation(self._http, config or ConversationConfig())
+        cfg = config or ConversationConfig()
+        logger.debug(
+            "Creating conversation | "
+            f"model={cfg.model} "
+            f"citation_mode={cfg.citation_mode} "
+            f"save_to_library={cfg.save_to_library} "
+            f"search_focus={cfg.search_focus} "
+            f"language={cfg.language}"
+        )
+
+        conversation = Conversation(self._http, cfg)
+
+        log_conversation_created(
+            f"model={cfg.model}, citation_mode={cfg.citation_mode}, "
+            f"search_focus={cfg.search_focus}, language={cfg.language}"
+        )
+        logger.info("Conversation created successfully")
+
+        return conversation
 
     def close(self) -> None:
         """Close the client."""
 
+        logger.debug("Closing Perplexity client")
         self._http.close()
+        logger.info("Perplexity client closed")
 
     def __enter__(self) -> Perplexity:
         return self
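
The constructor above now wires every ClientConfig field into HTTPClient and emits its own lifecycle logs. A lifecycle sketch follows; `__exit__` is not visible in this hunk, so the assumption is that leaving the `with` block simply calls `close()`.

```python
# Lifecycle sketch; __exit__ is not shown in this hunk, so calling close() on exit
# is an assumption based on __enter__ and close() above.
from perplexity_webui_scraper import ClientConfig, ConversationConfig, Perplexity

with Perplexity(session_token="...", config=ClientConfig()) as client:
    # create_conversation() accepts an optional ConversationConfig; defaults otherwise
    conversation = client.create_conversation(ConversationConfig())
    # ... ask questions here ...
# The underlying HTTPClient is closed once the block ends.
```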
@@ -90,6 +153,13 @@ class Conversation:
     )
 
     def __init__(self, http: HTTPClient, config: ConversationConfig) -> None:
+        logger.debug(
+            "Conversation.__init__ | "
+            f"model={config.model} "
+            f"citation_mode={config.citation_mode} "
+            f"save_to_library={config.save_to_library} "
+            f"search_focus={config.search_focus}"
+        )
         self._http = http
         self._config = config
         self._citation_mode = CitationMode.DEFAULT
@@ -101,6 +171,7 @@
         self._search_results: list[SearchResultItem] = []
         self._raw_data: dict[str, Any] = {}
         self._stream_generator: Generator[Response, None, None] | None = None
+        logger.debug("Conversation initialized with empty state")
 
     @property
     def answer(self) -> str | None:
@@ -142,11 +213,29 @@
     ) -> Conversation:
         """Ask a question. Returns self for method chaining or streaming iteration."""
 
+        logger.info(
+            "Conversation.ask called | "
+            f"query_length={len(query)} "
+            f"query_preview={query[:100]}{'...' if len(query) > 100 else ''} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"citation_mode={citation_mode} "
+            f"stream={stream}"
+        )
+
         effective_model = model or self._config.model or Models.BEST
         effective_citation = citation_mode if citation_mode is not None else self._config.citation_mode
         self._citation_mode = effective_citation
+
+        logger.debug(
+            f"Effective parameters | effective_model={effective_model} effective_citation={effective_citation}"
+        )
+
+        log_query_sent(query, str(effective_model), bool(files))
         self._execute(query, effective_model, files, stream=stream)
 
+        logger.debug("Query execution completed")
+
         return self
 
     def _execute(
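
ask() logs the call, resolves the effective model and citation mode (falling back to Models.BEST), records the query via log_query_sent, and returns self so calls chain. A chaining sketch under those assumptions; the file path is a placeholder.

```python
# Chaining sketch for Conversation.ask(); parameter names follow the logging above,
# and Models.BEST is the fallback used when no model is configured.
from perplexity_webui_scraper import Models, Perplexity

client = Perplexity(session_token="...")
conversation = client.create_conversation()

answer = (
    conversation
    .ask("Summarize the attached report", model=Models.BEST, files=["report.pdf"])
    .answer
)

# The is_followup log field in the next hunk suggests follow-ups reuse the stored
# backend_uuid, so a second ask() on the same object keeps the thread context.
follow_up = conversation.ask("List the three main risks it mentions").answer
```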
@@ -158,22 +247,49 @@
     ) -> None:
         """Execute a query."""
 
+        logger.debug(
+            f"Executing query | "
+            f"query_length={len(query)} "
+            f"model={model} "
+            f"files_count={len(files) if files else 0} "
+            f"stream={stream} "
+            f"is_followup={self._backend_uuid is not None}"
+        )
+
         self._reset_response_state()
+        logger.debug("Response state reset")
 
         # Upload files
         file_urls: list[str] = []
 
         if files:
+            logger.debug(f"Validating {len(files)} files")
             validated = self._validate_files(files)
+            logger.debug(f"Validated {len(validated)} files, uploading...")
             file_urls = [self._upload_file(f) for f in validated]
+            logger.debug(f"Uploaded {len(file_urls)} files successfully")
 
         payload = self._build_payload(query, model, file_urls)
+        logger.debug(
+            f"Payload built | payload_keys={list(payload.keys())} params_keys={list(payload.get('params', {}).keys())}"
+        )
+
+        logger.debug("Initializing search session")
         self._http.init_search(query)
 
         if stream:
+            logger.debug("Starting streaming mode")
             self._stream_generator = self._stream(payload)
         else:
+            logger.debug("Starting complete mode (non-streaming)")
             self._complete(payload)
+            logger.debug(
+                f"Query completed | "
+                f"title={self._title} "
+                f"answer_length={len(self._answer) if self._answer else 0} "
+                f"chunks_count={len(self._chunks)} "
+                f"search_results_count={len(self._search_results)}"
+            )
 
     def _reset_response_state(self) -> None:
         self._title = None
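
_execute() either runs _complete() straight away or just stores self._stream(payload) for later consumption. A hedged sketch of both call styles; the streaming loop leans on the ask() docstring's "streaming iteration" wording rather than on any documented `__iter__`, and the shape of the yielded Response objects is not shown in this diff.

```python
# Hedged sketch of the two execution modes; the streaming loop assumes the
# Conversation is iterable, per the "streaming iteration" docstring of ask().
from perplexity_webui_scraper import Perplexity

client = Perplexity(session_token="...")
conversation = client.create_conversation()

# Complete mode: _complete() runs inside ask(), so the answer is ready on return
print(conversation.ask("What are server-sent events?", stream=False).answer)

# Streaming mode: _execute() only stores the generator; iterating consumes it
for partial in conversation.ask("Explain HTTP/3 in detail", stream=True):
    print(partial)  # Response fields are not visible in this diff, so just print the item
```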
@@ -237,8 +353,8 @@
                     is_image=mimetype.startswith("image/"),
                 )
             )
-        except FileValidationError:
-            raise
+        except FileValidationError as error:
+            raise error
         except (FileNotFoundError, PermissionError) as error:
             raise FileValidationError(file_path, f"Cannot access file: {error}") from error
         except OSError as error:
@@ -264,16 +380,55 @@
         try:
             response = self._http.post(ENDPOINT_UPLOAD, json=json_data)
             response_data = response.json()
-            upload_url = response_data.get("results", {}).get(file_uuid, {}).get("s3_object_url")
+            result = response_data.get("results", {}).get(file_uuid, {})
+
+            s3_bucket_url = result.get("s3_bucket_url")
+            s3_object_url = result.get("s3_object_url")
+            fields = result.get("fields", {})
 
-            if not upload_url:
+            if not s3_object_url:
                 raise FileUploadError(file_info.path, "No upload URL returned")
 
-            return upload_url
+            if not s3_bucket_url or not fields:
+                raise FileUploadError(file_info.path, "Missing S3 upload credentials")
+
+            # Upload the file to S3 using presigned POST
+            file_path = Path(file_info.path)
+
+            with file_path.open("rb") as f:
+                file_content = f.read()
+
+            # Build multipart form data using CurlMime
+            # For S3 presigned POST, form fields must come before the file
+            mime = CurlMime()
+
+            for field_name, field_value in fields.items():
+                mime.addpart(name=field_name, data=field_value)
+
+            mime.addpart(
+                name="file",
+                content_type=file_info.mimetype,
+                filename=file_path.name,
+                data=file_content,
+            )
+
+            # S3 requires a clean session
+            with Session() as s3_session:
+                upload_response = s3_session.post(s3_bucket_url, multipart=mime)
+
+            mime.close()
+
+            if upload_response.status_code not in (200, 201, 204):
+                raise FileUploadError(
+                    file_info.path,
+                    f"S3 upload failed with status {upload_response.status_code}: {upload_response.text}",
+                )
+
+            return s3_object_url
         except FileUploadError as error:
             raise error
-        except Exception as e:
-            raise FileUploadError(file_info.path, str(e)) from e
+        except Exception as error:
+            raise FileUploadError(file_info.path, str(error)) from error
 
     def _build_payload(
         self,
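
The rewritten _upload_file() is the functional core of this release: 0.3.4 stopped after requesting an upload URL, while 0.3.6 actually performs the S3 presigned POST, sending the returned policy fields before the file part and using a cookie-free session. Below is an isolated sketch of the same pattern; the response keys (`s3_bucket_url`, `s3_object_url`, `fields`) come from the hunk above, while the concrete contents of `fields` are whatever the API returns.

```python
# Isolated sketch of the presigned-POST pattern above, using only curl_cffi calls
# that the diff itself uses (CurlMime.addpart, Session.post(..., multipart=...)).
from pathlib import Path

from curl_cffi import CurlMime
from curl_cffi.requests import Session


def upload_via_presigned_post(result: dict, file_path: Path, mimetype: str) -> str:
    mime = CurlMime()

    # S3 evaluates the policy fields before the file part, so add them first
    for name, value in result["fields"].items():
        mime.addpart(name=name, data=value)

    mime.addpart(
        name="file",
        content_type=mimetype,
        filename=file_path.name,
        data=file_path.read_bytes(),
    )

    # A fresh session avoids leaking Perplexity auth cookies to the S3 endpoint
    with Session() as s3_session:
        response = s3_session.post(result["s3_bucket_url"], multipart=mime)
    mime.close()

    if response.status_code not in (200, 201, 204):
        raise RuntimeError(f"S3 upload failed with status {response.status_code}")
    return result["s3_object_url"]
```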
@@ -348,20 +503,26 @@
         return CITATION_PATTERN.sub(replacer, text)
 
     def _parse_line(self, line: str | bytes) -> dict[str, Any] | None:
-        prefix = b"data: " if isinstance(line, bytes) else "data: "
+        if isinstance(line, bytes) and line.startswith(b"data: "):
+            return loads(line[6:])
 
-        if (isinstance(line, bytes) and line.startswith(prefix)) or (isinstance(line, str) and line.startswith(prefix)):
+        if isinstance(line, str) and line.startswith("data: "):
             return loads(line[6:])
 
         return None
 
     def _process_data(self, data: dict[str, Any]) -> None:
+        """Process SSE data chunk and update conversation state."""
+
         if self._backend_uuid is None and "backend_uuid" in data:
             self._backend_uuid = data["backend_uuid"]
 
         if self._read_write_token is None and "read_write_token" in data:
             self._read_write_token = data["read_write_token"]
 
+        if self._title is None and "thread_title" in data:
+            self._title = data["thread_title"]
+
         if "blocks" in data:
             for block in data["blocks"]:
                 if block.get("intended_usage") == "web_results":
@@ -376,16 +537,24 @@
 
         try:
             json_data = loads(data["text"])
-        except KeyError as e:
-            raise ValueError("Missing 'text' field in data") from e
-        except JSONDecodeError as e:
-            raise ValueError("Invalid JSON in 'text' field") from e
+        except KeyError as error:
+            raise ValueError("Missing 'text' field in data") from error
+        except JSONDecodeError as error:
+            raise ValueError("Invalid JSON in 'text' field") from error
 
         answer_data: dict[str, Any] = {}
 
         if isinstance(json_data, list):
             for item in json_data:
-                if item.get("step_type") == "FINAL":
+                step_type = item.get("step_type")
+
+                # Handle Research mode clarifying questions
+                if step_type == "RESEARCH_CLARIFYING_QUESTIONS":
+                    questions = self._extract_clarifying_questions(item)
+
+                    raise ResearchClarifyingQuestionsError(questions)
+
+                if step_type == "FINAL":
                     raw_content = item.get("content", {})
                     answer_content = raw_content.get("answer")
 
@@ -400,7 +569,39 @@
         elif isinstance(json_data, dict):
             self._update_state(data.get("thread_title"), json_data)
         else:
-            raise ValueError("Unexpected JSON structure in 'text' field")
+            raise ResponseParsingError(
+                "Unexpected JSON structure in 'text' field",
+                raw_data=str(json_data),
+            )
+
+    def _extract_clarifying_questions(self, item: dict[str, Any]) -> list[str]:
+        """Extract clarifying questions from a RESEARCH_CLARIFYING_QUESTIONS step."""
+
+        questions: list[str] = []
+        content = item.get("content", {})
+
+        # Try different possible structures for questions
+        if isinstance(content, dict):
+            if "questions" in content:
+                raw_questions = content["questions"]
+
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif "clarifying_questions" in content:
+                raw_questions = content["clarifying_questions"]
+
+                if isinstance(raw_questions, list):
+                    questions = [str(q) for q in raw_questions if q]
+            elif not questions:
+                for value in content.values():
+                    if isinstance(value, str) and "?" in value:
+                        questions.append(value)
+        elif isinstance(content, list):
+            questions = [str(q) for q in content if q]
+        elif isinstance(content, str):
+            questions = [content]
+
+        return questions
 
     def _update_state(self, title: str | None, answer_data: dict[str, Any]) -> None:
         self._title = title
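
With a Research-style model, the SSE stream can emit a RESEARCH_CLARIFYING_QUESTIONS step instead of a final answer; _process_data() then raises ResearchClarifyingQuestionsError carrying whatever questions _extract_clarifying_questions() could recover. A handling sketch follows; the constructor takes the question list, but the attribute name used to read it back is an assumption, hence the defensive getattr().

```python
# Handling sketch for Research mode; the exception is raised with the question list
# above, but the attribute name `questions` is an assumption, hence getattr().
from perplexity_webui_scraper import Perplexity
from perplexity_webui_scraper.exceptions import ResearchClarifyingQuestionsError

client = Perplexity(session_token="...")
conversation = client.create_conversation()

try:
    conversation.ask("Plan a week-long research project on battery recycling")
except ResearchClarifyingQuestionsError as error:
    # The service wants more detail before it runs the research flow
    for question in getattr(error, "questions", []):
        print(f"- {question}")
```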
@@ -426,7 +627,8 @@
         chunks = answer_data.get("chunks", [])
 
         if chunks:
-            self._chunks = chunks
+            formatted = [self._format_citations(chunk) for chunk in chunks if chunk is not None]
+            self._chunks = [c for c in formatted if c is not None]
 
         self._raw_data = answer_data