chatterer 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,631 +1,649 @@
1
- """
2
- PlaywrightBot
3
-
4
- This module provides a single class that uses Playwright to:
5
- - Fetch and render HTML pages (with JavaScript execution),
6
- - Optionally scroll down or reload pages,
7
- - Convert rendered HTML into Markdown,
8
- - Extract specific elements using CSS selectors,
9
- - Filter key information from a page via integration with a language model (Chatterer).
10
-
11
- Both synchronous and asynchronous methods are available in this unified class.
12
- Use the synchronous methods (without the "a" prefix) in a normal context manager,
13
- or use the asynchronous methods (prefixed with "a") within an async context manager.
14
- """
15
-
16
- import asyncio
17
- from dataclasses import dataclass, field
18
- from traceback import format_exception_only, print_exc
19
- from types import TracebackType
20
- from typing import (
21
- Awaitable,
22
- Optional,
23
- Self,
24
- Type,
25
- TypeGuard,
26
- Union,
27
- )
28
-
29
- import playwright.async_api
30
- import playwright.sync_api
31
-
32
- from ...language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
33
- from ..convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
34
- from .utils import (
35
- DEFAULT_UA,
36
- ImageDescriptionAndReferences,
37
- ImageProcessingConfig,
38
- MarkdownLink,
39
- PlaywrightLaunchOptions,
40
- PlaywrightPersistencyOptions,
41
- SelectedLineRanges,
42
- WaitUntil,
43
- aget_image_url_and_markdown_links,
44
- get_default_image_processing_config,
45
- get_default_playwright_launch_options,
46
- get_image_url_and_markdown_links,
47
- replace_images,
48
- )
49
-
50
-
51
- @dataclass
52
- class PlayWrightBot:
53
- """
54
- A unified bot that leverages Playwright to render web pages, convert them to Markdown,
55
- extract elements, and filter key information using a language model.
56
-
57
- This class exposes both synchronous and asynchronous methods.
58
-
59
- Synchronous usage:
60
- with PlayWrightBot() as bot:
61
- md = bot.url_to_md("https://example.com")
62
- headings = bot.select_and_extract("https://example.com", "h2")
63
- filtered_md = bot.url_to_md_with_llm("https://example.com")
64
-
65
- Asynchronous usage:
66
- async with PlayWrightBot() as bot:
67
- md = await bot.aurl_to_md("https://example.com")
68
- headings = await bot.aselect_and_extract("https://example.com", "h2")
69
- filtered_md = await bot.aurl_to_md_with_llm("https://example.com")
70
-
71
- Attributes:
72
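- playwright_launch_options (PlaywrightLaunchOptions): Keyword options forwarded to the Chromium launch call; there is no separate `headless` attribute, so headless mode is presumably configured here.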
73
- chatterer (Chatterer): An instance of the language model interface for processing text.
74
- """
75
-
76
- chatterer: Chatterer = field(default_factory=Chatterer.openai)
77
- playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
78
- playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
79
- html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
80
- image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
81
- headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
82
- markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.
83
-
84
- You excel at the following tasks:
85
- 1. Identifying the main article content of a webpage.
86
- 2. Filtering out ads, navigation links, and other irrelevant information.
87
- 3. Selecting the line number ranges that correspond to the article content.
88
- 4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.
89
-
90
- However, there are a few rules you must follow:
91
- 1. Do not remove the title of the article, if present.
92
- 2. Do not remove the author's name or the publication date, if present.
93
- 3. Include only images that are part of the article.
94
-
95
- Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.
96
-
97
- Markdown-formatted webpage content is provided below for your reference:
98
- ---
99
- """.strip()
100
- description_format: str = (
101
- "<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
102
- )
103
- image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION
104
-
105
- sync_playwright: Optional[playwright.sync_api.Playwright] = None
106
- sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
107
- async_playwright: Optional[playwright.async_api.Playwright] = None
108
- async_browser_context: Optional[playwright.async_api.BrowserContext] = None
109
-
110
def get_sync_playwright(self) -> playwright.sync_api.Playwright:
    """
    Return the synchronous Playwright driver, starting it on first use.

    The started driver is cached on ``self.sync_playwright`` so later calls
    reuse it instead of starting another one.

    Returns:
        Playwright: The cached (or freshly started) synchronous driver.
    """
    driver = self.sync_playwright
    if driver is not None:
        return driver
    driver = playwright.sync_api.sync_playwright().start()
    self.sync_playwright = driver
    return driver
114
-
115
async def get_async_playwright(self) -> playwright.async_api.Playwright:
    """
    Return the asynchronous Playwright driver, starting it on first use.

    The started driver is cached on ``self.async_playwright`` so later calls
    reuse it instead of starting another one.

    Returns:
        Playwright: The cached (or freshly started) asynchronous driver.
    """
    driver = self.async_playwright
    if driver is not None:
        return driver
    driver = await playwright.async_api.async_playwright().start()
    self.async_playwright = driver
    return driver
119
-
120
def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
    """
    Return the synchronous browser context, creating and caching it on first use.

    When ``playwright_persistency_options`` has a truthy ``user_data_dir``, a
    persistent Chromium context is launched for that directory. Otherwise a
    regular Chromium browser is launched and a new context is opened on it,
    seeded with ``storage_state`` when that option is truthy.

    Returns:
        BrowserContext: The cached (or freshly created) synchronous context.
    """
    if self.sync_browser_context is None:
        profile_dir = self.playwright_persistency_options.get("user_data_dir")
        if profile_dir:
            # A profile directory means the context must be persistent.
            self.sync_browser_context = self.get_sync_playwright().chromium.launch_persistent_context(
                user_data_dir=profile_dir, **self.playwright_launch_options
            )
        else:
            # No profile directory: launch a browser and open a plain context on it.
            launched = self.get_sync_playwright().chromium.launch(**self.playwright_launch_options)
            saved_state = self.playwright_persistency_options.get("storage_state")
            self.sync_browser_context = (
                launched.new_context(storage_state=saved_state) if saved_state else launched.new_context()
            )
    return self.sync_browser_context
140
-
141
async def get_async_browser(self) -> playwright.async_api.BrowserContext:
    """
    Return the asynchronous browser context, creating and caching it on first use.

    When ``playwright_persistency_options`` has a truthy ``user_data_dir``, a
    persistent Chromium context is launched for that directory. Otherwise a
    regular Chromium browser is launched and a new context is opened on it,
    seeded with ``storage_state`` when that option is truthy.

    Returns:
        BrowserContext: The cached (or freshly created) asynchronous context.
    """
    if self.async_browser_context is None:
        profile_dir = self.playwright_persistency_options.get("user_data_dir")
        driver = await self.get_async_playwright()
        if profile_dir:
            # A profile directory means the context must be persistent.
            self.async_browser_context = await driver.chromium.launch_persistent_context(
                user_data_dir=profile_dir, **self.playwright_launch_options
            )
        else:
            # No profile directory: launch a browser and open a plain context on it.
            launched = await driver.chromium.launch(**self.playwright_launch_options)
            saved_state = self.playwright_persistency_options.get("storage_state")
            if saved_state:
                self.async_browser_context = await launched.new_context(storage_state=saved_state)
            else:
                self.async_browser_context = await launched.new_context()
    return self.async_browser_context
161
-
162
def get_page(
    self,
    url: str,
    timeout: float = 10.0,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.sync_api.Page:
    """
    Create a new page and navigate to the given URL synchronously.

    If navigation fails, the freshly opened page is closed before the error
    is re-raised, so a failed call does not leave an orphaned page behind.

    Args:
        url (str): URL to navigate to.
        timeout (float): Maximum navigation time in seconds (converted to
            whole milliseconds for Playwright).
        wait_until (str): Load state to wait for (e.g., "domcontentloaded").
        referer (Optional[str]): Referer URL to set.

    Returns:
        Page: The Playwright page object, already navigated to ``url``.

    Raises:
        Exception: Whatever ``page.goto`` raises is propagated unchanged.
    """
    page = self.get_sync_browser().new_page()
    try:
        page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
    except Exception:
        # The caller never receives this page, so nobody else could close it.
        try:
            page.close()
        except Exception:
            # Best-effort cleanup: the navigation error is the one that must propagate.
            pass
        raise
    return page
184
-
185
async def aget_page(
    self,
    url: str,
    timeout: float = 8,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.async_api.Page:
    """
    Create a new page and navigate to the given URL asynchronously.

    If navigation fails, the freshly opened page is closed before the error
    is re-raised, so a failed call does not leave an orphaned page behind.

    Args:
        url (str): URL to navigate to.
        timeout (float): Maximum navigation time in seconds (converted to
            whole milliseconds for Playwright).
        wait_until (str): Load state to wait for.
        referer (Optional[str]): Referer URL to set.

    Returns:
        AsyncPage: The Playwright asynchronous page object, already navigated to ``url``.

    Raises:
        Exception: Whatever ``page.goto`` raises is propagated unchanged.
    """
    page = await (await self.get_async_browser()).new_page()
    try:
        await page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
    except Exception:
        # The caller never receives this page, so nobody else could close it.
        try:
            await page.close()
        except Exception:
            # Best-effort cleanup: the navigation error is the one that must propagate.
            pass
        raise
    return page
207
-
208
def url_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.

    The page is closed in a ``finally`` block, so it is released even when
    waiting, scrolling, reloading, reading the content, or converting fails
    (unless ``keep_page`` is set).

    Args:
        url (str): URL of the page.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll to the bottom of the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout in seconds.
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page = self.get_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # NOTE(review): indentation was lost in the diff dump; the sleep is kept
            # inside the scroll branch to match "time to wait after scrolling" — confirm.
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        html = page.content()
        return html_to_markdown(html=html, options=self.html_to_markdown_options)
    finally:
        if not keep_page:
            page.close()
249
-
250
async def aurl_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Asynchronously navigate to a URL, wait, scroll or reload if specified,
    and convert the rendered HTML to Markdown.

    The page is closed in a ``finally`` block, so it is released even when
    waiting, scrolling, reloading, reading the content, or converting fails
    (unless ``keep_page`` is set).

    Args:
        url (str): URL of the page.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page = await self.aget_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # NOTE(review): indentation was lost in the diff dump; the sleep is kept
            # inside the scroll branch to match "time to wait after scrolling" — confirm.
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        html = await page.content()
        return html_to_markdown(html=html, options=self.html_to_markdown_options)
    finally:
        if not keep_page:
            await page.close()
292
-
293
def select_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """
    Navigate to a URL, render the page, and extract text from elements matching the given CSS selector.

    The page is closed in a ``finally`` block, so it is released even when
    waiting, scrolling, reloading, or querying fails (unless ``keep_page``
    is set).

    Args:
        url (str): URL of the page.
        css_selector (str): CSS selector to locate elements.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Maximum navigation time (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        List[str]: A list of text contents from the matching elements.
    """
    page = self.get_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # NOTE(review): indentation was lost in the diff dump; the sleep is kept
            # inside the scroll branch to match "time to wait after scrolling" — confirm.
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        return [element.inner_text() for element in page.query_selector_all(css_selector)]
    finally:
        if not keep_page:
            page.close()
336
-
337
async def aselect_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """
    Asynchronously navigate to a URL, render the page, and extract text from elements matching the CSS selector.

    The page is closed in a ``finally`` block, so it is released even when
    waiting, scrolling, reloading, or querying fails (unless ``keep_page``
    is set).

    Args:
        url (str): URL of the page.
        css_selector (str): CSS selector to locate elements.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        List[str]: A list of text contents from the matching elements.
    """
    page = await self.aget_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # NOTE(review): indentation was lost in the diff dump; the sleep is kept
            # inside the scroll branch to match "time to wait after scrolling" — confirm.
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        elements = await page.query_selector_all(css_selector)
        # Texts are awaited one by one, in document order.
        return [await element.inner_text() for element in elements]
    finally:
        if not keep_page:
            await page.close()
383
-
384
def url_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends 1-based line numbers
    (restarting in every chunk), and prompts the LLM to select the important line
    ranges. It then reconstructs the filtered Markdown in original line order.

    Ranges are treated as inclusive on both ends, so a range that ends on the
    final line keeps that line. Line numbers that fall outside the document are
    ignored instead of being remapped to the first or last line.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll down the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: Filtered Markdown containing only the important lines.

    Raises:
        ValueError: If the model returns an entry that is not 'n' or 'start-end'
            with integer parts.
    """
    markdown_content = self.url_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
    lines = markdown_content.split("\n")
    line_count = len(lines)
    important_lines: set[int] = set()

    def _selected_indices(range_str: str, offset: int) -> range:
        """Map one chunk-relative, 1-based, inclusive entry to absolute 0-based indices."""
        if "-" in range_str:
            first, last = map(int, range_str.split("-"))
        else:
            first = last = int(range_str)
        # The stop bound of range() is exclusive, so it is clamped to line_count
        # (not line_count - 1); otherwise the document's last line could never be selected.
        return range(max(first + offset - 1, 0), min(last + offset, line_count))

    if chunk_size is None:
        chunk_size = line_count

    # Process the markdown in chunks; `offset` is the absolute index of the chunk's first line.
    for offset in range(0, line_count, chunk_size):
        chunk_lines = lines[offset : offset + chunk_size]
        # Prepend line numbers to each line.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        # Use the language model synchronously to get the line ranges.
        result: SelectedLineRanges = self.chatterer.generate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            important_lines.update(_selected_indices(range_str, offset))
    # Reconstruct the filtered markdown.
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
456
-
457
async def aurl_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
    to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends 1-based line numbers
    (restarting in every chunk), and prompts the LLM to select the important line
    ranges. It then reconstructs the filtered Markdown in original line order.

    Ranges are treated as inclusive on both ends, so a range that ends on the
    final line keeps that line. Line numbers that fall outside the document are
    ignored instead of being remapped to the first or last line.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: Filtered Markdown containing only the important lines.

    Raises:
        ValueError: If the model returns an entry that is not 'n' or 'start-end'
            with integer parts.
    """
    markdown_content = await self.aurl_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
    lines = markdown_content.split("\n")
    line_count = len(lines)
    important_lines: set[int] = set()

    def _selected_indices(range_str: str, offset: int) -> range:
        """Map one chunk-relative, 1-based, inclusive entry to absolute 0-based indices."""
        if "-" in range_str:
            first, last = map(int, range_str.split("-"))
        else:
            first = last = int(range_str)
        # The stop bound of range() is exclusive, so it is clamped to line_count
        # (not line_count - 1); otherwise the document's last line could never be selected.
        return range(max(first + offset - 1, 0), min(last + offset, line_count))

    if chunk_size is None:
        chunk_size = line_count

    # `offset` is the absolute index of the chunk's first line.
    for offset in range(0, line_count, chunk_size):
        chunk_lines = lines[offset : offset + chunk_size]
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        # Use the asynchronous language model method.
        result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            important_lines.update(_selected_indices(range_str, offset))
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
527
-
528
def describe_images(self, markdown_text: str, referer_url: str) -> str:
    """
    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

    Images are collected with ``get_image_url_and_markdown_links`` (sending the
    bot's headers plus a ``Referer``), each non-``None`` image URL is described by
    ``self.chatterer.describe_image``, and the result is handed to ``replace_images``.
    An image whose description raises is reported with ``print_exc`` and left out.

    Args:
        markdown_text (str): Markdown whose image links should be rewritten.
        referer_url (str): URL sent as the ``Referer`` header when collecting images.

    Returns:
        str: The Markdown text returned by ``replace_images``.
    """
    links_by_url: dict[Optional[str], list[MarkdownLink]] = get_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=self.headers | {"Referer": referer_url},
        config=self.image_processing_config,
    )

    described: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
    for image_url, markdown_links in links_by_url.items():
        if image_url is None:
            # Links without a usable image URL are passed through under the None key.
            described[None] = markdown_links
            continue
        try:
            summary: str = self.chatterer.describe_image(
                image_url=image_url,
                instruction=self.image_description_instruction,
            )
        except Exception:
            # Best effort: report the failure and skip this image.
            print_exc()
        else:
            described[summary] = markdown_links

    return replace_images(
        markdown_text=markdown_text,
        image_description_and_references=described,
        description_format=self.description_format,
    )
558
-
559
- async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
560
- """
561
- Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
562
- """
563
- image_url_and_markdown_links: dict[Optional[str], list[MarkdownLink]] = await aget_image_url_and_markdown_links(
564
- markdown_text=markdown_text,
565
- headers=self.headers | {"Referer": referer_url},
566
- config=self.image_processing_config,
567
- )
568
-
569
- async def dummy() -> None:
570
- pass
571
-
572
- def _handle_exception(e: Optional[str | BaseException]) -> TypeGuard[Optional[str]]:
573
- if isinstance(e, BaseException):
574
- print(format_exception_only(type(e), e))
575
- return False
576
- return True
577
-
578
- coros: list[Awaitable[Optional[str]]] = [
579
- self.chatterer.adescribe_image(image_url=image_url, instruction=self.image_description_instruction)
580
- if image_url is not None
581
- else dummy()
582
- for image_url in image_url_and_markdown_links.keys()
583
- ]
584
-
585
- return replace_images(
586
- markdown_text=markdown_text,
587
- image_description_and_references=ImageDescriptionAndReferences({
588
- image_summary: markdown_links
589
- for markdown_links, image_summary in zip(
590
- image_url_and_markdown_links.values(), await asyncio.gather(*coros, return_exceptions=True)
591
- )
592
- if _handle_exception(image_summary)
593
- }),
594
- description_format=self.description_format,
595
- )
596
-
597
- def __enter__(self) -> Self:
598
- return self
599
-
600
- async def __aenter__(self) -> Self:
601
- return self
602
-
603
def __exit__(
    self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
) -> None:
    """
    Exit the synchronous context.

    Closes the browser context and stops Playwright. Playwright is stopped and
    both cached handles are cleared even if closing the context raises, so a
    failed close cannot leave the driver running or a dead handle cached.

    Args:
        exc_type: Exception type raised inside the ``with`` body, if any (unused).
        exc_val: Exception instance raised inside the ``with`` body, if any (unused).
        exc_tb: Traceback of that exception, if any (unused).

    Returns:
        None, so an exception from the ``with`` body is not suppressed.
    """
    try:
        if self.sync_browser_context is not None:
            self.sync_browser_context.close()
    finally:
        self.sync_browser_context = None
        if self.sync_playwright:
            try:
                self.sync_playwright.stop()
            finally:
                self.sync_playwright = None
617
-
618
async def __aexit__(
    self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
) -> None:
    """
    Asynchronously exit the context.

    Closes the asynchronous browser context and stops Playwright. Playwright is
    stopped and both cached handles are cleared even if closing the context
    raises, so a failed close cannot leave the driver running or a dead handle cached.

    Args:
        exc_type: Exception type raised inside the ``async with`` body, if any (unused).
        exc_val: Exception instance raised inside the ``async with`` body, if any (unused).
        exc_tb: Traceback of that exception, if any (unused).

    Returns:
        None, so an exception from the ``async with`` body is not suppressed.
    """
    try:
        if self.async_browser_context is not None:
            await self.async_browser_context.close()
    finally:
        self.async_browser_context = None
        if self.async_playwright:
            try:
                await self.async_playwright.stop()
            finally:
                self.async_playwright = None
1
+ """
2
+ PlaywrightBot
3
+
4
+ This module provides a single class that uses Playwright to:
5
+ - Fetch and render HTML pages (with JavaScript execution),
6
+ - Optionally scroll down or reload pages,
7
+ - Convert rendered HTML into Markdown,
8
+ - Extract specific elements using CSS selectors,
9
+ - Filter key information from a page via integration with a language model (Chatterer).
10
+
11
+ Both synchronous and asynchronous methods are available in this unified class.
12
+ Use the synchronous methods (without the "a" prefix) in a normal context manager,
13
+ or use the asynchronous methods (prefixed with "a") within an async context manager.
14
+ """
15
+
16
+ import asyncio
17
+ from dataclasses import dataclass, field
18
+ from traceback import format_exception_only, print_exc
19
+ from types import TracebackType
20
+ from typing import (
21
+ Awaitable,
22
+ Optional,
23
+ Self,
24
+ Type,
25
+ TypeGuard,
26
+ Union,
27
+ )
28
+
29
+ import playwright.async_api
30
+ import playwright.sync_api
31
+
32
+ from ...language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
33
+ from ...utils.image import Base64Image, get_default_image_processing_config
34
+ from ..convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
35
+ from .utils import (
36
+ DEFAULT_UA,
37
+ ImageDescriptionAndReferences,
38
+ ImageProcessingConfig,
39
+ MarkdownLink,
40
+ PlaywrightLaunchOptions,
41
+ PlaywrightPersistencyOptions,
42
+ SelectedLineRanges,
43
+ WaitUntil,
44
+ aget_image_url_and_markdown_links,
45
+ get_default_playwright_launch_options,
46
+ get_image_url_and_markdown_links,
47
+ replace_images,
48
+ )
49
+
50
+
51
+ @dataclass
52
+ class PlayWrightBot:
53
+ """
54
+ A unified bot that leverages Playwright to render web pages, convert them to Markdown,
55
+ extract elements, and filter key information using a language model.
56
+
57
+ This class exposes both synchronous and asynchronous methods.
58
+
59
+ Synchronous usage:
60
+ with PlayWrightBot() as bot:
61
+ md = bot.url_to_md("https://example.com")
62
+ headings = bot.select_and_extract("https://example.com", "h2")
63
+ filtered_md = bot.url_to_md_with_llm("https://example.com")
64
+
65
+ Asynchronous usage:
66
+ async with PlayWrightBot() as bot:
67
+ md = await bot.aurl_to_md("https://example.com")
68
+ headings = await bot.aselect_and_extract("https://example.com", "h2")
69
+ filtered_md = await bot.aurl_to_md_with_llm("https://example.com")
70
+
71
+ Attributes:
72
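+ playwright_launch_options (PlaywrightLaunchOptions): Keyword options forwarded to the Chromium launch call; there is no separate `headless` attribute, so headless mode is presumably configured here.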
73
+ chatterer (Chatterer): An instance of the language model interface for processing text.
74
+ """
75
+
76
+ chatterer: Chatterer = field(default_factory=Chatterer.openai)
77
+ playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
78
+ playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
79
+ html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
80
+ image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
81
+ headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
82
+ markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.
83
+
84
+ You excel at the following tasks:
85
+ 1. Identifying the main article content of a webpage.
86
+ 2. Filtering out ads, navigation links, and other irrelevant information.
87
+ 3. Selecting the line number ranges that correspond to the article content.
88
+ 4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.
89
+
90
+ However, there are a few rules you must follow:
91
+ 1. Do not remove the title of the article, if present.
92
+ 2. Do not remove the author's name or the publication date, if present.
93
+ 3. Include only images that are part of the article.
94
+
95
+ Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.
96
+
97
+ Markdown-formatted webpage content is provided below for your reference:
98
+ ---
99
+ """.strip()
100
+ description_format: str = (
101
+ "<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
102
+ )
103
+ image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION
104
+
105
+ sync_playwright: Optional[playwright.sync_api.Playwright] = None
106
+ sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
107
+ async_playwright: Optional[playwright.async_api.Playwright] = None
108
+ async_browser_context: Optional[playwright.async_api.BrowserContext] = None
109
+
110
def get_sync_playwright(self) -> playwright.sync_api.Playwright:
    """
    Return the synchronous Playwright driver, starting it on first use.

    The started driver is cached on ``self.sync_playwright`` so later calls
    reuse it instead of starting another one.

    Returns:
        Playwright: The cached (or freshly started) synchronous driver.
    """
    driver = self.sync_playwright
    if driver is not None:
        return driver
    driver = playwright.sync_api.sync_playwright().start()
    self.sync_playwright = driver
    return driver
114
+
115
async def get_async_playwright(self) -> playwright.async_api.Playwright:
    """
    Return the asynchronous Playwright driver, starting it on first use.

    The started driver is cached on ``self.async_playwright`` so later calls
    reuse it instead of starting another one.

    Returns:
        Playwright: The cached (or freshly started) asynchronous driver.
    """
    driver = self.async_playwright
    if driver is not None:
        return driver
    driver = await playwright.async_api.async_playwright().start()
    self.async_playwright = driver
    return driver
119
+
120
def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
    """
    Return the synchronous browser context, creating and caching it on first use.

    When ``playwright_persistency_options`` has a truthy ``user_data_dir``, a
    persistent Chromium context is launched for that directory. Otherwise a
    regular Chromium browser is launched and a new context is opened on it,
    seeded with ``storage_state`` when that option is truthy.

    Returns:
        BrowserContext: The cached (or freshly created) synchronous context.
    """
    if self.sync_browser_context is None:
        profile_dir = self.playwright_persistency_options.get("user_data_dir")
        if profile_dir:
            # A profile directory means the context must be persistent.
            self.sync_browser_context = self.get_sync_playwright().chromium.launch_persistent_context(
                user_data_dir=profile_dir, **self.playwright_launch_options
            )
        else:
            # No profile directory: launch a browser and open a plain context on it.
            launched = self.get_sync_playwright().chromium.launch(**self.playwright_launch_options)
            saved_state = self.playwright_persistency_options.get("storage_state")
            self.sync_browser_context = (
                launched.new_context(storage_state=saved_state) if saved_state else launched.new_context()
            )
    return self.sync_browser_context
140
+
141
async def get_async_browser(self) -> playwright.async_api.BrowserContext:
    """
    Return the asynchronous browser context, creating and caching it on first use.

    When ``playwright_persistency_options`` has a truthy ``user_data_dir``, a
    persistent Chromium context is launched for that directory. Otherwise a
    regular Chromium browser is launched and a new context is opened on it,
    seeded with ``storage_state`` when that option is truthy.

    Returns:
        BrowserContext: The cached (or freshly created) asynchronous context.
    """
    if self.async_browser_context is None:
        profile_dir = self.playwright_persistency_options.get("user_data_dir")
        driver = await self.get_async_playwright()
        if profile_dir:
            # A profile directory means the context must be persistent.
            self.async_browser_context = await driver.chromium.launch_persistent_context(
                user_data_dir=profile_dir, **self.playwright_launch_options
            )
        else:
            # No profile directory: launch a browser and open a plain context on it.
            launched = await driver.chromium.launch(**self.playwright_launch_options)
            saved_state = self.playwright_persistency_options.get("storage_state")
            if saved_state:
                self.async_browser_context = await launched.new_context(storage_state=saved_state)
            else:
                self.async_browser_context = await launched.new_context()
    return self.async_browser_context
161
+
162
def get_page(
    self,
    url: str,
    timeout: float = 10.0,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.sync_api.Page:
    """
    Create a new page and navigate to the given URL synchronously.

    If navigation fails, the freshly created page is closed before the error
    is re-raised, because the caller never receives a handle to close it.

    Args:
        url (str): URL to navigate to.
        timeout (float): Maximum navigation time in seconds (converted to whole milliseconds).
        wait_until (str): Load state to wait for (e.g., "domcontentloaded").
        referer (Optional[str]): Referer URL to set.

    Returns:
        Page: The Playwright page object.

    Raises:
        Exception: Whatever ``page.goto`` raises; it is propagated unchanged.
    """
    page = self.get_sync_browser().new_page()
    try:
        page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
    except Exception:
        try:
            page.close()
        except Exception:
            # Keep the navigation error as the one the caller sees.
            pass
        raise
    return page
184
+
185
async def aget_page(
    self,
    url: str,
    timeout: float = 8,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.async_api.Page:
    """
    Create a new page and navigate to the given URL asynchronously.

    If navigation fails, the freshly created page is closed before the error
    is re-raised, because the caller never receives a handle to close it.

    Args:
        url (str): URL to navigate to.
        timeout (float): Maximum navigation time in seconds (converted to whole milliseconds).
        wait_until (str): Load state to wait for.
        referer (Optional[str]): Referer URL to set.

    Returns:
        AsyncPage: The Playwright asynchronous page object.

    Raises:
        Exception: Whatever ``page.goto`` raises; it is propagated unchanged.
    """
    page = await (await self.get_async_browser()).new_page()
    try:
        await page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
    except Exception:
        try:
            await page.close()
        except Exception:
            # Keep the navigation error as the one the caller sees.
            pass
        raise
    return page
207
+
208
def url_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.

    Unless ``keep_page`` is True, the page is closed even when one of the
    intermediate steps raises, so a failed request does not leave a tab open.

    Args:
        url (str): URL of the page.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll to the bottom of the page.
        sleep (int): Time to wait after scrolling (in seconds); only applied when scrolldown is True.
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout in seconds.
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page = self.get_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        html = page.content()
        return html_to_markdown(html=html, options=self.html_to_markdown_options)
    finally:
        if not keep_page:
            page.close()
249
+
250
async def aurl_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Asynchronously navigate to a URL, wait, scroll or reload if specified,
    and convert the rendered HTML to Markdown.

    Unless ``keep_page`` is True, the page is closed even when one of the
    intermediate steps raises, so a failed request does not leave a tab open.

    Args:
        url (str): URL of the page.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds); only applied when scrolldown is True.
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page = await self.aget_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        html = await page.content()
        return html_to_markdown(html=html, options=self.html_to_markdown_options)
    finally:
        if not keep_page:
            await page.close()
292
+
293
def select_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """
    Navigate to a URL, render the page, and extract text from elements matching the given CSS selector.

    Unless ``keep_page`` is True, the page is closed even when one of the
    intermediate steps raises, so a failed request does not leave a tab open.

    Args:
        url (str): URL of the page.
        css_selector (str): CSS selector to locate elements.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds); only applied when scrolldown is True.
        reload (bool): If True, reload the page.
        timeout (float | int): Maximum navigation time (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        List[str]: A list of text contents from the matching elements.
    """
    page = self.get_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        return [element.inner_text() for element in page.query_selector_all(css_selector)]
    finally:
        if not keep_page:
            page.close()
336
+
337
async def aselect_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """
    Asynchronously navigate to a URL, render the page, and extract text from elements matching the CSS selector.

    Unless ``keep_page`` is True, the page is closed even when one of the
    intermediate steps raises, so a failed request does not leave a tab open.

    Args:
        url (str): URL of the page.
        css_selector (str): CSS selector to locate elements.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds); only applied when scrolldown is True.
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        List[str]: A list of text contents from the matching elements.
    """
    page = await self.aget_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        elements = await page.query_selector_all(css_selector)
        # Elements are read one at a time, in document order, as before.
        return [await element.inner_text() for element in elements]
    finally:
        if not keep_page:
            await page.close()
383
+
384
def url_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
    describe_images: bool = True,
    filter: bool = True,
) -> str:
    """
    Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends chunk-local line numbers
    (starting at 1), and prompts the LLM to select the important line ranges. It then
    reconstructs the filtered Markdown. A range "a-b" is inclusive on both ends.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll down the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.
        describe_images (bool): If True, describe images in the Markdown text.
        filter (bool): If True, filter the important lines using the language model.

    Returns:
        str: Filtered Markdown containing only the important lines, or the
        unfiltered Markdown when ``filter`` is False.
    """
    markdown_content = self.url_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    if describe_images:
        markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
    if not filter:
        return markdown_content

    lines = markdown_content.split("\n")
    line_length = len(lines)
    important_lines: set[int] = set()

    def _clamp(value: int, upper: int) -> int:
        """Clamp value into the closed interval [0, upper]."""
        return min(max(value, 0), upper)

    if chunk_size is None:
        chunk_size = line_length

    # Process the markdown in chunks; `offset` is the 0-based index of the chunk's first line.
    for offset in range(0, line_length, chunk_size):
        chunk_lines = lines[offset : offset + chunk_size]
        # Prepend 1-based, chunk-local line numbers to each line.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        # Use the language model synchronously to get the line ranges.
        result: SelectedLineRanges = self.chatterer.generate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # The stop bound is exclusive, so it is clamped to line_length rather
                # than line_length - 1; otherwise the final line could never be selected.
                first = _clamp(start + offset - 1, line_length)
                stop = _clamp(end + offset, line_length)
                important_lines.update(range(first, stop))
            else:
                important_lines.add(_clamp(int(range_str) + offset - 1, line_length - 1))
    # Reconstruct the filtered markdown in document order.
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
463
+
464
async def aurl_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
    describe_images: bool = True,
    filter: bool = True,
) -> str:
    """
    Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
    to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends chunk-local line numbers
    (starting at 1), and prompts the LLM to select the important line ranges. It then
    reconstructs the filtered Markdown. A range "a-b" is inclusive on both ends.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.
        describe_images (bool): If True, describe images in the Markdown text.
        filter (bool): If True, filter the important lines using the language model.

    Returns:
        str: Filtered Markdown containing only the important lines, or the
        unfiltered Markdown when ``filter`` is False.
    """
    markdown_content = await self.aurl_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    if describe_images:
        markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
    if not filter:
        return markdown_content

    lines = markdown_content.split("\n")
    line_length = len(lines)
    important_lines: set[int] = set()

    def _clamp(value: int, upper: int) -> int:
        """Clamp value into the closed interval [0, upper]."""
        return min(max(value, 0), upper)

    if chunk_size is None:
        chunk_size = line_length

    # Process the markdown in chunks; `offset` is the 0-based index of the chunk's first line.
    for offset in range(0, line_length, chunk_size):
        chunk_lines = lines[offset : offset + chunk_size]
        # Prepend 1-based, chunk-local line numbers to each line.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        # Use the asynchronous language model method.
        result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # The stop bound is exclusive, so it is clamped to line_length rather
                # than line_length - 1; otherwise the final line could never be selected.
                first = _clamp(start + offset - 1, line_length)
                stop = _clamp(end + offset, line_length)
                important_lines.update(range(first, stop))
            else:
                important_lines.add(_clamp(int(range_str) + offset - 1, line_length - 1))
    # Reconstruct the filtered markdown in document order.
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
541
+
542
def describe_images(self, markdown_text: str, referer_url: str) -> str:
    """
    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

    Images whose description attempt raises are skipped after the traceback is
    printed; the entry keyed by ``None`` is passed through without a description.

    Args:
        markdown_text (str): Markdown text whose image links are processed.
        referer_url (str): Value sent as the "Referer" header when collecting images.

    Returns:
        str: The result of ``replace_images`` for the collected descriptions.
    """
    request_headers = self.headers | {"Referer": referer_url}
    links_by_image: dict[Optional[Base64Image], list[MarkdownLink]] = get_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=request_headers,
        config=self.image_processing_config,
    )

    described: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
    for image, links in links_by_image.items():
        if image is None:
            described[None] = links
            continue
        try:
            summary: str = self.chatterer.describe_image(
                image_url=image.data_uri,
                instruction=self.image_description_instruction,
            )
        except Exception:
            # Best effort: report the failure and leave this image out.
            print_exc()
        else:
            described[summary] = links

    return replace_images(
        markdown_text=markdown_text,
        image_description_and_references=described,
        description_format=self.description_format,
    )
574
+
575
async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
    """
    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

    All descriptions are requested concurrently. An image whose description
    attempt raises is skipped after a one-line "ExceptionType: message" report
    is printed; the entry keyed by ``None`` is passed through without a description.

    Args:
        markdown_text (str): Markdown text whose image links are processed.
        referer_url (str): Value sent as the "Referer" header when collecting images.

    Returns:
        str: The result of ``replace_images`` for the collected descriptions.
    """
    links_by_image: dict[
        Optional[Base64Image], list[MarkdownLink]
    ] = await aget_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=self.headers | {"Referer": referer_url},
        config=self.image_processing_config,
    )

    async def _no_description() -> None:
        """Placeholder awaitable so results stay aligned with the dict order."""
        return None

    pending: list[Awaitable[Optional[str]]] = [
        self.chatterer.adescribe_image(image_url=image.data_uri, instruction=self.image_description_instruction)
        if image is not None
        else _no_description()
        for image in links_by_image
    ]
    # return_exceptions=True keeps one failing image from aborting the rest.
    outcomes = await asyncio.gather(*pending, return_exceptions=True)

    described: dict[Optional[str], list[MarkdownLink]] = {}
    for links, outcome in zip(links_by_image.values(), outcomes):
        if isinstance(outcome, BaseException):
            # format_exception_only returns a list of lines; join them so the
            # message is printed instead of a list repr.
            print("".join(format_exception_only(type(outcome), outcome)), end="")
            continue
        described[outcome] = links

    return replace_images(
        markdown_text=markdown_text,
        image_description_and_references=ImageDescriptionAndReferences(described),
        description_format=self.description_format,
    )
614
+
615
def __enter__(self) -> Self:
    """
    Enter the synchronous context.

    Returns:
        Self: This instance, unchanged; nothing is launched here.
    """
    return self
617
+
618
async def __aenter__(self) -> Self:
    """
    Enter the asynchronous context.

    Returns:
        Self: This instance, unchanged; nothing is launched here.
    """
    return self
620
+
621
def __exit__(
    self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
) -> None:
    """
    Exit the synchronous context.

    Closes the browser context and stops Playwright. Playwright is stopped and
    both cached handles are reset even if closing the browser context raises;
    the error is still propagated.

    Args:
        exc_type: Exception type raised inside the ``with`` body, if any (unused).
        exc_val: Exception instance raised inside the ``with`` body, if any (unused).
        exc_tb: Traceback of that exception, if any (unused).
    """
    try:
        if self.sync_browser_context is not None:
            try:
                self.sync_browser_context.close()
            finally:
                self.sync_browser_context = None
    finally:
        # Always reached, so a failing close() cannot leave the driver running.
        if self.sync_playwright is not None:
            try:
                self.sync_playwright.stop()
            finally:
                self.sync_playwright = None
635
+
636
async def __aexit__(
    self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
) -> None:
    """
    Asynchronously exit the context.

    Closes the asynchronous browser context and stops Playwright. Playwright is
    stopped and both cached handles are reset even if closing the browser
    context raises; the error is still propagated.

    Args:
        exc_type: Exception type raised inside the ``async with`` body, if any (unused).
        exc_val: Exception instance raised inside the ``async with`` body, if any (unused).
        exc_tb: Traceback of that exception, if any (unused).
    """
    try:
        if self.async_browser_context is not None:
            try:
                await self.async_browser_context.close()
            finally:
                self.async_browser_context = None
    finally:
        # Always reached, so a failing close() cannot leave the driver running.
        if self.async_playwright is not None:
            try:
                await self.async_playwright.stop()
            finally:
                self.async_playwright = None