chatterer 0.1.10__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +60 -60
- chatterer/language_model.py +577 -580
- chatterer/messages.py +9 -9
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +28 -28
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_to_text.py +463 -463
- chatterer/tools/webpage_to_markdown/__init__.py +4 -4
- chatterer/tools/webpage_to_markdown/playwright_bot.py +649 -649
- chatterer/tools/webpage_to_markdown/utils.py +334 -329
- chatterer/tools/youtube.py +146 -132
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/code_agent.py +138 -138
- chatterer/utils/image.py +291 -288
- {chatterer-0.1.10.dist-info → chatterer-0.1.12.dist-info}/METADATA +170 -170
- chatterer-0.1.12.dist-info/RECORD +27 -0
- {chatterer-0.1.10.dist-info → chatterer-0.1.12.dist-info}/WHEEL +1 -1
- chatterer-0.1.10.dist-info/RECORD +0 -27
- {chatterer-0.1.10.dist-info → chatterer-0.1.12.dist-info}/top_level.txt +0 -0
@@ -1,649 +1,649 @@
|
|
1
|
-
"""
|
2
|
-
PlaywrightBot
|
3
|
-
|
4
|
-
This module provides a single class that uses Playwright to:
|
5
|
-
- Fetch and render HTML pages (with JavaScript execution),
|
6
|
-
- Optionally scroll down or reload pages,
|
7
|
-
- Convert rendered HTML into Markdown,
|
8
|
-
- Extract specific elements using CSS selectors,
|
9
|
-
- Filter key information from a page via integration with a language model (Chatterer).
|
10
|
-
|
11
|
-
Both synchronous and asynchronous methods are available in this unified class.
|
12
|
-
Use the synchronous methods (without the "a" prefix) in a normal context manager,
|
13
|
-
or use the asynchronous methods (prefixed with "a") within an async context manager.
|
14
|
-
"""
|
15
|
-
|
16
|
-
import asyncio
|
17
|
-
from dataclasses import dataclass, field
|
18
|
-
from traceback import format_exception_only, print_exc
|
19
|
-
from types import TracebackType
|
20
|
-
from typing import (
|
21
|
-
Awaitable,
|
22
|
-
Optional,
|
23
|
-
Self,
|
24
|
-
Type,
|
25
|
-
TypeGuard,
|
26
|
-
Union,
|
27
|
-
)
|
28
|
-
|
29
|
-
import playwright.async_api
|
30
|
-
import playwright.sync_api
|
31
|
-
|
32
|
-
from ...language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
33
|
-
from ...utils.image import Base64Image, get_default_image_processing_config
|
34
|
-
from ..convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
|
35
|
-
from .utils import (
|
36
|
-
DEFAULT_UA,
|
37
|
-
ImageDescriptionAndReferences,
|
38
|
-
ImageProcessingConfig,
|
39
|
-
MarkdownLink,
|
40
|
-
PlaywrightLaunchOptions,
|
41
|
-
PlaywrightPersistencyOptions,
|
42
|
-
SelectedLineRanges,
|
43
|
-
WaitUntil,
|
44
|
-
aget_image_url_and_markdown_links,
|
45
|
-
get_default_playwright_launch_options,
|
46
|
-
get_image_url_and_markdown_links,
|
47
|
-
replace_images,
|
48
|
-
)
|
49
|
-
|
50
|
-
|
51
|
-
@dataclass
|
52
|
-
class PlayWrightBot:
|
53
|
-
"""
|
54
|
-
A unified bot that leverages Playwright to render web pages, convert them to Markdown,
|
55
|
-
extract elements, and filter key information using a language model.
|
56
|
-
|
57
|
-
This class exposes both synchronous and asynchronous methods.
|
58
|
-
|
59
|
-
Synchronous usage:
|
60
|
-
with PlayWrightBot() as bot:
|
61
|
-
md = bot.url_to_md("https://example.com")
|
62
|
-
headings = bot.select_and_extract("https://example.com", "h2")
|
63
|
-
filtered_md = bot.url_to_md_with_llm("https://example.com")
|
64
|
-
|
65
|
-
Asynchronous usage:
|
66
|
-
async with PlayWrightBot() as bot:
|
67
|
-
md = await bot.aurl_to_md("https://example.com")
|
68
|
-
headings = await bot.aselect_and_extract("https://example.com", "h2")
|
69
|
-
filtered_md = await bot.aurl_to_md_with_llm("https://example.com")
|
70
|
-
|
71
|
-
Attributes:
|
72
|
-
playwright_launch_options (PlaywrightLaunchOptions): Keyword options forwarded to the Chromium launch call (presumably where headless mode is configured).
|
73
|
-
chatterer (Chatterer): An instance of the language model interface for processing text.
|
74
|
-
"""
|
75
|
-
|
76
|
-
chatterer: Chatterer = field(default_factory=Chatterer.openai)
|
77
|
-
playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
|
78
|
-
playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
|
79
|
-
html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
|
80
|
-
image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
|
81
|
-
headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
|
82
|
-
markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.
|
83
|
-
|
84
|
-
You excel at the following tasks:
|
85
|
-
1. Identifying the main article content of a webpage.
|
86
|
-
2. Filtering out ads, navigation links, and other irrelevant information.
|
87
|
-
3. Selecting the line number ranges that correspond to the article content.
|
88
|
-
4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.
|
89
|
-
|
90
|
-
However, there are a few rules you must follow:
|
91
|
-
1. Do not remove the title of the article, if present.
|
92
|
-
2. Do not remove the author's name or the publication date, if present.
|
93
|
-
3. Include only images that are part of the article.
|
94
|
-
|
95
|
-
Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.
|
96
|
-
|
97
|
-
Markdown-formatted webpage content is provided below for your reference:
|
98
|
-
---
|
99
|
-
""".strip()
|
100
|
-
description_format: str = (
|
101
|
-
"<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
|
102
|
-
)
|
103
|
-
image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION
|
104
|
-
|
105
|
-
sync_playwright: Optional[playwright.sync_api.Playwright] = None
|
106
|
-
sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
|
107
|
-
async_playwright: Optional[playwright.async_api.Playwright] = None
|
108
|
-
async_browser_context: Optional[playwright.async_api.BrowserContext] = None
|
109
|
-
|
110
|
-
def get_sync_playwright(self) -> playwright.sync_api.Playwright:
    """
    Return the synchronous Playwright driver, starting it on first use.

    The started driver is cached on ``self.sync_playwright`` so that later
    calls reuse the same instance.
    """
    driver = self.sync_playwright
    if driver is None:
        driver = playwright.sync_api.sync_playwright().start()
        self.sync_playwright = driver
    return driver
|
114
|
-
|
115
|
-
async def get_async_playwright(self) -> playwright.async_api.Playwright:
    """
    Return the asynchronous Playwright driver, starting it on first use.

    The started driver is cached on ``self.async_playwright`` so that later
    calls reuse the same instance.
    """
    driver = self.async_playwright
    if driver is None:
        driver = await playwright.async_api.async_playwright().start()
        self.async_playwright = driver
    return driver
|
119
|
-
|
120
|
-
def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
    """
    Return the cached synchronous browser context, creating it if needed.

    When ``playwright_persistency_options`` carries a truthy ``user_data_dir``
    a persistent Chromium context is launched with it. Otherwise a regular
    Chromium browser is launched and a fresh context is opened, seeded with
    ``storage_state`` when that option is truthy.
    """
    cached = self.sync_browser_context
    if cached is not None:
        return cached

    profile_dir = self.playwright_persistency_options.get("user_data_dir")
    if profile_dir:
        # A profile directory was configured: use a persistent context.
        context = self.get_sync_playwright().chromium.launch_persistent_context(
            user_data_dir=profile_dir, **self.playwright_launch_options
        )
    else:
        # No profile directory: launch a browser and open a new context.
        browser = self.get_sync_playwright().chromium.launch(**self.playwright_launch_options)
        saved_state = self.playwright_persistency_options.get("storage_state")
        context = browser.new_context(storage_state=saved_state) if saved_state else browser.new_context()

    self.sync_browser_context = context
    return context
|
140
|
-
|
141
|
-
async def get_async_browser(self) -> playwright.async_api.BrowserContext:
    """
    Return the cached asynchronous browser context, creating it if needed.

    When ``playwright_persistency_options`` carries a truthy ``user_data_dir``
    a persistent Chromium context is launched with it. Otherwise a regular
    Chromium browser is launched and a fresh context is opened, seeded with
    ``storage_state`` when that option is truthy.
    """
    cached = self.async_browser_context
    if cached is not None:
        return cached

    profile_dir = self.playwright_persistency_options.get("user_data_dir")
    if profile_dir:
        # A profile directory was configured: use a persistent context.
        driver = await self.get_async_playwright()
        context = await driver.chromium.launch_persistent_context(
            user_data_dir=profile_dir, **self.playwright_launch_options
        )
    else:
        # No profile directory: launch a browser and open a new context.
        driver = await self.get_async_playwright()
        browser = await driver.chromium.launch(**self.playwright_launch_options)
        saved_state = self.playwright_persistency_options.get("storage_state")
        if saved_state:
            context = await browser.new_context(storage_state=saved_state)
        else:
            context = await browser.new_context()

    self.async_browser_context = context
    return context
|
161
|
-
|
162
|
-
def get_page(
    self,
    url: str,
    timeout: float = 10.0,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.sync_api.Page:
    """
    Create a new page and navigate to the given URL synchronously.

    Args:
        url (str): URL to navigate to.
        timeout (float): Maximum navigation time in seconds.
        wait_until (str): Load state to wait for (e.g., "domcontentloaded").
        referer (Optional[str]): Referer URL to set.

    Returns:
        Page: The Playwright page object, already navigated to ``url``.

    Raises:
        Whatever ``page.goto`` raises; the freshly opened page is closed
        before the exception propagates.
    """
    page = self.get_sync_browser().new_page()
    try:
        # Playwright expects the timeout in milliseconds.
        page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
    except BaseException:
        # The caller never receives the page on failure, so close it here
        # instead of leaving it open in the browser context.
        page.close()
        raise
    return page
|
184
|
-
|
185
|
-
async def aget_page(
    self,
    url: str,
    timeout: float = 8,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.async_api.Page:
    """
    Create a new page and navigate to the given URL asynchronously.

    Args:
        url (str): URL to navigate to.
        timeout (float): Maximum navigation time in seconds.
        wait_until (str): Load state to wait for.
        referer (Optional[str]): Referer URL to set.

    Returns:
        AsyncPage: The Playwright asynchronous page object, already navigated to ``url``.

    Raises:
        Whatever ``page.goto`` raises; the freshly opened page is closed
        before the exception propagates.
    """
    context = await self.get_async_browser()
    page = await context.new_page()
    try:
        # Playwright expects the timeout in milliseconds.
        await page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
    except BaseException:
        # The caller never receives the page on failure, so close it here
        # instead of leaving it open in the browser context.
        await page.close()
        raise
    return page
|
207
|
-
|
208
|
-
def url_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.

    Args:
        url (str): URL of the page.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll to the bottom of the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout in seconds.
        keep_page (bool): If True, do not close the page after processing;
            it stays open in the browser context.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page = self.get_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        html = page.content()
    finally:
        # Close the page even when a step above raises, so failed fetches
        # do not accumulate open pages in the shared browser context.
        if not keep_page:
            page.close()
    return html_to_markdown(html=html, options=self.html_to_markdown_options)
|
249
|
-
|
250
|
-
async def aurl_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Asynchronously navigate to a URL, wait, scroll or reload if specified,
    and convert the rendered HTML to Markdown.

    Args:
        url (str): URL of the page.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing;
            it stays open in the browser context.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page = await self.aget_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        html = await page.content()
    finally:
        # Close the page even when a step above raises, so failed fetches
        # do not accumulate open pages in the shared browser context.
        if not keep_page:
            await page.close()
    return html_to_markdown(html=html, options=self.html_to_markdown_options)
|
292
|
-
|
293
|
-
def select_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """
    Navigate to a URL, render the page, and extract text from elements matching the given CSS selector.

    Args:
        url (str): URL of the page.
        css_selector (str): CSS selector to locate elements.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Maximum navigation time (in seconds).
        keep_page (bool): If True, do not close the page after processing;
            it stays open in the browser context.
        referer (Optional[str]): Referer URL to set.

    Returns:
        List[str]: A list of text contents from the matching elements.
    """
    page = self.get_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        return [element.inner_text() for element in page.query_selector_all(css_selector)]
    finally:
        # Close the page even when a step above raises, so failed fetches
        # do not accumulate open pages in the shared browser context.
        if not keep_page:
            page.close()
|
336
|
-
|
337
|
-
async def aselect_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """
    Asynchronously navigate to a URL, render the page, and extract text from elements matching the CSS selector.

    Args:
        url (str): URL of the page.
        css_selector (str): CSS selector to locate elements.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing;
            it stays open in the browser context.
        referer (Optional[str]): Referer URL to set.

    Returns:
        List[str]: A list of text contents from the matching elements.
    """
    page = await self.aget_page(url, timeout=timeout, referer=referer)
    try:
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        elements = await page.query_selector_all(css_selector)
        # Elements are read one after another, in document order.
        return [await element.inner_text() for element in elements]
    finally:
        # Close the page even when a step above raises, so failed fetches
        # do not accumulate open pages in the shared browser context.
        if not keep_page:
            await page.close()
|
383
|
-
|
384
|
-
def url_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
    describe_images: bool = True,
    filter: bool = True,
) -> str:
    """
    Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
    to select the important line ranges. It then reconstructs the filtered Markdown.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll down the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.
        describe_images (bool): If True, describe images in the Markdown text.
        filter (bool): If True, filter the important lines using the language model.

    Returns:
        str: Filtered Markdown containing only the important lines, or the
        unfiltered Markdown when ``filter`` is False.
    """
    markdown_content = self.url_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    if describe_images:
        markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
    if not filter:
        return markdown_content

    lines = markdown_content.split("\n")
    line_length = len(lines)
    important_lines: set[int] = set()

    def _into_safe_range(value: int) -> int:
        """Clamp a 0-based line index into the valid index range."""
        return min(max(value, 0), line_length - 1)

    if chunk_size is None:
        chunk_size = line_length

    # Process the markdown in chunks; ``i`` is the 0-based index of the chunk's first line.
    for i in range(0, line_length, chunk_size):
        chunk_lines = lines[i : i + chunk_size]
        # Line numbers shown to the model restart at 1 in every chunk.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        result: SelectedLineRanges = self.chatterer.generate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # Ranges are inclusive and 1-based within the chunk. Clamp the
                # inclusive last index, then add one for range(); clamping the
                # exclusive bound instead would drop the document's final line.
                first_index = _into_safe_range(start + i - 1)
                last_index = _into_safe_range(end + i - 1)
                important_lines.update(range(first_index, last_index + 1))
            else:
                important_lines.add(_into_safe_range(int(range_str) + i - 1))
    # Reconstruct the filtered markdown in original line order.
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
|
463
|
-
|
464
|
-
async def aurl_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
    describe_images: bool = True,
    filter: bool = True,
) -> str:
    """
    Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
    to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
    to select the important line ranges. It then reconstructs the filtered Markdown.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.
        describe_images (bool): If True, describe images in the Markdown text.
        filter (bool): If True, filter the important lines using the language model.

    Returns:
        str: Filtered Markdown containing only the important lines, or the
        unfiltered Markdown when ``filter`` is False.
    """
    markdown_content = await self.aurl_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    if describe_images:
        markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
    if not filter:
        return markdown_content

    lines = markdown_content.split("\n")
    line_length = len(lines)
    important_lines: set[int] = set()

    def _into_safe_range(value: int) -> int:
        """Clamp a 0-based line index into the valid index range."""
        return min(max(value, 0), line_length - 1)

    if chunk_size is None:
        chunk_size = line_length

    # Process the markdown in chunks; ``i`` is the 0-based index of the chunk's first line.
    for i in range(0, line_length, chunk_size):
        chunk_lines = lines[i : i + chunk_size]
        # Line numbers shown to the model restart at 1 in every chunk.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # Ranges are inclusive and 1-based within the chunk. Clamp the
                # inclusive last index, then add one for range(); clamping the
                # exclusive bound instead would drop the document's final line.
                first_index = _into_safe_range(start + i - 1)
                last_index = _into_safe_range(end + i - 1)
                important_lines.update(range(first_index, last_index + 1))
            else:
                important_lines.add(_into_safe_range(int(range_str) + i - 1))
    # Reconstruct the filtered markdown in original line order.
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
|
541
|
-
|
542
|
-
def describe_images(self, markdown_text: str, referer_url: str) -> str:
    """
    Describe the images referenced in Markdown text with the language model
    and rewrite the text through ``replace_images``.

    Images are collected with ``self.headers`` plus a ``Referer`` header set
    to ``referer_url``. An image whose description call raises is reported
    via ``print_exc`` and left out of the replacement mapping.
    """
    links_by_image: dict[Optional[Base64Image], list[MarkdownLink]] = get_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=self.headers | {"Referer": referer_url},
        config=self.image_processing_config,
    )

    links_by_summary: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
    for image, links in links_by_image.items():
        if image is None:
            # Links with no image are kept under the None key.
            links_by_summary[None] = links
            continue
        try:
            summary: str = self.chatterer.describe_image(
                image_url=image.data_uri,
                instruction=self.image_description_instruction,
            )
        except Exception:
            # Best effort: report the failure and move on to the next image.
            print_exc()
        else:
            links_by_summary[summary] = links

    return replace_images(
        markdown_text=markdown_text,
        image_description_and_references=links_by_summary,
        description_format=self.description_format,
    )
|
574
|
-
|
575
|
-
async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
    """
    Asynchronously describe the images referenced in Markdown text with the
    language model and rewrite the text through ``replace_images``.

    Images are collected with ``self.headers`` plus a ``Referer`` header set
    to ``referer_url``. All description requests run concurrently; an image
    whose request raises is reported on stdout and left out of the
    replacement mapping.
    """
    image_url_and_markdown_links: dict[
        Optional[Base64Image], list[MarkdownLink]
    ] = await aget_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=self.headers | {"Referer": referer_url},
        config=self.image_processing_config,
    )

    entries = list(image_url_and_markdown_links.items())
    # Only real images need a model call; results come back in request order.
    outcomes = iter(
        await asyncio.gather(
            *(
                self.chatterer.adescribe_image(
                    image_url=image.data_uri, instruction=self.image_description_instruction
                )
                for image, _ in entries
                if image is not None
            ),
            return_exceptions=True,
        )
    )

    described: dict[Optional[str], list[MarkdownLink]] = {}
    for image, markdown_links in entries:
        if image is None:
            # Links with no image are kept under the None key.
            described[None] = markdown_links
            continue
        outcome = next(outcomes)
        if isinstance(outcome, BaseException):
            # format_exception_only returns a list of lines; join them so the
            # message is printed as text rather than as a list repr.
            print("".join(format_exception_only(type(outcome), outcome)), end="")
            continue
        described[outcome] = markdown_links

    return replace_images(
        markdown_text=markdown_text,
        image_description_and_references=ImageDescriptionAndReferences(described),
        description_format=self.description_format,
    )
|
614
|
-
|
615
|
-
def __enter__(self) -> Self:
|
616
|
-
return self
|
617
|
-
|
618
|
-
async def __aenter__(self) -> Self:
|
619
|
-
return self
|
620
|
-
|
621
|
-
def __exit__(
    self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
) -> None:
    """
    Exit the synchronous context.

    Closes the browser context and stops Playwright. Playwright is stopped
    and both cached handles are reset even if closing the context raises;
    the exception from the close still propagates.
    """
    try:
        if self.sync_browser_context is not None:
            try:
                self.sync_browser_context.close()
            finally:
                self.sync_browser_context = None
    finally:
        # Always stop the driver so a failed close does not leave it running.
        if self.sync_playwright is not None:
            try:
                self.sync_playwright.stop()
            finally:
                self.sync_playwright = None
|
635
|
-
|
636
|
-
async def __aexit__(
    self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
) -> None:
    """
    Asynchronously exit the context.

    Closes the asynchronous browser context and stops Playwright. Playwright
    is stopped and both cached handles are reset even if closing the context
    raises; the exception from the close still propagates.
    """
    try:
        if self.async_browser_context is not None:
            try:
                await self.async_browser_context.close()
            finally:
                self.async_browser_context = None
    finally:
        # Always stop the driver so a failed close does not leave it running.
        if self.async_playwright is not None:
            try:
                await self.async_playwright.stop()
            finally:
                self.async_playwright = None
|
1
|
+
"""
|
2
|
+
PlaywrightBot
|
3
|
+
|
4
|
+
This module provides a single class that uses Playwright to:
|
5
|
+
- Fetch and render HTML pages (with JavaScript execution),
|
6
|
+
- Optionally scroll down or reload pages,
|
7
|
+
- Convert rendered HTML into Markdown,
|
8
|
+
- Extract specific elements using CSS selectors,
|
9
|
+
- Filter key information from a page via integration with a language model (Chatterer).
|
10
|
+
|
11
|
+
Both synchronous and asynchronous methods are available in this unified class.
|
12
|
+
Use the synchronous methods (without the "a" prefix) in a normal context manager,
|
13
|
+
or use the asynchronous methods (prefixed with "a") within an async context manager.
|
14
|
+
"""
|
15
|
+
|
16
|
+
import asyncio
|
17
|
+
from dataclasses import dataclass, field
|
18
|
+
from traceback import format_exception_only, print_exc
|
19
|
+
from types import TracebackType
|
20
|
+
from typing import (
|
21
|
+
Awaitable,
|
22
|
+
Optional,
|
23
|
+
Self,
|
24
|
+
Type,
|
25
|
+
TypeGuard,
|
26
|
+
Union,
|
27
|
+
)
|
28
|
+
|
29
|
+
import playwright.async_api
|
30
|
+
import playwright.sync_api
|
31
|
+
|
32
|
+
from ...language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
33
|
+
from ...utils.image import Base64Image, get_default_image_processing_config
|
34
|
+
from ..convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
|
35
|
+
from .utils import (
|
36
|
+
DEFAULT_UA,
|
37
|
+
ImageDescriptionAndReferences,
|
38
|
+
ImageProcessingConfig,
|
39
|
+
MarkdownLink,
|
40
|
+
PlaywrightLaunchOptions,
|
41
|
+
PlaywrightPersistencyOptions,
|
42
|
+
SelectedLineRanges,
|
43
|
+
WaitUntil,
|
44
|
+
aget_image_url_and_markdown_links,
|
45
|
+
get_default_playwright_launch_options,
|
46
|
+
get_image_url_and_markdown_links,
|
47
|
+
replace_images,
|
48
|
+
)
|
49
|
+
|
50
|
+
|
51
|
+
@dataclass
class PlayWrightBot:
    """
    A unified bot that leverages Playwright to render web pages, convert them to Markdown,
    extract elements, and filter key information using a language model.

    This class exposes both synchronous and asynchronous methods. Playwright and the
    browser context are created lazily on first use and released when the (sync or async)
    context manager exits.

    Synchronous usage:
        with PlayWrightBot() as bot:
            md = bot.url_to_md("https://example.com")
            headings = bot.select_and_extract("https://example.com", "h2")
            filtered_md = bot.url_to_md_with_llm("https://example.com")

    Asynchronous usage:
        async with PlayWrightBot() as bot:
            md = await bot.aurl_to_md("https://example.com")
            headings = await bot.aselect_and_extract("https://example.com", "h2")
            filtered_md = await bot.aurl_to_md_with_llm("https://example.com")

    Attributes:
        chatterer (Chatterer): Language model interface used for line filtering and image description.
        playwright_launch_options (PlaywrightLaunchOptions): Keyword arguments forwarded to
            ``chromium.launch`` / ``chromium.launch_persistent_context``.
        playwright_persistency_options (PlaywrightPersistencyOptions): Optional ``user_data_dir``
            (persistent context) or ``storage_state`` (fresh context with restored state).
        html_to_markdown_options (HtmlToMarkdownOptions): Options passed to ``html_to_markdown``.
        image_processing_config (ImageProcessingConfig): Config passed to the image link helpers.
        headers (dict[str, str]): Headers passed to the image link helpers; a ``Referer`` is added per call.
        markdown_filtering_instruction (str): Prompt prefix placed before the numbered Markdown.
        description_format (str): Template passed to ``replace_images``.
        image_description_instruction (str): Instruction passed to the image description calls.
        sync_playwright / sync_browser_context: Lazily created synchronous handles (``None`` until used).
        async_playwright / async_browser_context: Lazily created asynchronous handles (``None`` until used).
    """

    chatterer: Chatterer = field(default_factory=Chatterer.openai)
    playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
    playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
    html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
    image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
    headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
    markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.

You excel at the following tasks:
1. Identifying the main article content of a webpage.
2. Filtering out ads, navigation links, and other irrelevant information.
3. Selecting the line number ranges that correspond to the article content.
4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.

However, there are a few rules you must follow:
1. Do not remove the title of the article, if present.
2. Do not remove the author's name or the publication date, if present.
3. Include only images that are part of the article.

Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.

Markdown-formatted webpage content is provided below for your reference:
---
""".strip()
    description_format: str = (
        "<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
    )
    image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION

    sync_playwright: Optional[playwright.sync_api.Playwright] = None
    sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
    async_playwright: Optional[playwright.async_api.Playwright] = None
    async_browser_context: Optional[playwright.async_api.BrowserContext] = None

    def get_sync_playwright(self) -> playwright.sync_api.Playwright:
        """Return the synchronous Playwright driver, starting it on first use."""
        if self.sync_playwright is None:
            self.sync_playwright = playwright.sync_api.sync_playwright().start()
        return self.sync_playwright

    async def get_async_playwright(self) -> playwright.async_api.Playwright:
        """Return the asynchronous Playwright driver, starting it on first use."""
        if self.async_playwright is None:
            self.async_playwright = await playwright.async_api.async_playwright().start()
        return self.async_playwright

    def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
        """
        Return the cached synchronous browser context, creating it on first use.

        A persistent context is launched when ``user_data_dir`` is configured; otherwise a
        regular Chromium browser is launched and a new context is created, restoring
        ``storage_state`` when one is configured.

        Returns:
            BrowserContext: The synchronous browser context shared by all pages of this bot.
        """
        if self.sync_browser_context is not None:
            return self.sync_browser_context

        user_data_dir = self.playwright_persistency_options.get("user_data_dir")
        if user_data_dir:
            # Use persistent context if user_data_dir is provided
            self.sync_browser_context = self.get_sync_playwright().chromium.launch_persistent_context(
                user_data_dir=user_data_dir, **self.playwright_launch_options
            )
            return self.sync_browser_context

        # Otherwise, launch a new context
        browser = self.get_sync_playwright().chromium.launch(**self.playwright_launch_options)
        storage_state = self.playwright_persistency_options.get("storage_state")
        if storage_state:
            self.sync_browser_context = browser.new_context(storage_state=storage_state)
        else:
            self.sync_browser_context = browser.new_context()
        return self.sync_browser_context

    async def get_async_browser(self) -> playwright.async_api.BrowserContext:
        """
        Return the cached asynchronous browser context, creating it on first use.

        Mirrors ``get_sync_browser``: a persistent context when ``user_data_dir`` is configured,
        otherwise a new context on a freshly launched Chromium browser (restoring
        ``storage_state`` when configured).

        Returns:
            BrowserContext: The asynchronous browser context shared by all pages of this bot.
        """
        if self.async_browser_context is not None:
            return self.async_browser_context

        user_data_dir = self.playwright_persistency_options.get("user_data_dir")
        if user_data_dir:
            # Use persistent context if user_data_dir is provided
            self.async_browser_context = await (await self.get_async_playwright()).chromium.launch_persistent_context(
                user_data_dir=user_data_dir, **self.playwright_launch_options
            )
            return self.async_browser_context

        # Otherwise, launch a new context
        browser = await (await self.get_async_playwright()).chromium.launch(**self.playwright_launch_options)
        storage_state = self.playwright_persistency_options.get("storage_state")
        if storage_state:
            self.async_browser_context = await browser.new_context(storage_state=storage_state)
        else:
            self.async_browser_context = await browser.new_context()
        return self.async_browser_context

    def get_page(
        self,
        url: str,
        timeout: float = 10.0,
        wait_until: Optional[WaitUntil] = "domcontentloaded",
        referer: Optional[str] = None,
    ) -> playwright.sync_api.Page:
        """
        Create a new page and navigate to the given URL synchronously.

        Args:
            url (str): URL to navigate to.
            timeout (float): Maximum navigation time in seconds.
            wait_until (str): Load state to wait for (e.g., "domcontentloaded").
            referer (Optional[str]): Referer URL to set.

        Returns:
            Page: The Playwright page object. The caller is responsible for closing it.
        """
        page = self.get_sync_browser().new_page()
        try:
            # Playwright expects milliseconds.
            page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
        except Exception:
            # Nobody else holds a reference to this page, so close it before propagating.
            page.close()
            raise
        return page

    async def aget_page(
        self,
        url: str,
        timeout: float = 8,
        wait_until: Optional[WaitUntil] = "domcontentloaded",
        referer: Optional[str] = None,
    ) -> playwright.async_api.Page:
        """
        Create a new page and navigate to the given URL asynchronously.

        Args:
            url (str): URL to navigate to.
            timeout (float): Maximum navigation time in seconds.
            wait_until (str): Load state to wait for.
            referer (Optional[str]): Referer URL to set.

        Returns:
            AsyncPage: The Playwright asynchronous page object. The caller is responsible for closing it.
        """
        page = await (await self.get_async_browser()).new_page()
        try:
            # Playwright expects milliseconds.
            await page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
        except Exception:
            # Nobody else holds a reference to this page, so close it before propagating.
            await page.close()
            raise
        return page

    @staticmethod
    def _settle_page(
        page: playwright.sync_api.Page,
        wait: float,
        scrolldown: bool,
        sleep: int,
        reload: bool,
        timeout: Union[float, int],
    ) -> None:
        """Apply the optional wait / scroll / reload steps to an already navigated sync page."""
        if wait:
            page.wait_for_timeout(wait * 1000)
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # The extra pause only makes sense after a scroll.
            if sleep:
                page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))

    @staticmethod
    async def _asettle_page(
        page: playwright.async_api.Page,
        wait: float,
        scrolldown: bool,
        sleep: int,
        reload: bool,
        timeout: Union[float, int],
    ) -> None:
        """Apply the optional wait / scroll / reload steps to an already navigated async page."""
        if wait:
            await page.wait_for_timeout(wait * 1000)
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            # The extra pause only makes sense after a scroll.
            if sleep:
                await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))

    def url_to_md(
        self,
        url: str,
        wait: float = 0.2,
        scrolldown: bool = False,
        sleep: int = 0,
        reload: bool = True,
        timeout: Union[float, int] = 8,
        keep_page: bool = False,
        referer: Optional[str] = None,
    ) -> str:
        """
        Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.

        Args:
            url (str): URL of the page.
            wait (float): Time to wait after navigation (in seconds).
            scrolldown (bool): If True, scroll to the bottom of the page.
            sleep (int): Time to wait after scrolling (in seconds).
            reload (bool): If True, reload the page.
            timeout (float | int): Navigation timeout in seconds.
            keep_page (bool): If True, do not close the page after processing.
            referer (Optional[str]): Referer URL to set.

        Returns:
            str: The page content converted to Markdown.
        """
        page = self.get_page(url, timeout=timeout, referer=referer)
        try:
            self._settle_page(page, wait, scrolldown, sleep, reload, timeout)
            html = page.content()
            return html_to_markdown(html=html, options=self.html_to_markdown_options)
        finally:
            # Close even when a step above raised, so failed fetches do not pile up open pages.
            if not keep_page:
                page.close()

    async def aurl_to_md(
        self,
        url: str,
        wait: float = 0.2,
        scrolldown: bool = False,
        sleep: int = 0,
        reload: bool = True,
        timeout: Union[float, int] = 8,
        keep_page: bool = False,
        referer: Optional[str] = None,
    ) -> str:
        """
        Asynchronously navigate to a URL, wait, scroll or reload if specified,
        and convert the rendered HTML to Markdown.

        Args:
            url (str): URL of the page.
            wait (float): Time to wait after navigation (in seconds).
            scrolldown (bool): If True, scroll the page.
            sleep (int): Time to wait after scrolling (in seconds).
            reload (bool): If True, reload the page.
            timeout (float | int): Navigation timeout (in seconds).
            keep_page (bool): If True, do not close the page after processing.
            referer (Optional[str]): Referer URL to set.

        Returns:
            str: The page content converted to Markdown.
        """
        page = await self.aget_page(url, timeout=timeout, referer=referer)
        try:
            await self._asettle_page(page, wait, scrolldown, sleep, reload, timeout)
            html = await page.content()
            return html_to_markdown(html=html, options=self.html_to_markdown_options)
        finally:
            # Close even when a step above raised, so failed fetches do not pile up open pages.
            if not keep_page:
                await page.close()

    def select_and_extract(
        self,
        url: str,
        css_selector: str,
        wait: float = 0.2,
        scrolldown: bool = False,
        sleep: int = 0,
        reload: bool = True,
        timeout: Union[float, int] = 8,
        keep_page: bool = False,
        referer: Optional[str] = None,
    ) -> list[str]:
        """
        Navigate to a URL, render the page, and extract text from elements matching the given CSS selector.

        Args:
            url (str): URL of the page.
            css_selector (str): CSS selector to locate elements.
            wait (float): Time to wait after navigation (in seconds).
            scrolldown (bool): If True, scroll the page.
            sleep (int): Time to wait after scrolling (in seconds).
            reload (bool): If True, reload the page.
            timeout (float | int): Maximum navigation time (in seconds).
            keep_page (bool): If True, do not close the page after processing.
            referer (Optional[str]): Referer URL to set.

        Returns:
            List[str]: A list of text contents from the matching elements.
        """
        page = self.get_page(url, timeout=timeout, referer=referer)
        try:
            self._settle_page(page, wait, scrolldown, sleep, reload, timeout)
            elements = page.query_selector_all(css_selector)
            return [element.inner_text() for element in elements]
        finally:
            # Close even when a step above raised, so failed fetches do not pile up open pages.
            if not keep_page:
                page.close()

    async def aselect_and_extract(
        self,
        url: str,
        css_selector: str,
        wait: float = 0.2,
        scrolldown: bool = False,
        sleep: int = 0,
        reload: bool = True,
        timeout: Union[float, int] = 8,
        keep_page: bool = False,
        referer: Optional[str] = None,
    ) -> list[str]:
        """
        Asynchronously navigate to a URL, render the page, and extract text from elements matching the CSS selector.

        Args:
            url (str): URL of the page.
            css_selector (str): CSS selector to locate elements.
            wait (float): Time to wait after navigation (in seconds).
            scrolldown (bool): If True, scroll the page.
            sleep (int): Time to wait after scrolling (in seconds).
            reload (bool): If True, reload the page.
            timeout (float | int): Navigation timeout (in seconds).
            keep_page (bool): If True, do not close the page after processing.
            referer (Optional[str]): Referer URL to set.

        Returns:
            List[str]: A list of text contents from the matching elements.
        """
        page = await self.aget_page(url, timeout=timeout, referer=referer)
        try:
            await self._asettle_page(page, wait, scrolldown, sleep, reload, timeout)
            elements = await page.query_selector_all(css_selector)
            texts: list[str] = []
            for element in elements:
                texts.append(await element.inner_text())
            return texts
        finally:
            # Close even when a step above raised, so failed fetches do not pile up open pages.
            if not keep_page:
                await page.close()

    @staticmethod
    def _check_chunk_size(chunk_size: Optional[int]) -> None:
        """Reject non-positive chunk sizes up front instead of failing (or returning "") after the fetch."""
        if chunk_size is not None and chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    def _filtering_prompt(self, chunk_lines: list[str]) -> str:
        """Build the LLM prompt for one chunk: the instruction followed by 1-based numbered lines."""
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        return f"{self.markdown_filtering_instruction}\n{numbered_markdown}"

    @staticmethod
    def _selected_line_indices(line_ranges: list[str], offset: int, chunk_length: int) -> set[int]:
        """
        Convert chunk-relative, 1-based inclusive ranges into absolute 0-based line indices.

        Args:
            line_ranges (list[str]): Items of the form 'start-end' or 'single_line_number'.
            offset (int): Absolute index of the chunk's first line.
            chunk_length (int): Number of lines in the chunk (at least 1).

        Returns:
            set[int]: Absolute indices of the selected lines, clamped to the chunk.

        Raises:
            ValueError: If an item is not an integer or a pair of integers joined by '-'.
        """
        first, last = offset, offset + chunk_length - 1
        selected: set[int] = set()
        for range_str in line_ranges:
            start_str, dash, end_str = range_str.partition("-")
            start = int(start_str)
            end = int(end_str) if dash else start
            # Clamp both inclusive endpoints to the chunk the model actually saw, then add one
            # to the stop so the end line (including the very last line) is kept.
            low = min(max(offset + start - 1, first), last)
            high = min(max(offset + end - 1, first), last)
            selected.update(range(low, high + 1))
        return selected

    def url_to_md_with_llm(
        self,
        url: str,
        chunk_size: Optional[int] = None,
        wait: float = 0.2,
        scrolldown: bool = False,
        sleep: int = 0,
        reload: bool = True,
        timeout: Union[float, int] = 8,
        keep_page: bool = False,
        referer: Optional[str] = None,
        describe_images: bool = True,
        filter: bool = True,
    ) -> str:
        """
        Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.

        The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
        to select the important line ranges. It then reconstructs the filtered Markdown.

        Args:
            url (str): URL of the page.
            chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
            wait (float): Time to wait after navigation (in seconds).
            scrolldown (bool): If True, scroll down the page.
            sleep (int): Time to wait after scrolling (in seconds).
            reload (bool): If True, reload the page.
            timeout (float | int): Navigation timeout (in seconds).
            keep_page (bool): If True, do not close the page after processing.
            referer (Optional[str]): Referer URL to set.
            describe_images (bool): If True, describe images in the Markdown text.
            filter (bool): If True, filter the important lines using the language model.

        Returns:
            str: Filtered Markdown containing only the important lines.

        Raises:
            ValueError: If filtering is requested with a non-positive ``chunk_size``, or the
                model returns a malformed line range.
        """
        if filter:
            self._check_chunk_size(chunk_size)
        markdown_content = self.url_to_md(
            url,
            wait=wait,
            scrolldown=scrolldown,
            sleep=sleep,
            reload=reload,
            timeout=timeout,
            keep_page=keep_page,
            referer=referer,
        )
        if describe_images:
            markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
        if not filter:
            return markdown_content

        lines = markdown_content.split("\n")
        # str.split always yields at least one item, so the step is never zero.
        step = len(lines) if chunk_size is None else chunk_size
        important_lines: set[int] = set()

        # Process the markdown in chunks.
        for offset in range(0, len(lines), step):
            chunk_lines = lines[offset : offset + step]
            # Use the language model synchronously to get the line ranges.
            result: SelectedLineRanges = self.chatterer.generate_pydantic(
                response_model=SelectedLineRanges,
                messages=self._filtering_prompt(chunk_lines),
            )
            important_lines |= self._selected_line_indices(result.line_ranges, offset, len(chunk_lines))

        # Reconstruct the filtered markdown.
        return "\n".join(lines[line_no] for line_no in sorted(important_lines))

    async def aurl_to_md_with_llm(
        self,
        url: str,
        chunk_size: Optional[int] = None,
        wait: float = 0.2,
        scrolldown: bool = False,
        sleep: int = 0,
        reload: bool = True,
        timeout: Union[float, int] = 8,
        keep_page: bool = False,
        referer: Optional[str] = None,
        describe_images: bool = True,
        filter: bool = True,
    ) -> str:
        """
        Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
        to filter out unimportant lines.

        The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
        to select the important line ranges. It then reconstructs the filtered Markdown.

        Args:
            url (str): URL of the page.
            chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
            wait (float): Time to wait after navigation (in seconds).
            scrolldown (bool): If True, scroll the page.
            sleep (int): Time to wait after scrolling (in seconds).
            reload (bool): If True, reload the page.
            timeout (float | int): Navigation timeout (in seconds).
            keep_page (bool): If True, do not close the page after processing.
            referer (Optional[str]): Referer URL to set.
            describe_images (bool): If True, describe images in the Markdown text.
            filter (bool): If True, filter the important lines using the language model.

        Returns:
            str: Filtered Markdown containing only the important lines.

        Raises:
            ValueError: If filtering is requested with a non-positive ``chunk_size``, or the
                model returns a malformed line range.
        """
        if filter:
            self._check_chunk_size(chunk_size)
        markdown_content = await self.aurl_to_md(
            url,
            wait=wait,
            scrolldown=scrolldown,
            sleep=sleep,
            reload=reload,
            timeout=timeout,
            keep_page=keep_page,
            referer=referer,
        )
        if describe_images:
            markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
        if not filter:
            return markdown_content

        lines = markdown_content.split("\n")
        # str.split always yields at least one item, so the step is never zero.
        step = len(lines) if chunk_size is None else chunk_size
        important_lines: set[int] = set()

        for offset in range(0, len(lines), step):
            chunk_lines = lines[offset : offset + step]
            # Use the asynchronous language model method.
            result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
                response_model=SelectedLineRanges,
                messages=self._filtering_prompt(chunk_lines),
            )
            important_lines |= self._selected_line_indices(result.line_ranges, offset, len(chunk_lines))

        return "\n".join(lines[line_no] for line_no in sorted(important_lines))

    def describe_images(self, markdown_text: str, referer_url: str) -> str:
        """
        Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

        Args:
            markdown_text (str): Markdown whose image links should be described.
            referer_url (str): URL sent as the ``Referer`` header alongside ``self.headers``.

        Returns:
            str: The result of ``replace_images`` for the collected descriptions. Images whose
            description call raised are reported via ``print_exc`` and left out of the mapping.
        """
        image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = (
            get_image_url_and_markdown_links(
                markdown_text=markdown_text,
                headers=self.headers | {"Referer": referer_url},
                config=self.image_processing_config,
            )
        )

        image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
        for image_url, markdown_links in image_url_and_markdown_links.items():
            if image_url is not None:
                try:
                    image_summary: str = self.chatterer.describe_image(
                        image_url=image_url.data_uri,
                        instruction=self.image_description_instruction,
                    )
                except Exception:
                    # Best effort: one failing image must not abort the whole document.
                    print_exc()
                    continue
                image_description_and_references[image_summary] = markdown_links
            else:
                image_description_and_references[None] = markdown_links

        return replace_images(
            markdown_text=markdown_text,
            image_description_and_references=image_description_and_references,
            description_format=self.description_format,
        )

    async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
        """
        Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

        All description requests are issued concurrently with ``asyncio.gather``.

        Args:
            markdown_text (str): Markdown whose image links should be described.
            referer_url (str): URL sent as the ``Referer`` header alongside ``self.headers``.

        Returns:
            str: The result of ``replace_images`` for the collected descriptions. Images whose
            description call raised are printed and left out of the mapping.
        """
        image_url_and_markdown_links: dict[
            Optional[Base64Image], list[MarkdownLink]
        ] = await aget_image_url_and_markdown_links(
            markdown_text=markdown_text,
            headers=self.headers | {"Referer": referer_url},
            config=self.image_processing_config,
        )

        async def dummy() -> None:
            # Placeholder awaitable for the ``None`` key so results stay aligned with the dict order.
            pass

        def _handle_exception(e: Optional[str | BaseException]) -> TypeGuard[Optional[str]]:
            if isinstance(e, BaseException):
                # format_exception_only returns a list of newline-terminated strings.
                print("".join(format_exception_only(type(e), e)), end="")
                return False
            return True

        coros: list[Awaitable[Optional[str]]] = [
            self.chatterer.adescribe_image(image_url=image_url.data_uri, instruction=self.image_description_instruction)
            if image_url is not None
            else dummy()
            for image_url in image_url_and_markdown_links.keys()
        ]
        # return_exceptions=True keeps one failing image from cancelling the others.
        summaries = await asyncio.gather(*coros, return_exceptions=True)

        return replace_images(
            markdown_text=markdown_text,
            image_description_and_references=ImageDescriptionAndReferences({
                image_summary: markdown_links
                for markdown_links, image_summary in zip(image_url_and_markdown_links.values(), summaries)
                if _handle_exception(image_summary)
            }),
            description_format=self.description_format,
        )

    def __enter__(self) -> Self:
        """Enter the synchronous context; browser resources are created lazily on first use."""
        return self

    async def __aenter__(self) -> Self:
        """Enter the asynchronous context; browser resources are created lazily on first use."""
        return self

    def __exit__(
        self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
    ) -> None:
        """
        Exit the synchronous context.

        Closes the browser and stops Playwright.
        """
        try:
            if self.sync_browser_context is not None:
                self.sync_browser_context.close()
        finally:
            # Always drop the handle and still stop Playwright if closing the context failed.
            self.sync_browser_context = None
            try:
                if self.sync_playwright is not None:
                    self.sync_playwright.stop()
            finally:
                self.sync_playwright = None

    async def __aexit__(
        self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
    ) -> None:
        """
        Asynchronously exit the context.

        Closes the asynchronous browser and stops Playwright.
        """
        try:
            if self.async_browser_context is not None:
                await self.async_browser_context.close()
        finally:
            # Always drop the handle and still stop Playwright if closing the context failed.
            self.async_browser_context = None
            try:
                if self.async_playwright is not None:
                    await self.async_playwright.stop()
            finally:
                self.async_playwright = None
|