chatterer 0.1.14__py3-none-any.whl → 0.1.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +93 -97
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/interactive.py +692 -353
- chatterer/language_model.py +533 -454
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +302 -302
- chatterer/tools/convert_to_text.py +447 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -18
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.14.dist-info → chatterer-0.1.16.dist-info}/METADATA +392 -387
- chatterer-0.1.16.dist-info/RECORD +33 -0
- {chatterer-0.1.14.dist-info → chatterer-0.1.16.dist-info}/WHEEL +1 -1
- chatterer/utils/cli.py +0 -476
- chatterer-0.1.14.dist-info/RECORD +0 -34
- {chatterer-0.1.14.dist-info → chatterer-0.1.16.dist-info}/top_level.txt +0 -0
@@ -1,739 +1,739 @@
|
|
1
|
-
"""
|
2
|
-
PlaywrightBot
|
3
|
-
|
4
|
-
This module provides a single class that uses Playwright to:
|
5
|
-
- Fetch and render HTML pages (with JavaScript execution),
|
6
|
-
- Optionally scroll down or reload pages,
|
7
|
-
- Convert rendered HTML into Markdown,
|
8
|
-
- Extract specific elements using CSS selectors,
|
9
|
-
- Filter key information from a page via integration with a language model (Chatterer).
|
10
|
-
|
11
|
-
Both synchronous and asynchronous methods are available in this unified class.
|
12
|
-
Use the synchronous methods (without the "a" prefix) in a normal context manager,
|
13
|
-
or use the asynchronous methods (prefixed with "a") within an async context manager.
|
14
|
-
"""
|
15
|
-
|
16
|
-
from __future__ import annotations
|
17
|
-
|
18
|
-
from dataclasses import dataclass, field
|
19
|
-
from pathlib import Path
|
20
|
-
from types import TracebackType
|
21
|
-
from typing import (
|
22
|
-
TYPE_CHECKING,
|
23
|
-
Literal,
|
24
|
-
NotRequired,
|
25
|
-
Optional,
|
26
|
-
Self,
|
27
|
-
Sequence,
|
28
|
-
Type,
|
29
|
-
TypeAlias,
|
30
|
-
TypedDict,
|
31
|
-
Union,
|
32
|
-
)
|
33
|
-
|
34
|
-
from pydantic import BaseModel, Field
|
35
|
-
|
36
|
-
from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
37
|
-
from ..utils.base64_image import ImageProcessingConfig, get_default_image_processing_config, is_remote_url
|
38
|
-
from .caption_markdown_images import acaption_markdown_images, caption_markdown_images
|
39
|
-
from .convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
|
40
|
-
|
41
|
-
if TYPE_CHECKING:
|
42
|
-
import playwright.async_api
|
43
|
-
import playwright.sync_api
|
44
|
-
|
45
|
-
# Load states accepted by Playwright's goto()/reload() `wait_until` parameter.
WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
# Default desktop-Chrome User-Agent sent with every request; some sites serve
# degraded or blocked pages to Playwright's built-in headless UA.
DEFAULT_UA: str = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
)
|
-
class SelectedLineRanges(BaseModel):
    """Structured LLM response: which numbered lines of a document matter.

    Each entry is either an inclusive span ("start-end") or a single line
    number ("n"); both forms are 1-based.
    """

    line_ranges: list[str] = Field(description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']")
|
55
|
-
class PlaywrightLaunchOptions(TypedDict):
    """Keyword arguments forwarded verbatim to Playwright's ``launch()`` /
    ``launch_persistent_context()``. All keys are optional; see the Playwright
    BrowserType.launch documentation for each field's semantics."""

    executable_path: NotRequired[str | Path]  # custom browser binary
    channel: NotRequired[str]  # e.g. "chrome", "msedge"
    args: NotRequired[Sequence[str]]  # extra CLI flags for the browser
    ignore_default_args: NotRequired[bool | Sequence[str]]
    handle_sigint: NotRequired[bool]
    handle_sigterm: NotRequired[bool]
    handle_sighup: NotRequired[bool]
    timeout: NotRequired[float]  # launch timeout, milliseconds
    env: NotRequired[dict[str, str | float | bool]]
    headless: NotRequired[bool]
    devtools: NotRequired[bool]
    proxy: NotRequired[playwright.sync_api.ProxySettings]
    downloads_path: NotRequired[str | Path]
    slow_mo: NotRequired[float]  # delay between actions, for debugging
    traces_dir: NotRequired[str | Path]
    chromium_sandbox: NotRequired[bool]
    firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
|
75
|
-
class PlaywrightPersistencyOptions(TypedDict):
    """Session-persistence knobs. ``user_data_dir`` selects a persistent
    browser profile; ``storage_state`` seeds cookies/localStorage into a
    fresh (non-persistent) context. The browser-context factories treat
    ``user_data_dir`` as taking precedence."""

    user_data_dir: NotRequired[str | Path]
    storage_state: NotRequired[playwright.sync_api.StorageState]
|
-
|
80
|
-
# Convenience union of launch + persistence options in a single TypedDict.
class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions): ...
|
-
|
83
|
-
def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
    """Baseline launch options: headless browser, everything else defaulted."""
    options: PlaywrightLaunchOptions = {"headless": True}
    return options
86
|
-
|
87
|
-
@dataclass
class PlayWrightBot:
    """
    A unified bot that leverages Playwright to render web pages, convert them to Markdown,
    extract elements, and filter key information using a language model.

    This class exposes both synchronous and asynchronous methods.

    Synchronous usage:
        with PlayWrightBot() as bot:
            md = bot.url_to_md("https://example.com")
            headings = bot.select_and_extract("https://example.com", "h2")
            filtered_md = bot.url_to_md_with_llm("https://example.com")

    Asynchronous usage:
        async with PlayWrightBot() as bot:
            md = await bot.aurl_to_md("https://example.com")
            headings = await bot.aselect_and_extract("https://example.com", "h2")
            filtered_md = await bot.aurl_to_md_with_llm("https://example.com")

    Attributes:
        engine: Browser engine to launch ("firefox", "chromium", or "webkit").
        chatterer: Language-model interface used for filtering and image captioning;
            LLM-backed methods raise ValueError when this is None.
    """

    # Which Playwright browser engine to drive.
    engine: Literal["firefox", "chromium", "webkit"] = "firefox"
    # LLM used by url_to_md_with_llm / describe_images; None disables those paths.
    chatterer: Optional[Chatterer] = field(default_factory=Chatterer.openai)
    # Options forwarded to launch()/launch_persistent_context().
    playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
    # Optional user_data_dir / storage_state for session persistence.
    playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
    html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
    image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
    # Extra HTTP headers used when fetching images (a Referer is merged in per call).
    headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
    # System prompt for the LLM line-filtering step; the numbered markdown is appended after it.
    markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.

You excel at the following tasks:
1. Identifying the main article content of a webpage.
2. Filtering out ads, navigation links, and other irrelevant information.
3. Selecting the line number ranges that correspond to the article content.
4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.

However, there are a few rules you must follow:
1. Do not remove the title of the article, if present.
2. Do not remove the author's name or the publication date, if present.
3. Include only images that are part of the article.

Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.

Markdown-formatted webpage content is provided below for your reference:
---
""".strip()
    # Template used when replacing an image with its generated caption.
    description_format: str = (
        "<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
    )
    image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION

    # Lazily-created Playwright driver/context handles (see the getter methods).
    sync_playwright: Optional[playwright.sync_api.Playwright] = None
    sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
    async_playwright: Optional[playwright.async_api.Playwright] = None
    async_browser_context: Optional[playwright.async_api.BrowserContext] = None
-
def get_sync_playwright(self) -> playwright.sync_api.Playwright:
    """Return the cached synchronous Playwright driver, starting one on first use."""
    if self.sync_playwright is not None:
        return self.sync_playwright
    # Imported lazily so the module loads even when playwright is absent.
    from playwright.sync_api import sync_playwright

    self.sync_playwright = sync_playwright().start()
    return self.sync_playwright
|
154
|
-
async def get_async_playwright(self) -> playwright.async_api.Playwright:
    """Return the cached asynchronous Playwright driver, starting one on first use."""
    if self.async_playwright is not None:
        return self.async_playwright
    # Imported lazily so the module loads even when playwright is absent.
    from playwright.async_api import async_playwright

    self.async_playwright = await async_playwright().start()
    return self.async_playwright
|
161
|
-
def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
    """Return the shared synchronous BrowserContext, launching it lazily.

    Persistence precedence: a configured ``user_data_dir`` wins (persistent
    context); otherwise an ephemeral browser is launched and ``storage_state``,
    if present, seeds the new context.
    """
    if self.sync_browser_context is not None:
        return self.sync_browser_context

    def get_browser() -> playwright.sync_api.BrowserType:
        # Map the configured engine name to the corresponding BrowserType.
        playwright = self.get_sync_playwright()
        if self.engine == "firefox":
            return playwright.firefox
        elif self.engine == "chromium":
            return playwright.chromium
        elif self.engine == "webkit":
            return playwright.webkit
        else:
            raise ValueError(f"Unsupported engine: {self.engine}")

    user_data_dir = self.playwright_persistency_options.get("user_data_dir")
    if user_data_dir:
        # Use persistent context if user_data_dir is provided
        self.sync_browser_context = get_browser().launch_persistent_context(
            user_data_dir=user_data_dir, **self.playwright_launch_options
        )
        return self.sync_browser_context

    # Otherwise, launch a new context
    browser = get_browser().launch(**self.playwright_launch_options)
    storage_state = self.playwright_persistency_options.get("storage_state")
    if storage_state:
        self.sync_browser_context = browser.new_context(storage_state=storage_state)
    else:
        self.sync_browser_context = browser.new_context()
    return self.sync_browser_context
-
|
193
|
-
async def get_async_browser(self) -> playwright.async_api.BrowserContext:
    """Return the shared asynchronous BrowserContext, launching it lazily.

    Mirrors ``get_sync_browser``: ``user_data_dir`` selects a persistent
    context; otherwise an ephemeral browser is launched and ``storage_state``,
    if present, seeds the new context.
    """
    if self.async_browser_context is not None:
        return self.async_browser_context

    async def get_browser() -> playwright.async_api.BrowserType:
        # Map the configured engine name to the corresponding BrowserType.
        playwright = await self.get_async_playwright()
        if self.engine == "firefox":
            return playwright.firefox
        elif self.engine == "chromium":
            return playwright.chromium
        elif self.engine == "webkit":
            return playwright.webkit
        else:
            raise ValueError(f"Unsupported engine: {self.engine}")

    user_data_dir = self.playwright_persistency_options.get("user_data_dir")
    if user_data_dir:
        # Use persistent context if user_data_dir is provided
        self.async_browser_context = await (await get_browser()).launch_persistent_context(
            user_data_dir=user_data_dir, **self.playwright_launch_options
        )
        return self.async_browser_context

    # Otherwise, launch a new context
    browser = await (await get_browser()).launch(**self.playwright_launch_options)
    storage_state = self.playwright_persistency_options.get("storage_state")
    if storage_state:
        self.async_browser_context = await browser.new_context(storage_state=storage_state)
    else:
        self.async_browser_context = await browser.new_context()
    return self.async_browser_context
|
225
|
-
def get_page(
    self,
    url: str,
    timeout: float = 10.0,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.sync_api.Page:
    """Open a fresh tab in the shared synchronous browser context and navigate it.

    Args:
        url: URL to navigate to.
        timeout: Maximum navigation time in seconds (converted to ms for Playwright).
        wait_until: Load state to wait for before returning.
        referer: Optional Referer header for the navigation.

    Returns:
        The navigated Playwright page object.
    """
    browser_context = self.get_sync_browser()
    tab = browser_context.new_page()
    timeout_ms = int(timeout * 1000)
    tab.goto(url, timeout=timeout_ms, wait_until=wait_until, referer=referer)
    return tab
248
|
-
async def aget_page(
    self,
    url: str,
    timeout: float = 8,
    wait_until: Optional[WaitUntil] = "domcontentloaded",
    referer: Optional[str] = None,
) -> playwright.async_api.Page:
    """Open a fresh tab in the shared asynchronous browser context and navigate it.

    Args:
        url: URL to navigate to.
        timeout: Maximum navigation time in seconds (converted to ms for Playwright).
        wait_until: Load state to wait for before returning.
        referer: Optional Referer header for the navigation.

    Returns:
        The navigated asynchronous Playwright page object.
    """
    browser_context = await self.get_async_browser()
    tab = await browser_context.new_page()
    timeout_ms = int(timeout * 1000)
    await tab.goto(url, timeout=timeout_ms, wait_until=wait_until, referer=referer)
    return tab
-
|
271
|
-
def url_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.

    A local ``*.html`` file path is read directly from disk, skipping the
    browser entirely (no page is opened in that case).

    Args:
        url (str): URL of the page, or a path to a local .html file.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll to the bottom of the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout in seconds.
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page: Optional[playwright.sync_api.Page] = None
    # Local .html files bypass Playwright entirely.
    if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
        with open(url, "r", encoding="utf-8") as f:
            html = f.read()
    else:
        page = self.get_page(url, timeout=timeout, referer=referer)
        if wait:
            page.wait_for_timeout(wait * 1000)  # Playwright timeouts are in ms
        if scrolldown:
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        if sleep:
            page.wait_for_timeout(sleep * 1000)
        if reload:
            page.reload(timeout=int(timeout * 1000))
        html = page.content()

    md = html_to_markdown(html=html, options=self.html_to_markdown_options)
    # Only close pages we actually opened (the local-file path never opens one).
    if not keep_page and page is not None:
        page.close()
    return md
|
319
|
-
async def aurl_to_md(
    self,
    url: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> str:
    """
    Asynchronously navigate to a URL, wait, scroll or reload if specified,
    and convert the rendered HTML to Markdown.

    A local ``*.html`` file path is read directly from disk, skipping the
    browser entirely (no page is opened in that case).

    Args:
        url (str): URL of the page, or a path to a local .html file.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.

    Returns:
        str: The page content converted to Markdown.
    """
    page: Optional[playwright.async_api.Page] = None
    # Local .html files bypass Playwright entirely.
    if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
        with open(url, "r", encoding="utf-8") as f:
            html = f.read()
    else:
        page = await self.aget_page(url, timeout=timeout, referer=referer)
        if wait:
            await page.wait_for_timeout(wait * 1000)  # Playwright timeouts are in ms
        if scrolldown:
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        if sleep:
            await page.wait_for_timeout(sleep * 1000)
        if reload:
            await page.reload(timeout=int(timeout * 1000))
        html = await page.content()
    md = html_to_markdown(html=html, options=self.html_to_markdown_options)
    # Only close pages we actually opened (the local-file path never opens one).
    if not keep_page and page is not None:
        await page.close()
    return md
|
367
|
-
def select_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """Render *url* and return the inner text of every element matching *css_selector*.

    Args:
        url: Page to render.
        css_selector: CSS selector locating the target elements.
        wait: Seconds to pause after navigation.
        scrolldown: Scroll to the bottom of the page before extracting.
        sleep: Seconds to pause after scrolling.
        reload: Reload the page before extracting.
        timeout: Navigation timeout in seconds.
        keep_page: Leave the page open after extraction.
        referer: Optional Referer header for the navigation.

    Returns:
        Text content of each matching element, in document order.
    """
    tab = self.get_page(url, timeout=timeout, referer=referer)
    if wait:
        tab.wait_for_timeout(wait * 1000)
    if scrolldown:
        tab.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    if sleep:
        tab.wait_for_timeout(sleep * 1000)
    if reload:
        tab.reload(timeout=int(timeout * 1000))
    extracted: list[str] = []
    for node in tab.query_selector_all(css_selector):
        extracted.append(node.inner_text())
    if not keep_page:
        tab.close()
    return extracted
411
|
-
async def aselect_and_extract(
    self,
    url: str,
    css_selector: str,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
) -> list[str]:
    """Asynchronously render *url* and return the inner text of every element matching *css_selector*.

    Args:
        url: Page to render.
        css_selector: CSS selector locating the target elements.
        wait: Seconds to pause after navigation.
        scrolldown: Scroll to the bottom of the page before extracting.
        sleep: Seconds to pause after scrolling.
        reload: Reload the page before extracting.
        timeout: Navigation timeout in seconds.
        keep_page: Leave the page open after extraction.
        referer: Optional Referer header for the navigation.

    Returns:
        Text content of each matching element, in document order.
    """
    tab = await self.aget_page(url, timeout=timeout, referer=referer)
    if wait:
        await tab.wait_for_timeout(wait * 1000)
    if scrolldown:
        await tab.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    if sleep:
        await tab.wait_for_timeout(sleep * 1000)
    if reload:
        await tab.reload(timeout=int(timeout * 1000))
    nodes = await tab.query_selector_all(css_selector)
    extracted: list[str] = [await node.inner_text() for node in nodes]
    if not keep_page:
        await tab.close()
    return extracted
-
|
458
|
-
def url_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
    describe_images: bool = True,
    filter: bool = True,
) -> str:
    """
    Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
    to select the important line ranges. It then reconstructs the filtered Markdown.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll down the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.
        describe_images (bool): If True, describe images in the Markdown text.
        filter (bool): If True, filter the important lines using the language model.

    Returns:
        str: Filtered Markdown containing only the important lines.

    Raises:
        ValueError: If no Chatterer instance is configured.
    """
    if self.chatterer is None:
        raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
    markdown_content = self.url_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    if describe_images:
        markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
    if not filter:
        return markdown_content
    lines = markdown_content.split("\n")
    line_length = len(lines)
    important_lines: set[int] = set()

    def _into_safe_range(value: int) -> int:
        """Clamp a 0-based line index into [0, line_length - 1]."""
        return min(max(value, 0), line_length - 1)

    if chunk_size is None:
        chunk_size = line_length

    # Process the markdown in chunks.
    for i in range(0, len(lines), chunk_size):
        chunk_lines = lines[i : i + chunk_size]
        # Prepend 1-based line numbers to each line so the LLM can reference them.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        # Use the language model synchronously to get the line ranges.
        result: SelectedLineRanges = self.chatterer.generate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # BUG FIX: the upper bound of range() is exclusive and may
                # legitimately equal line_length. The old code clamped it to
                # line_length - 1, silently dropping the document's final line
                # from any range that ended there.
                important_lines.update(range(_into_safe_range(start + i - 1), min(max(end + i, 0), line_length)))
            else:
                important_lines.add(_into_safe_range(int(range_str) + i - 1))
    # Reconstruct the filtered markdown in original document order.
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
-
|
540
|
-
async def aurl_to_md_with_llm(
    self,
    url: str,
    chunk_size: Optional[int] = None,
    wait: float = 0.2,
    scrolldown: bool = False,
    sleep: int = 0,
    reload: bool = True,
    timeout: Union[float, int] = 8,
    keep_page: bool = False,
    referer: Optional[str] = None,
    describe_images: bool = True,
    filter: bool = True,
) -> str:
    """
    Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
    to filter out unimportant lines.

    The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
    to select the important line ranges. It then reconstructs the filtered Markdown.

    Args:
        url (str): URL of the page.
        chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
        wait (float): Time to wait after navigation (in seconds).
        scrolldown (bool): If True, scroll the page.
        sleep (int): Time to wait after scrolling (in seconds).
        reload (bool): If True, reload the page.
        timeout (float | int): Navigation timeout (in seconds).
        keep_page (bool): If True, do not close the page after processing.
        referer (Optional[str]): Referer URL to set.
        describe_images (bool): If True, describe images in the Markdown text.
        filter (bool): If True, filter the important lines using the language model.

    Returns:
        str: Filtered Markdown containing only the important lines.

    Raises:
        ValueError: If no Chatterer instance is configured.
    """
    if self.chatterer is None:
        raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
    markdown_content = await self.aurl_to_md(
        url,
        wait=wait,
        scrolldown=scrolldown,
        sleep=sleep,
        reload=reload,
        timeout=timeout,
        keep_page=keep_page,
        referer=referer,
    )
    if describe_images:
        markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
    if not filter:
        return markdown_content
    lines = markdown_content.split("\n")
    line_length = len(lines)
    important_lines: set[int] = set()

    def _into_safe_range(value: int) -> int:
        """Clamp a 0-based line index into [0, line_length - 1]."""
        return min(max(value, 0), line_length - 1)

    if chunk_size is None:
        chunk_size = line_length

    for i in range(0, len(lines), chunk_size):
        chunk_lines = lines[i : i + chunk_size]
        # Prepend 1-based line numbers to each line so the LLM can reference them.
        numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
        # Use the asynchronous language model method.
        result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
            response_model=SelectedLineRanges,
            messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
        )
        for range_str in result.line_ranges:
            if "-" in range_str:
                start, end = map(int, range_str.split("-"))
                # BUG FIX: the upper bound of range() is exclusive and may
                # legitimately equal line_length. The old code clamped it to
                # line_length - 1, silently dropping the document's final line
                # from any range that ended there.
                important_lines.update(range(_into_safe_range(start + i - 1), min(max(end + i, 0), line_length)))
            else:
                important_lines.add(_into_safe_range(int(range_str) + i - 1))
    return "\n".join(lines[line_no] for line_no in sorted(important_lines))
-
|
620
|
-
def describe_images(self, markdown_text: str, referer_url: str) -> str:
    """Caption every image referenced in *markdown_text* via the language model.

    Images are fetched through Playwright (see ``_playwright_fetch_image_bytes``)
    so CDN hotlink protections that key on the Referer are bypassed.

    Raises:
        ValueError: If no Chatterer instance is configured.
    """
    if self.chatterer is None:
        raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
    request_headers = self.headers | {"Referer": referer_url}
    return caption_markdown_images(
        markdown_text=markdown_text,
        chatterer=self.chatterer,
        headers=request_headers,
        description_format=self.description_format,
        image_description_instruction=self.image_description_instruction,
        image_processing_config=self.image_processing_config,
        img_bytes_fetcher=self._playwright_fetch_image_bytes,
    )
-
|
637
|
-
# Async counterpart of describe_images, using the async Playwright fetcher.
async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
    """Asynchronously caption every image referenced in *markdown_text* via the language model.

    Images are fetched through Playwright (see ``_aplaywright_fetch_image_bytes``)
    so CDN hotlink protections that key on the Referer are bypassed.

    Raises:
        ValueError: If no Chatterer instance is configured.
    """
    if self.chatterer is None:
        raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
    request_headers = self.headers | {"Referer": referer_url}
    return await acaption_markdown_images(
        markdown_text=markdown_text,
        chatterer=self.chatterer,
        headers=request_headers,
        description_format=self.description_format,
        image_description_instruction=self.image_description_instruction,
        image_processing_config=self.image_processing_config,
        img_bytes_fetcher=self._aplaywright_fetch_image_bytes,
    )
|
655
|
-
def _playwright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
    """Synchronously fetch raw image bytes through the shared browser context.

    Returns empty bytes on any failure (non-OK response, no response, or an
    exception); the page is always closed afterwards.
    """
    tab: Optional[playwright.sync_api.Page] = None
    try:
        # Reuse the existing synchronous browser context for the fetch.
        tab = self.get_sync_browser().new_page()
        # Extra headers (e.g. Referer) apply to every request this page makes.
        tab.set_extra_http_headers(headers)
        response = tab.goto(image_url, wait_until="load", timeout=15000)
        if response is None or not response.ok:
            return b""
        return response.body()
    except Exception as e:
        print(f"Playwright exception fetching image: {image_url}, Error: {e}")
        return b""
    finally:
        if tab:
            tab.close()
|
677
|
-
async def _aplaywright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
|
678
|
-
"""Playwright를 사용하여 비동기적으로 이미지 바이트를 가져옵니다."""
|
679
|
-
page: Optional[playwright.async_api.Page] = None
|
680
|
-
try:
|
681
|
-
# Get the existing asynchronous browser context.
|
682
|
-
page = await (await self.get_async_browser()).new_page()
|
683
|
-
|
684
|
-
# Set the provided headers as extra HTTP headers for the page.
|
685
|
-
# This will apply to all subsequent requests made by the page.
|
686
|
-
await page.set_extra_http_headers(headers)
|
687
|
-
response = await page.goto(image_url, wait_until="load", timeout=15000)
|
688
|
-
if response and response.ok:
|
689
|
-
return await response.body()
|
690
|
-
else:
|
691
|
-
# 실패 시 로그를 남기거나 None을 반환할 수 있습니다.
|
692
|
-
print(
|
693
|
-
f"Playwright failed to fetch image: {image_url}, Status: {response.status if response else 'No Response'}"
|
694
|
-
)
|
695
|
-
return b""
|
696
|
-
except Exception as e:
|
697
|
-
# 예외 발생 시 로그를 남깁니다.
|
698
|
-
print(f"Playwright exception fetching image: {image_url}, Error: {e}")
|
699
|
-
return b""
|
700
|
-
finally:
|
701
|
-
# 페이지를 항상 닫아 리소스를 정리합니다.
|
702
|
-
if page:
|
703
|
-
await page.close()
|
704
|
-
|
705
|
-
def __enter__(self) -> Self:
|
706
|
-
return self
|
707
|
-
|
708
|
-
async def __aenter__(self) -> Self:
|
709
|
-
return self
|
710
|
-
|
711
|
-
def __exit__(
|
712
|
-
self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
|
713
|
-
) -> None:
|
714
|
-
"""
|
715
|
-
Exit the synchronous context.
|
716
|
-
|
717
|
-
Closes the browser and stops Playwright.
|
718
|
-
"""
|
719
|
-
if self.sync_browser_context is not None:
|
720
|
-
self.sync_browser_context.close()
|
721
|
-
self.sync_browser_context = None
|
722
|
-
if self.sync_playwright:
|
723
|
-
self.sync_playwright.stop()
|
724
|
-
self.sync_playwright = None
|
725
|
-
|
726
|
-
async def __aexit__(
|
727
|
-
self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
|
728
|
-
) -> None:
|
729
|
-
"""
|
730
|
-
Asynchronously exit the context.
|
731
|
-
|
732
|
-
Closes the asynchronous browser and stops Playwright.
|
733
|
-
"""
|
734
|
-
if self.async_browser_context is not None:
|
735
|
-
await self.async_browser_context.close()
|
736
|
-
self.async_browser_context = None
|
737
|
-
if self.async_playwright:
|
738
|
-
await self.async_playwright.stop()
|
739
|
-
self.async_playwright = None
|
1
|
+
"""
|
2
|
+
PlaywrightBot
|
3
|
+
|
4
|
+
This module provides a single class that uses Playwright to:
|
5
|
+
- Fetch and render HTML pages (with JavaScript execution),
|
6
|
+
- Optionally scroll down or reload pages,
|
7
|
+
- Convert rendered HTML into Markdown,
|
8
|
+
- Extract specific elements using CSS selectors,
|
9
|
+
- Filter key information from a page via integration with a language model (Chatterer).
|
10
|
+
|
11
|
+
Both synchronous and asynchronous methods are available in this unified class.
|
12
|
+
Use the synchronous methods (without the "a" prefix) in a normal context manager,
|
13
|
+
or use the asynchronous methods (prefixed with "a") within an async context manager.
|
14
|
+
"""
|
15
|
+
|
16
|
+
from __future__ import annotations
|
17
|
+
|
18
|
+
from dataclasses import dataclass, field
|
19
|
+
from pathlib import Path
|
20
|
+
from types import TracebackType
|
21
|
+
from typing import (
|
22
|
+
TYPE_CHECKING,
|
23
|
+
Literal,
|
24
|
+
NotRequired,
|
25
|
+
Optional,
|
26
|
+
Self,
|
27
|
+
Sequence,
|
28
|
+
Type,
|
29
|
+
TypeAlias,
|
30
|
+
TypedDict,
|
31
|
+
Union,
|
32
|
+
)
|
33
|
+
|
34
|
+
from pydantic import BaseModel, Field
|
35
|
+
|
36
|
+
from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
37
|
+
from ..utils.base64_image import ImageProcessingConfig, get_default_image_processing_config, is_remote_url
|
38
|
+
from .caption_markdown_images import acaption_markdown_images, caption_markdown_images
|
39
|
+
from .convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
|
40
|
+
|
41
|
+
if TYPE_CHECKING:
|
42
|
+
import playwright.async_api
|
43
|
+
import playwright.sync_api
|
44
|
+
|
45
|
+
WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
46
|
+
DEFAULT_UA: str = (
|
47
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
|
48
|
+
)
|
49
|
+
|
50
|
+
|
51
|
+
class SelectedLineRanges(BaseModel):
    """Structured LLM response listing the line ranges worth keeping."""

    line_ranges: list[str] = Field(description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']")
|
53
|
+
|
54
|
+
|
55
|
+
class PlaywrightLaunchOptions(TypedDict):
    """Keyword arguments forwarded to Playwright's ``BrowserType.launch`` /
    ``launch_persistent_context``. All keys are optional; see the Playwright
    documentation for their exact semantics."""

    executable_path: NotRequired[str | Path]
    channel: NotRequired[str]
    args: NotRequired[Sequence[str]]
    ignore_default_args: NotRequired[bool | Sequence[str]]
    handle_sigint: NotRequired[bool]
    handle_sigterm: NotRequired[bool]
    handle_sighup: NotRequired[bool]
    timeout: NotRequired[float]
    env: NotRequired[dict[str, str | float | bool]]
    headless: NotRequired[bool]
    devtools: NotRequired[bool]
    proxy: NotRequired[playwright.sync_api.ProxySettings]
    downloads_path: NotRequired[str | Path]
    slow_mo: NotRequired[float]
    traces_dir: NotRequired[str | Path]
    chromium_sandbox: NotRequired[bool]
    firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
|
73
|
+
|
74
|
+
|
75
|
+
class PlaywrightPersistencyOptions(TypedDict):
    """Options controlling browser-state persistence: a user-data directory for
    a persistent profile, or a storage-state snapshot (cookies/localStorage)."""

    user_data_dir: NotRequired[str | Path]
    storage_state: NotRequired[playwright.sync_api.StorageState]
|
78
|
+
|
79
|
+
|
80
|
+
class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions):
    """Union of launch options and persistency options."""
|
81
|
+
|
82
|
+
|
83
|
+
def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
    """Return the default launch options: headless browsing only."""
    return {"headless": True}
|
85
|
+
|
86
|
+
|
87
|
+
@dataclass
|
88
|
+
class PlayWrightBot:
|
89
|
+
"""
|
90
|
+
A unified bot that leverages Playwright to render web pages, convert them to Markdown,
|
91
|
+
extract elements, and filter key information using a language model.
|
92
|
+
|
93
|
+
This class exposes both synchronous and asynchronous methods.
|
94
|
+
|
95
|
+
Synchronous usage:
|
96
|
+
with UnifiedPlaywrightBot() as bot:
|
97
|
+
md = bot.url_to_md("https://example.com")
|
98
|
+
headings = bot.select_and_extract("https://example.com", "h2")
|
99
|
+
filtered_md = bot.url_to_md_with_llm("https://example.com")
|
100
|
+
|
101
|
+
Asynchronous usage:
|
102
|
+
async with UnifiedPlaywrightBot() as bot:
|
103
|
+
md = await bot.aurl_to_md("https://example.com")
|
104
|
+
headings = await bot.aselect_and_extract("https://example.com", "h2")
|
105
|
+
filtered_md = await bot.aurl_to_md_with_llm("https://example.com")
|
106
|
+
|
107
|
+
Attributes:
|
108
|
+
headless (bool): Whether to run the browser in headless mode (default True).
|
109
|
+
chatterer (Chatterer): An instance of the language model interface for processing text.
|
110
|
+
"""
|
111
|
+
|
112
|
+
engine: Literal["firefox", "chromium", "webkit"] = "firefox"
|
113
|
+
chatterer: Optional[Chatterer] = field(default_factory=Chatterer.openai)
|
114
|
+
playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
|
115
|
+
playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
|
116
|
+
html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
|
117
|
+
image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
|
118
|
+
headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
|
119
|
+
markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.
|
120
|
+
|
121
|
+
You excel at the following tasks:
|
122
|
+
1. Identifying the main article content of a webpage.
|
123
|
+
2. Filtering out ads, navigation links, and other irrelevant information.
|
124
|
+
3. Selecting the line number ranges that correspond to the article content.
|
125
|
+
4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.
|
126
|
+
|
127
|
+
However, there are a few rules you must follow:
|
128
|
+
1. Do not remove the title of the article, if present.
|
129
|
+
2. Do not remove the author's name or the publication date, if present.
|
130
|
+
3. Include only images that are part of the article.
|
131
|
+
|
132
|
+
Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.
|
133
|
+
|
134
|
+
Markdown-formatted webpage content is provided below for your reference:
|
135
|
+
---
|
136
|
+
""".strip()
|
137
|
+
description_format: str = (
|
138
|
+
"<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
|
139
|
+
)
|
140
|
+
image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION
|
141
|
+
|
142
|
+
sync_playwright: Optional[playwright.sync_api.Playwright] = None
|
143
|
+
sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
|
144
|
+
async_playwright: Optional[playwright.async_api.Playwright] = None
|
145
|
+
async_browser_context: Optional[playwright.async_api.BrowserContext] = None
|
146
|
+
|
147
|
+
def get_sync_playwright(self) -> playwright.sync_api.Playwright:
|
148
|
+
if self.sync_playwright is None:
|
149
|
+
from playwright.sync_api import sync_playwright
|
150
|
+
|
151
|
+
self.sync_playwright = sync_playwright().start()
|
152
|
+
return self.sync_playwright
|
153
|
+
|
154
|
+
async def get_async_playwright(self) -> playwright.async_api.Playwright:
|
155
|
+
if self.async_playwright is None:
|
156
|
+
from playwright.async_api import async_playwright
|
157
|
+
|
158
|
+
self.async_playwright = await async_playwright().start()
|
159
|
+
return self.async_playwright
|
160
|
+
|
161
|
+
def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
|
162
|
+
if self.sync_browser_context is not None:
|
163
|
+
return self.sync_browser_context
|
164
|
+
|
165
|
+
def get_browser() -> playwright.sync_api.BrowserType:
|
166
|
+
playwright = self.get_sync_playwright()
|
167
|
+
if self.engine == "firefox":
|
168
|
+
return playwright.firefox
|
169
|
+
elif self.engine == "chromium":
|
170
|
+
return playwright.chromium
|
171
|
+
elif self.engine == "webkit":
|
172
|
+
return playwright.webkit
|
173
|
+
else:
|
174
|
+
raise ValueError(f"Unsupported engine: {self.engine}")
|
175
|
+
|
176
|
+
user_data_dir = self.playwright_persistency_options.get("user_data_dir")
|
177
|
+
if user_data_dir:
|
178
|
+
# Use persistent context if user_data_dir is provided
|
179
|
+
self.sync_browser_context = get_browser().launch_persistent_context(
|
180
|
+
user_data_dir=user_data_dir, **self.playwright_launch_options
|
181
|
+
)
|
182
|
+
return self.sync_browser_context
|
183
|
+
|
184
|
+
# Otherwise, launch a new context
|
185
|
+
browser = get_browser().launch(**self.playwright_launch_options)
|
186
|
+
storage_state = self.playwright_persistency_options.get("storage_state")
|
187
|
+
if storage_state:
|
188
|
+
self.sync_browser_context = browser.new_context(storage_state=storage_state)
|
189
|
+
else:
|
190
|
+
self.sync_browser_context = browser.new_context()
|
191
|
+
return self.sync_browser_context
|
192
|
+
|
193
|
+
async def get_async_browser(self) -> playwright.async_api.BrowserContext:
|
194
|
+
if self.async_browser_context is not None:
|
195
|
+
return self.async_browser_context
|
196
|
+
|
197
|
+
async def get_browser() -> playwright.async_api.BrowserType:
|
198
|
+
playwright = await self.get_async_playwright()
|
199
|
+
if self.engine == "firefox":
|
200
|
+
return playwright.firefox
|
201
|
+
elif self.engine == "chromium":
|
202
|
+
return playwright.chromium
|
203
|
+
elif self.engine == "webkit":
|
204
|
+
return playwright.webkit
|
205
|
+
else:
|
206
|
+
raise ValueError(f"Unsupported engine: {self.engine}")
|
207
|
+
|
208
|
+
user_data_dir = self.playwright_persistency_options.get("user_data_dir")
|
209
|
+
if user_data_dir:
|
210
|
+
# Use persistent context if user_data_dir is provided
|
211
|
+
self.async_browser_context = await (await get_browser()).launch_persistent_context(
|
212
|
+
user_data_dir=user_data_dir, **self.playwright_launch_options
|
213
|
+
)
|
214
|
+
return self.async_browser_context
|
215
|
+
|
216
|
+
# Otherwise, launch a new context
|
217
|
+
browser = await (await get_browser()).launch(**self.playwright_launch_options)
|
218
|
+
storage_state = self.playwright_persistency_options.get("storage_state")
|
219
|
+
if storage_state:
|
220
|
+
self.async_browser_context = await browser.new_context(storage_state=storage_state)
|
221
|
+
else:
|
222
|
+
self.async_browser_context = await browser.new_context()
|
223
|
+
return self.async_browser_context
|
224
|
+
|
225
|
+
def get_page(
|
226
|
+
self,
|
227
|
+
url: str,
|
228
|
+
timeout: float = 10.0,
|
229
|
+
wait_until: Optional[WaitUntil] = "domcontentloaded",
|
230
|
+
referer: Optional[str] = None,
|
231
|
+
) -> playwright.sync_api.Page:
|
232
|
+
"""
|
233
|
+
Create a new page and navigate to the given URL synchronously.
|
234
|
+
|
235
|
+
Args:
|
236
|
+
url (str): URL to navigate to.
|
237
|
+
timeout (float): Maximum navigation time in seconds.
|
238
|
+
wait_until (str): Load state to wait for (e.g., "domcontentloaded").
|
239
|
+
referer (Optional[str]): Referer URL to set.
|
240
|
+
|
241
|
+
Returns:
|
242
|
+
Page: The Playwright page object.
|
243
|
+
"""
|
244
|
+
page = self.get_sync_browser().new_page()
|
245
|
+
page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
|
246
|
+
return page
|
247
|
+
|
248
|
+
async def aget_page(
|
249
|
+
self,
|
250
|
+
url: str,
|
251
|
+
timeout: float = 8,
|
252
|
+
wait_until: Optional[WaitUntil] = "domcontentloaded",
|
253
|
+
referer: Optional[str] = None,
|
254
|
+
) -> playwright.async_api.Page:
|
255
|
+
"""
|
256
|
+
Create a new page and navigate to the given URL asynchronously.
|
257
|
+
|
258
|
+
Args:
|
259
|
+
url (str): URL to navigate to.
|
260
|
+
timeout (float): Maximum navigation time in seconds.
|
261
|
+
wait_until (str): Load state to wait for.
|
262
|
+
referer (Optional[str]): Referer URL to set.
|
263
|
+
|
264
|
+
Returns:
|
265
|
+
AsyncPage: The Playwright asynchronous page object.
|
266
|
+
"""
|
267
|
+
page = await (await self.get_async_browser()).new_page()
|
268
|
+
await page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
|
269
|
+
return page
|
270
|
+
|
271
|
+
def url_to_md(
|
272
|
+
self,
|
273
|
+
url: str,
|
274
|
+
wait: float = 0.2,
|
275
|
+
scrolldown: bool = False,
|
276
|
+
sleep: int = 0,
|
277
|
+
reload: bool = True,
|
278
|
+
timeout: Union[float, int] = 8,
|
279
|
+
keep_page: bool = False,
|
280
|
+
referer: Optional[str] = None,
|
281
|
+
) -> str:
|
282
|
+
"""
|
283
|
+
Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.
|
284
|
+
|
285
|
+
Args:
|
286
|
+
url (str): URL of the page.
|
287
|
+
wait (float): Time to wait after navigation (in seconds).
|
288
|
+
scrolldown (bool): If True, scroll to the bottom of the page.
|
289
|
+
sleep (int): Time to wait after scrolling (in seconds).
|
290
|
+
reload (bool): If True, reload the page.
|
291
|
+
timeout (float | int): Navigation timeout in seconds.
|
292
|
+
keep_page (bool): If True, do not close the page after processing.
|
293
|
+
referer (Optional[str]): Referer URL to set.
|
294
|
+
|
295
|
+
Returns:
|
296
|
+
str: The page content converted to Markdown.
|
297
|
+
"""
|
298
|
+
page: Optional[playwright.sync_api.Page] = None
|
299
|
+
if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
|
300
|
+
with open(url, "r", encoding="utf-8") as f:
|
301
|
+
html = f.read()
|
302
|
+
else:
|
303
|
+
page = self.get_page(url, timeout=timeout, referer=referer)
|
304
|
+
if wait:
|
305
|
+
page.wait_for_timeout(wait * 1000)
|
306
|
+
if scrolldown:
|
307
|
+
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
308
|
+
if sleep:
|
309
|
+
page.wait_for_timeout(sleep * 1000)
|
310
|
+
if reload:
|
311
|
+
page.reload(timeout=int(timeout * 1000))
|
312
|
+
html = page.content()
|
313
|
+
|
314
|
+
md = html_to_markdown(html=html, options=self.html_to_markdown_options)
|
315
|
+
if not keep_page and page is not None:
|
316
|
+
page.close()
|
317
|
+
return md
|
318
|
+
|
319
|
+
async def aurl_to_md(
|
320
|
+
self,
|
321
|
+
url: str,
|
322
|
+
wait: float = 0.2,
|
323
|
+
scrolldown: bool = False,
|
324
|
+
sleep: int = 0,
|
325
|
+
reload: bool = True,
|
326
|
+
timeout: Union[float, int] = 8,
|
327
|
+
keep_page: bool = False,
|
328
|
+
referer: Optional[str] = None,
|
329
|
+
) -> str:
|
330
|
+
"""
|
331
|
+
Asynchronously navigate to a URL, wait, scroll or reload if specified,
|
332
|
+
and convert the rendered HTML to Markdown.
|
333
|
+
|
334
|
+
Args:
|
335
|
+
url (str): URL of the page.
|
336
|
+
wait (float): Time to wait after navigation (in seconds).
|
337
|
+
scrolldown (bool): If True, scroll the page.
|
338
|
+
sleep (int): Time to wait after scrolling (in seconds).
|
339
|
+
reload (bool): If True, reload the page.
|
340
|
+
timeout (float | int): Navigation timeout (in seconds).
|
341
|
+
keep_page (bool): If True, do not close the page after processing.
|
342
|
+
referer (Optional[str]): Referer URL to set.
|
343
|
+
|
344
|
+
Returns:
|
345
|
+
str: The page content converted to Markdown.
|
346
|
+
"""
|
347
|
+
page: Optional[playwright.async_api.Page] = None
|
348
|
+
if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
|
349
|
+
with open(url, "r", encoding="utf-8") as f:
|
350
|
+
html = f.read()
|
351
|
+
else:
|
352
|
+
page = await self.aget_page(url, timeout=timeout, referer=referer)
|
353
|
+
if wait:
|
354
|
+
await page.wait_for_timeout(wait * 1000)
|
355
|
+
if scrolldown:
|
356
|
+
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
357
|
+
if sleep:
|
358
|
+
await page.wait_for_timeout(sleep * 1000)
|
359
|
+
if reload:
|
360
|
+
await page.reload(timeout=int(timeout * 1000))
|
361
|
+
html = await page.content()
|
362
|
+
md = html_to_markdown(html=html, options=self.html_to_markdown_options)
|
363
|
+
if not keep_page and page is not None:
|
364
|
+
await page.close()
|
365
|
+
return md
|
366
|
+
|
367
|
+
def select_and_extract(
|
368
|
+
self,
|
369
|
+
url: str,
|
370
|
+
css_selector: str,
|
371
|
+
wait: float = 0.2,
|
372
|
+
scrolldown: bool = False,
|
373
|
+
sleep: int = 0,
|
374
|
+
reload: bool = True,
|
375
|
+
timeout: Union[float, int] = 8,
|
376
|
+
keep_page: bool = False,
|
377
|
+
referer: Optional[str] = None,
|
378
|
+
) -> list[str]:
|
379
|
+
"""
|
380
|
+
Navigate to a URL, render the page, and extract text from elements matching the given CSS selector.
|
381
|
+
|
382
|
+
Args:
|
383
|
+
url (str): URL of the page.
|
384
|
+
css_selector (str): CSS selector to locate elements.
|
385
|
+
wait (float): Time to wait after navigation (in seconds).
|
386
|
+
scrolldown (bool): If True, scroll the page.
|
387
|
+
sleep (int): Time to wait after scrolling (in seconds).
|
388
|
+
reload (bool): If True, reload the page.
|
389
|
+
timeout (float | int): Maximum navigation time (in seconds).
|
390
|
+
keep_page (bool): If True, do not close the page after processing.
|
391
|
+
referer (Optional[str]): Referer URL to set.
|
392
|
+
|
393
|
+
Returns:
|
394
|
+
List[str]: A list of text contents from the matching elements.
|
395
|
+
"""
|
396
|
+
page = self.get_page(url, timeout=timeout, referer=referer)
|
397
|
+
if wait:
|
398
|
+
page.wait_for_timeout(wait * 1000)
|
399
|
+
if scrolldown:
|
400
|
+
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
401
|
+
if sleep:
|
402
|
+
page.wait_for_timeout(sleep * 1000)
|
403
|
+
if reload:
|
404
|
+
page.reload(timeout=int(timeout * 1000))
|
405
|
+
elements = page.query_selector_all(css_selector)
|
406
|
+
texts = [element.inner_text() for element in elements]
|
407
|
+
if not keep_page:
|
408
|
+
page.close()
|
409
|
+
return texts
|
410
|
+
|
411
|
+
async def aselect_and_extract(
|
412
|
+
self,
|
413
|
+
url: str,
|
414
|
+
css_selector: str,
|
415
|
+
wait: float = 0.2,
|
416
|
+
scrolldown: bool = False,
|
417
|
+
sleep: int = 0,
|
418
|
+
reload: bool = True,
|
419
|
+
timeout: Union[float, int] = 8,
|
420
|
+
keep_page: bool = False,
|
421
|
+
referer: Optional[str] = None,
|
422
|
+
) -> list[str]:
|
423
|
+
"""
|
424
|
+
Asynchronously navigate to a URL, render the page, and extract text from elements matching the CSS selector.
|
425
|
+
|
426
|
+
Args:
|
427
|
+
url (str): URL of the page.
|
428
|
+
css_selector (str): CSS selector to locate elements.
|
429
|
+
wait (float): Time to wait after navigation (in seconds).
|
430
|
+
scrolldown (bool): If True, scroll the page.
|
431
|
+
sleep (int): Time to wait after scrolling (in seconds).
|
432
|
+
reload (bool): If True, reload the page.
|
433
|
+
timeout (float | int): Navigation timeout (in seconds).
|
434
|
+
keep_page (bool): If True, do not close the page after processing.
|
435
|
+
referer (Optional[str]): Referer URL to set.
|
436
|
+
|
437
|
+
Returns:
|
438
|
+
List[str]: A list of text contents from the matching elements.
|
439
|
+
"""
|
440
|
+
page = await self.aget_page(url, timeout=timeout, referer=referer)
|
441
|
+
if wait:
|
442
|
+
await page.wait_for_timeout(wait * 1000)
|
443
|
+
if scrolldown:
|
444
|
+
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
445
|
+
if sleep:
|
446
|
+
await page.wait_for_timeout(sleep * 1000)
|
447
|
+
if reload:
|
448
|
+
await page.reload(timeout=int(timeout * 1000))
|
449
|
+
elements = await page.query_selector_all(css_selector)
|
450
|
+
texts: list[str] = []
|
451
|
+
for element in elements:
|
452
|
+
text = await element.inner_text()
|
453
|
+
texts.append(text)
|
454
|
+
if not keep_page:
|
455
|
+
await page.close()
|
456
|
+
return texts
|
457
|
+
|
458
|
+
def url_to_md_with_llm(
|
459
|
+
self,
|
460
|
+
url: str,
|
461
|
+
chunk_size: Optional[int] = None,
|
462
|
+
wait: float = 0.2,
|
463
|
+
scrolldown: bool = False,
|
464
|
+
sleep: int = 0,
|
465
|
+
reload: bool = True,
|
466
|
+
timeout: Union[float, int] = 8,
|
467
|
+
keep_page: bool = False,
|
468
|
+
referer: Optional[str] = None,
|
469
|
+
describe_images: bool = True,
|
470
|
+
filter: bool = True,
|
471
|
+
) -> str:
|
472
|
+
"""
|
473
|
+
Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.
|
474
|
+
|
475
|
+
The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
|
476
|
+
to select the important line ranges. It then reconstructs the filtered Markdown.
|
477
|
+
|
478
|
+
Args:
|
479
|
+
url (str): URL of the page.
|
480
|
+
chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
|
481
|
+
wait (float): Time to wait after navigation (in seconds).
|
482
|
+
scrolldown (bool): If True, scroll down the page.
|
483
|
+
sleep (int): Time to wait after scrolling (in seconds).
|
484
|
+
reload (bool): If True, reload the page.
|
485
|
+
timeout (float | int): Navigation timeout (in seconds).
|
486
|
+
keep_page (bool): If True, do not close the page after processing.
|
487
|
+
referer (Optional[str]): Referer URL to set.
|
488
|
+
describe_images (bool): If True, describe images in the Markdown text.
|
489
|
+
filter (bool): If True, filter the important lines using the language model.
|
490
|
+
|
491
|
+
Returns:
|
492
|
+
str: Filtered Markdown containing only the important lines.
|
493
|
+
"""
|
494
|
+
if self.chatterer is None:
|
495
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
496
|
+
markdown_content = self.url_to_md(
|
497
|
+
url,
|
498
|
+
wait=wait,
|
499
|
+
scrolldown=scrolldown,
|
500
|
+
sleep=sleep,
|
501
|
+
reload=reload,
|
502
|
+
timeout=timeout,
|
503
|
+
keep_page=keep_page,
|
504
|
+
referer=referer,
|
505
|
+
)
|
506
|
+
if describe_images:
|
507
|
+
markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
|
508
|
+
if not filter:
|
509
|
+
return markdown_content
|
510
|
+
lines = markdown_content.split("\n")
|
511
|
+
line_length = len(lines)
|
512
|
+
important_lines: set[int] = set()
|
513
|
+
|
514
|
+
def _into_safe_range(value: int) -> int:
|
515
|
+
"""Ensure the line index stays within bounds."""
|
516
|
+
return min(max(value, 0), line_length - 1)
|
517
|
+
|
518
|
+
if chunk_size is None:
|
519
|
+
chunk_size = line_length
|
520
|
+
|
521
|
+
# Process the markdown in chunks.
|
522
|
+
for i in range(0, len(lines), chunk_size):
|
523
|
+
chunk_lines = lines[i : i + chunk_size]
|
524
|
+
# Prepend line numbers to each line.
|
525
|
+
numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
|
526
|
+
# Use the language model synchronously to get the line ranges.
|
527
|
+
result: SelectedLineRanges = self.chatterer.generate_pydantic(
|
528
|
+
response_model=SelectedLineRanges,
|
529
|
+
messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
|
530
|
+
)
|
531
|
+
for range_str in result.line_ranges:
|
532
|
+
if "-" in range_str:
|
533
|
+
start, end = map(int, range_str.split("-"))
|
534
|
+
important_lines.update(range(_into_safe_range(start + i - 1), _into_safe_range(end + i)))
|
535
|
+
else:
|
536
|
+
important_lines.add(_into_safe_range(int(range_str) + i - 1))
|
537
|
+
# Reconstruct the filtered markdown.
|
538
|
+
return "\n".join(lines[line_no] for line_no in sorted(important_lines))
|
539
|
+
|
540
|
+
async def aurl_to_md_with_llm(
|
541
|
+
self,
|
542
|
+
url: str,
|
543
|
+
chunk_size: Optional[int] = None,
|
544
|
+
wait: float = 0.2,
|
545
|
+
scrolldown: bool = False,
|
546
|
+
sleep: int = 0,
|
547
|
+
reload: bool = True,
|
548
|
+
timeout: Union[float, int] = 8,
|
549
|
+
keep_page: bool = False,
|
550
|
+
referer: Optional[str] = None,
|
551
|
+
describe_images: bool = True,
|
552
|
+
filter: bool = True,
|
553
|
+
) -> str:
|
554
|
+
"""
|
555
|
+
Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
|
556
|
+
to filter out unimportant lines.
|
557
|
+
|
558
|
+
The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
|
559
|
+
to select the important line ranges. It then reconstructs the filtered Markdown.
|
560
|
+
|
561
|
+
Args:
|
562
|
+
url (str): URL of the page.
|
563
|
+
chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
|
564
|
+
wait (float): Time to wait after navigation (in seconds).
|
565
|
+
scrolldown (bool): If True, scroll the page.
|
566
|
+
sleep (int): Time to wait after scrolling (in seconds).
|
567
|
+
reload (bool): If True, reload the page.
|
568
|
+
timeout (float | int): Navigation timeout (in seconds).
|
569
|
+
keep_page (bool): If True, do not close the page after processing.
|
570
|
+
referer (Optional[str]): Referer URL to set.
|
571
|
+
describe_images (bool): If True, describe images in the Markdown text.
|
572
|
+
filter (bool): If True, filter the important lines using the language model.
|
573
|
+
|
574
|
+
Returns:
|
575
|
+
str: Filtered Markdown containing only the important lines.
|
576
|
+
"""
|
577
|
+
if self.chatterer is None:
|
578
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
579
|
+
markdown_content = await self.aurl_to_md(
|
580
|
+
url,
|
581
|
+
wait=wait,
|
582
|
+
scrolldown=scrolldown,
|
583
|
+
sleep=sleep,
|
584
|
+
reload=reload,
|
585
|
+
timeout=timeout,
|
586
|
+
keep_page=keep_page,
|
587
|
+
referer=referer,
|
588
|
+
)
|
589
|
+
if describe_images:
|
590
|
+
markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
|
591
|
+
if not filter:
|
592
|
+
return markdown_content
|
593
|
+
lines = markdown_content.split("\n")
|
594
|
+
line_length = len(lines)
|
595
|
+
important_lines: set[int] = set()
|
596
|
+
|
597
|
+
def _into_safe_range(value: int) -> int:
|
598
|
+
"""Ensure the line index is within valid bounds."""
|
599
|
+
return min(max(value, 0), line_length - 1)
|
600
|
+
|
601
|
+
if chunk_size is None:
|
602
|
+
chunk_size = line_length
|
603
|
+
|
604
|
+
for i in range(0, len(lines), chunk_size):
|
605
|
+
chunk_lines = lines[i : i + chunk_size]
|
606
|
+
numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
|
607
|
+
# Use the asynchronous language model method.
|
608
|
+
result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
|
609
|
+
response_model=SelectedLineRanges,
|
610
|
+
messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
|
611
|
+
)
|
612
|
+
for range_str in result.line_ranges:
|
613
|
+
if "-" in range_str:
|
614
|
+
start, end = map(int, range_str.split("-"))
|
615
|
+
important_lines.update(range(_into_safe_range(start + i - 1), _into_safe_range(end + i)))
|
616
|
+
else:
|
617
|
+
important_lines.add(_into_safe_range(int(range_str) + i - 1))
|
618
|
+
return "\n".join(lines[line_no] for line_no in sorted(important_lines))
|
619
|
+
|
620
|
+
def describe_images(self, markdown_text: str, referer_url: str) -> str:
|
621
|
+
"""
|
622
|
+
Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
|
623
|
+
Using Playwright for fetching images to bypass CDN protections.
|
624
|
+
"""
|
625
|
+
if self.chatterer is None:
|
626
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
627
|
+
return caption_markdown_images(
|
628
|
+
markdown_text=markdown_text,
|
629
|
+
headers=self.headers | {"Referer": referer_url},
|
630
|
+
description_format=self.description_format,
|
631
|
+
image_description_instruction=self.image_description_instruction,
|
632
|
+
chatterer=self.chatterer,
|
633
|
+
image_processing_config=self.image_processing_config,
|
634
|
+
img_bytes_fetcher=self._playwright_fetch_image_bytes,
|
635
|
+
)
|
636
|
+
|
637
|
+
    # Async counterpart of describe_images, wired to the async Playwright fetcher.
|
638
|
+
async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
|
639
|
+
"""
|
640
|
+
Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
|
641
|
+
Using Playwright for fetching images to bypass CDN protections.
|
642
|
+
"""
|
643
|
+
if self.chatterer is None:
|
644
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
645
|
+
return await acaption_markdown_images(
|
646
|
+
markdown_text=markdown_text,
|
647
|
+
headers=self.headers | {"Referer": referer_url},
|
648
|
+
description_format=self.description_format,
|
649
|
+
image_description_instruction=self.image_description_instruction,
|
650
|
+
chatterer=self.chatterer,
|
651
|
+
image_processing_config=self.image_processing_config,
|
652
|
+
img_bytes_fetcher=self._aplaywright_fetch_image_bytes,
|
653
|
+
)
|
654
|
+
|
655
|
+
def _playwright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
|
656
|
+
"""Playwright를 사용하여 동기적으로 이미지 바이트를 가져옵니다."""
|
657
|
+
page: Optional[playwright.sync_api.Page] = None
|
658
|
+
try:
|
659
|
+
# Get the existing synchronous browser context.
|
660
|
+
page = self.get_sync_browser().new_page()
|
661
|
+
|
662
|
+
# Set the provided headers as extra HTTP headers for the page.
|
663
|
+
# This will apply to all subsequent requests made by the page.
|
664
|
+
page.set_extra_http_headers(headers)
|
665
|
+
response = page.goto(image_url, wait_until="load", timeout=15000)
|
666
|
+
if response and response.ok:
|
667
|
+
return response.body()
|
668
|
+
else:
|
669
|
+
return b""
|
670
|
+
except Exception as e:
|
671
|
+
print(f"Playwright exception fetching image: {image_url}, Error: {e}")
|
672
|
+
return b""
|
673
|
+
finally:
|
674
|
+
if page:
|
675
|
+
page.close()
|
676
|
+
|
677
|
+
async def _aplaywright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
|
678
|
+
"""Playwright를 사용하여 비동기적으로 이미지 바이트를 가져옵니다."""
|
679
|
+
page: Optional[playwright.async_api.Page] = None
|
680
|
+
try:
|
681
|
+
# Get the existing asynchronous browser context.
|
682
|
+
page = await (await self.get_async_browser()).new_page()
|
683
|
+
|
684
|
+
# Set the provided headers as extra HTTP headers for the page.
|
685
|
+
# This will apply to all subsequent requests made by the page.
|
686
|
+
await page.set_extra_http_headers(headers)
|
687
|
+
response = await page.goto(image_url, wait_until="load", timeout=15000)
|
688
|
+
if response and response.ok:
|
689
|
+
return await response.body()
|
690
|
+
else:
|
691
|
+
# 실패 시 로그를 남기거나 None을 반환할 수 있습니다.
|
692
|
+
print(
|
693
|
+
f"Playwright failed to fetch image: {image_url}, Status: {response.status if response else 'No Response'}"
|
694
|
+
)
|
695
|
+
return b""
|
696
|
+
except Exception as e:
|
697
|
+
# 예외 발생 시 로그를 남깁니다.
|
698
|
+
print(f"Playwright exception fetching image: {image_url}, Error: {e}")
|
699
|
+
return b""
|
700
|
+
finally:
|
701
|
+
# 페이지를 항상 닫아 리소스를 정리합니다.
|
702
|
+
if page:
|
703
|
+
await page.close()
|
704
|
+
|
705
|
+
def __enter__(self) -> Self:
|
706
|
+
return self
|
707
|
+
|
708
|
+
async def __aenter__(self) -> Self:
|
709
|
+
return self
|
710
|
+
|
711
|
+
def __exit__(
|
712
|
+
self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
|
713
|
+
) -> None:
|
714
|
+
"""
|
715
|
+
Exit the synchronous context.
|
716
|
+
|
717
|
+
Closes the browser and stops Playwright.
|
718
|
+
"""
|
719
|
+
if self.sync_browser_context is not None:
|
720
|
+
self.sync_browser_context.close()
|
721
|
+
self.sync_browser_context = None
|
722
|
+
if self.sync_playwright:
|
723
|
+
self.sync_playwright.stop()
|
724
|
+
self.sync_playwright = None
|
725
|
+
|
726
|
+
async def __aexit__(
|
727
|
+
self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
|
728
|
+
) -> None:
|
729
|
+
"""
|
730
|
+
Asynchronously exit the context.
|
731
|
+
|
732
|
+
Closes the asynchronous browser and stops Playwright.
|
733
|
+
"""
|
734
|
+
if self.async_browser_context is not None:
|
735
|
+
await self.async_browser_context.close()
|
736
|
+
self.async_browser_context = None
|
737
|
+
if self.async_playwright:
|
738
|
+
await self.async_playwright.stop()
|
739
|
+
self.async_playwright = None
|