chatterer 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +93 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__init__.py +0 -0
  5. chatterer/examples/anything_to_markdown.py +85 -91
  6. chatterer/examples/get_code_snippets.py +55 -62
  7. chatterer/examples/login_with_playwright.py +156 -167
  8. chatterer/examples/make_ppt.py +488 -497
  9. chatterer/examples/pdf_to_markdown.py +100 -107
  10. chatterer/examples/pdf_to_text.py +54 -56
  11. chatterer/examples/transcription_api.py +112 -123
  12. chatterer/examples/upstage_parser.py +89 -100
  13. chatterer/examples/webpage_to_markdown.py +70 -79
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +533 -533
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +393 -302
  30. chatterer/tools/convert_to_text.py +446 -447
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +285 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/METADATA +392 -392
  40. chatterer-0.1.20.dist-info/RECORD +44 -0
  41. {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/WHEEL +1 -1
  42. chatterer-0.1.20.dist-info/entry_points.txt +10 -0
  43. chatterer-0.1.18.dist-info/RECORD +0 -42
  44. {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/top_level.txt +0 -0
@@ -1,739 +1,739 @@
1
+ """
2
+ PlaywrightBot
3
+
4
+ This module provides a single class that uses Playwright to:
5
+ - Fetch and render HTML pages (with JavaScript execution),
6
+ - Optionally scroll down or reload pages,
7
+ - Convert rendered HTML into Markdown,
8
+ - Extract specific elements using CSS selectors,
9
+ - Filter key information from a page via integration with a language model (Chatterer).
10
+
11
+ Both synchronous and asynchronous methods are available in this unified class.
12
+ Use the synchronous methods (without the "a" prefix) in a normal context manager,
13
+ or use the asynchronous methods (prefixed with "a") within an async context manager.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from types import TracebackType
21
+ from typing import (
22
+ TYPE_CHECKING,
23
+ Literal,
24
+ NotRequired,
25
+ Optional,
26
+ Self,
27
+ Sequence,
28
+ Type,
29
+ TypeAlias,
30
+ TypedDict,
31
+ Union,
32
+ )
33
+
34
+ from pydantic import BaseModel, Field
35
+
36
+ from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
37
+ from ..utils.base64_image import ImageProcessingConfig, get_default_image_processing_config, is_remote_url
38
+ from .caption_markdown_images import acaption_markdown_images, caption_markdown_images
39
+ from .convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
40
+
41
+ if TYPE_CHECKING:
42
+ import playwright.async_api
43
+ import playwright.sync_api
44
+
45
+ WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
46
+ DEFAULT_UA: str = (
47
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
48
+ )
49
+
50
+
51
+ class SelectedLineRanges(BaseModel):
52
+ line_ranges: list[str] = Field(description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']")
53
+
54
+
55
+ class PlaywrightLaunchOptions(TypedDict):
56
+ executable_path: NotRequired[str | Path]
57
+ channel: NotRequired[str]
58
+ args: NotRequired[Sequence[str]]
59
+ ignore_default_args: NotRequired[bool | Sequence[str]]
60
+ handle_sigint: NotRequired[bool]
61
+ handle_sigterm: NotRequired[bool]
62
+ handle_sighup: NotRequired[bool]
63
+ timeout: NotRequired[float]
64
+ env: NotRequired[dict[str, str | float | bool]]
65
+ headless: NotRequired[bool]
66
+ devtools: NotRequired[bool]
67
+ proxy: NotRequired[playwright.sync_api.ProxySettings]
68
+ downloads_path: NotRequired[str | Path]
69
+ slow_mo: NotRequired[float]
70
+ traces_dir: NotRequired[str | Path]
71
+ chromium_sandbox: NotRequired[bool]
72
+ firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
73
+
74
+
75
+ class PlaywrightPersistencyOptions(TypedDict):
76
+ user_data_dir: NotRequired[str | Path]
77
+ storage_state: NotRequired[playwright.sync_api.StorageState]
78
+
79
+
80
+ class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions): ...
81
+
82
+
83
+ def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
84
+ return {"headless": True}
85
+
86
+
87
+ @dataclass
88
+ class PlayWrightBot:
89
+ """
90
+ A unified bot that leverages Playwright to render web pages, convert them to Markdown,
91
+ extract elements, and filter key information using a language model.
92
+
93
+ This class exposes both synchronous and asynchronous methods.
94
+
95
+ Synchronous usage:
96
+ with UnifiedPlaywrightBot() as bot:
97
+ md = bot.url_to_md("https://example.com")
98
+ headings = bot.select_and_extract("https://example.com", "h2")
99
+ filtered_md = bot.url_to_md_with_llm("https://example.com")
100
+
101
+ Asynchronous usage:
102
+ async with PlayWrightBot() as bot:
103
+ md = await bot.aurl_to_md("https://example.com")
104
+ headings = await bot.aselect_and_extract("https://example.com", "h2")
105
+ filtered_md = await bot.aurl_to_md_with_llm("https://example.com")
106
+
107
+ Attributes:
108
+ engine (str): Browser engine to launch: "firefox" (default), "chromium", or "webkit".
+ playwright_launch_options (PlaywrightLaunchOptions): Launch options passed to Playwright (headless by default).
109
+ chatterer (Optional[Chatterer]): Language-model interface used for filtering and image captioning; defaults to Chatterer.openai().
110
+ """
111
+
112
+ engine: Literal["firefox", "chromium", "webkit"] = "firefox"
113
+ chatterer: Optional[Chatterer] = field(default_factory=Chatterer.openai)
114
+ playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
115
+ playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
116
+ html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
117
+ image_processing_config: ImageProcessingConfig = field(default_factory=get_default_image_processing_config)
118
+ headers: dict[str, str] = field(default_factory=lambda: {"User-Agent": DEFAULT_UA})
119
+ markdown_filtering_instruction: str = """You are a web parser bot, an AI agent that filters out redundant fields from a webpage.
120
+
121
+ You excel at the following tasks:
122
+ 1. Identifying the main article content of a webpage.
123
+ 2. Filtering out ads, navigation links, and other irrelevant information.
124
+ 3. Selecting the line number ranges that correspond to the article content.
125
+ 4. Providing these inclusive ranges in the format 'start-end' or 'single_line_number'.
126
+
127
+ However, there are a few rules you must follow:
128
+ 1. Do not remove the title of the article, if present.
129
+ 2. Do not remove the author's name or the publication date, if present.
130
+ 3. Include only images that are part of the article.
131
+
132
+ Now, return a valid JSON object, for example: {'line_ranges': ['1-3', '5-5', '7-10']}.
133
+
134
+ Markdown-formatted webpage content is provided below for your reference:
135
+ ---
136
+ """.strip()
137
+ description_format: str = (
138
+ "<details><summary>{image_summary}</summary><img src='{url}' alt='{inline_text}'></details>"
139
+ )
140
+ image_description_instruction: str = DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION
141
+
142
+ sync_playwright: Optional[playwright.sync_api.Playwright] = None
143
+ sync_browser_context: Optional[playwright.sync_api.BrowserContext] = None
144
+ async_playwright: Optional[playwright.async_api.Playwright] = None
145
+ async_browser_context: Optional[playwright.async_api.BrowserContext] = None
146
+
147
+ def get_sync_playwright(self) -> playwright.sync_api.Playwright:
148
+ if self.sync_playwright is None:
149
+ from playwright.sync_api import sync_playwright
150
+
151
+ self.sync_playwright = sync_playwright().start()
152
+ return self.sync_playwright
153
+
154
+ async def get_async_playwright(self) -> playwright.async_api.Playwright:
155
+ if self.async_playwright is None:
156
+ from playwright.async_api import async_playwright
157
+
158
+ self.async_playwright = await async_playwright().start()
159
+ return self.async_playwright
160
+
161
+ def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
162
+ if self.sync_browser_context is not None:
163
+ return self.sync_browser_context
164
+
165
+ def get_browser() -> playwright.sync_api.BrowserType:
166
+ playwright = self.get_sync_playwright()
167
+ if self.engine == "firefox":
168
+ return playwright.firefox
169
+ elif self.engine == "chromium":
170
+ return playwright.chromium
171
+ elif self.engine == "webkit":
172
+ return playwright.webkit
173
+ else:
174
+ raise ValueError(f"Unsupported engine: {self.engine}")
175
+
176
+ user_data_dir = self.playwright_persistency_options.get("user_data_dir")
177
+ if user_data_dir:
178
+ # Use persistent context if user_data_dir is provided
179
+ self.sync_browser_context = get_browser().launch_persistent_context(
180
+ user_data_dir=user_data_dir, **self.playwright_launch_options
181
+ )
182
+ return self.sync_browser_context
183
+
184
+ # Otherwise, launch a new context
185
+ browser = get_browser().launch(**self.playwright_launch_options)
186
+ storage_state = self.playwright_persistency_options.get("storage_state")
187
+ if storage_state:
188
+ self.sync_browser_context = browser.new_context(storage_state=storage_state)
189
+ else:
190
+ self.sync_browser_context = browser.new_context()
191
+ return self.sync_browser_context
192
+
193
+ async def get_async_browser(self) -> playwright.async_api.BrowserContext:
194
+ if self.async_browser_context is not None:
195
+ return self.async_browser_context
196
+
197
+ async def get_browser() -> playwright.async_api.BrowserType:
198
+ playwright = await self.get_async_playwright()
199
+ if self.engine == "firefox":
200
+ return playwright.firefox
201
+ elif self.engine == "chromium":
202
+ return playwright.chromium
203
+ elif self.engine == "webkit":
204
+ return playwright.webkit
205
+ else:
206
+ raise ValueError(f"Unsupported engine: {self.engine}")
207
+
208
+ user_data_dir = self.playwright_persistency_options.get("user_data_dir")
209
+ if user_data_dir:
210
+ # Use persistent context if user_data_dir is provided
211
+ self.async_browser_context = await (await get_browser()).launch_persistent_context(
212
+ user_data_dir=user_data_dir, **self.playwright_launch_options
213
+ )
214
+ return self.async_browser_context
215
+
216
+ # Otherwise, launch a new context
217
+ browser = await (await get_browser()).launch(**self.playwright_launch_options)
218
+ storage_state = self.playwright_persistency_options.get("storage_state")
219
+ if storage_state:
220
+ self.async_browser_context = await browser.new_context(storage_state=storage_state)
221
+ else:
222
+ self.async_browser_context = await browser.new_context()
223
+ return self.async_browser_context
224
+
225
+ def get_page(
226
+ self,
227
+ url: str,
228
+ timeout: float = 10.0,
229
+ wait_until: Optional[WaitUntil] = "domcontentloaded",
230
+ referer: Optional[str] = None,
231
+ ) -> playwright.sync_api.Page:
232
+ """
233
+ Create a new page and navigate to the given URL synchronously.
234
+
235
+ Args:
236
+ url (str): URL to navigate to.
237
+ timeout (float): Maximum navigation time in seconds.
238
+ wait_until (str): Load state to wait for (e.g., "domcontentloaded").
239
+ referer (Optional[str]): Referer URL to set.
240
+
241
+ Returns:
242
+ Page: The Playwright page object.
243
+ """
244
+ page = self.get_sync_browser().new_page()
245
+ page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
246
+ return page
247
+
248
+ async def aget_page(
249
+ self,
250
+ url: str,
251
+ timeout: float = 8,
252
+ wait_until: Optional[WaitUntil] = "domcontentloaded",
253
+ referer: Optional[str] = None,
254
+ ) -> playwright.async_api.Page:
255
+ """
256
+ Create a new page and navigate to the given URL asynchronously.
257
+
258
+ Args:
259
+ url (str): URL to navigate to.
260
+ timeout (float): Maximum navigation time in seconds.
261
+ wait_until (str): Load state to wait for.
262
+ referer (Optional[str]): Referer URL to set.
263
+
264
+ Returns:
265
+ AsyncPage: The Playwright asynchronous page object.
266
+ """
267
+ page = await (await self.get_async_browser()).new_page()
268
+ await page.goto(url, timeout=int(timeout * 1000), wait_until=wait_until, referer=referer)
269
+ return page
270
+
271
+ def url_to_md(
272
+ self,
273
+ url: str,
274
+ wait: float = 0.2,
275
+ scrolldown: bool = False,
276
+ sleep: int = 0,
277
+ reload: bool = True,
278
+ timeout: Union[float, int] = 8,
279
+ keep_page: bool = False,
280
+ referer: Optional[str] = None,
281
+ ) -> str:
282
+ """
283
+ Navigate to a URL, optionally wait, scroll, or reload the page, and convert the rendered HTML to Markdown.
284
+
285
+ Args:
286
+ url (str): URL of the page.
287
+ wait (float): Time to wait after navigation (in seconds).
288
+ scrolldown (bool): If True, scroll to the bottom of the page.
289
+ sleep (int): Time to wait after scrolling (in seconds).
290
+ reload (bool): If True, reload the page.
291
+ timeout (float | int): Navigation timeout in seconds.
292
+ keep_page (bool): If True, do not close the page after processing.
293
+ referer (Optional[str]): Referer URL to set.
294
+
295
+ Returns:
296
+ str: The page content converted to Markdown.
297
+ """
298
+ page: Optional[playwright.sync_api.Page] = None
299
+ if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
300
+ with open(url, "r", encoding="utf-8") as f:
301
+ html = f.read()
302
+ else:
303
+ page = self.get_page(url, timeout=timeout, referer=referer)
304
+ if wait:
305
+ page.wait_for_timeout(wait * 1000)
306
+ if scrolldown:
307
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
308
+ if sleep:
309
+ page.wait_for_timeout(sleep * 1000)
310
+ if reload:
311
+ page.reload(timeout=int(timeout * 1000))
312
+ html = page.content()
313
+
314
+ md = html_to_markdown(html=html, options=self.html_to_markdown_options)
315
+ if not keep_page and page is not None:
316
+ page.close()
317
+ return md
318
+
319
+ async def aurl_to_md(
320
+ self,
321
+ url: str,
322
+ wait: float = 0.2,
323
+ scrolldown: bool = False,
324
+ sleep: int = 0,
325
+ reload: bool = True,
326
+ timeout: Union[float, int] = 8,
327
+ keep_page: bool = False,
328
+ referer: Optional[str] = None,
329
+ ) -> str:
330
+ """
331
+ Asynchronously navigate to a URL, wait, scroll or reload if specified,
332
+ and convert the rendered HTML to Markdown.
333
+
334
+ Args:
335
+ url (str): URL of the page.
336
+ wait (float): Time to wait after navigation (in seconds).
337
+ scrolldown (bool): If True, scroll the page.
338
+ sleep (int): Time to wait after scrolling (in seconds).
339
+ reload (bool): If True, reload the page.
340
+ timeout (float | int): Navigation timeout (in seconds).
341
+ keep_page (bool): If True, do not close the page after processing.
342
+ referer (Optional[str]): Referer URL to set.
343
+
344
+ Returns:
345
+ str: The page content converted to Markdown.
346
+ """
347
+ page: Optional[playwright.async_api.Page] = None
348
+ if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
349
+ with open(url, "r", encoding="utf-8") as f:
350
+ html = f.read()
351
+ else:
352
+ page = await self.aget_page(url, timeout=timeout, referer=referer)
353
+ if wait:
354
+ await page.wait_for_timeout(wait * 1000)
355
+ if scrolldown:
356
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
357
+ if sleep:
358
+ await page.wait_for_timeout(sleep * 1000)
359
+ if reload:
360
+ await page.reload(timeout=int(timeout * 1000))
361
+ html = await page.content()
362
+ md = html_to_markdown(html=html, options=self.html_to_markdown_options)
363
+ if not keep_page and page is not None:
364
+ await page.close()
365
+ return md
366
+
367
+ def select_and_extract(
368
+ self,
369
+ url: str,
370
+ css_selector: str,
371
+ wait: float = 0.2,
372
+ scrolldown: bool = False,
373
+ sleep: int = 0,
374
+ reload: bool = True,
375
+ timeout: Union[float, int] = 8,
376
+ keep_page: bool = False,
377
+ referer: Optional[str] = None,
378
+ ) -> list[str]:
379
+ """
380
+ Navigate to a URL, render the page, and extract text from elements matching the given CSS selector.
381
+
382
+ Args:
383
+ url (str): URL of the page.
384
+ css_selector (str): CSS selector to locate elements.
385
+ wait (float): Time to wait after navigation (in seconds).
386
+ scrolldown (bool): If True, scroll the page.
387
+ sleep (int): Time to wait after scrolling (in seconds).
388
+ reload (bool): If True, reload the page.
389
+ timeout (float | int): Maximum navigation time (in seconds).
390
+ keep_page (bool): If True, do not close the page after processing.
391
+ referer (Optional[str]): Referer URL to set.
392
+
393
+ Returns:
394
+ List[str]: A list of text contents from the matching elements.
395
+ """
396
+ page = self.get_page(url, timeout=timeout, referer=referer)
397
+ if wait:
398
+ page.wait_for_timeout(wait * 1000)
399
+ if scrolldown:
400
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
401
+ if sleep:
402
+ page.wait_for_timeout(sleep * 1000)
403
+ if reload:
404
+ page.reload(timeout=int(timeout * 1000))
405
+ elements = page.query_selector_all(css_selector)
406
+ texts = [element.inner_text() for element in elements]
407
+ if not keep_page:
408
+ page.close()
409
+ return texts
410
+
411
+ async def aselect_and_extract(
412
+ self,
413
+ url: str,
414
+ css_selector: str,
415
+ wait: float = 0.2,
416
+ scrolldown: bool = False,
417
+ sleep: int = 0,
418
+ reload: bool = True,
419
+ timeout: Union[float, int] = 8,
420
+ keep_page: bool = False,
421
+ referer: Optional[str] = None,
422
+ ) -> list[str]:
423
+ """
424
+ Asynchronously navigate to a URL, render the page, and extract text from elements matching the CSS selector.
425
+
426
+ Args:
427
+ url (str): URL of the page.
428
+ css_selector (str): CSS selector to locate elements.
429
+ wait (float): Time to wait after navigation (in seconds).
430
+ scrolldown (bool): If True, scroll the page.
431
+ sleep (int): Time to wait after scrolling (in seconds).
432
+ reload (bool): If True, reload the page.
433
+ timeout (float | int): Navigation timeout (in seconds).
434
+ keep_page (bool): If True, do not close the page after processing.
435
+ referer (Optional[str]): Referer URL to set.
436
+
437
+ Returns:
438
+ List[str]: A list of text contents from the matching elements.
439
+ """
440
+ page = await self.aget_page(url, timeout=timeout, referer=referer)
441
+ if wait:
442
+ await page.wait_for_timeout(wait * 1000)
443
+ if scrolldown:
444
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
445
+ if sleep:
446
+ await page.wait_for_timeout(sleep * 1000)
447
+ if reload:
448
+ await page.reload(timeout=int(timeout * 1000))
449
+ elements = await page.query_selector_all(css_selector)
450
+ texts: list[str] = []
451
+ for element in elements:
452
+ text = await element.inner_text()
453
+ texts.append(text)
454
+ if not keep_page:
455
+ await page.close()
456
+ return texts
457
+
458
+ def url_to_md_with_llm(
459
+ self,
460
+ url: str,
461
+ chunk_size: Optional[int] = None,
462
+ wait: float = 0.2,
463
+ scrolldown: bool = False,
464
+ sleep: int = 0,
465
+ reload: bool = True,
466
+ timeout: Union[float, int] = 8,
467
+ keep_page: bool = False,
468
+ referer: Optional[str] = None,
469
+ describe_images: bool = True,
470
+ filter: bool = True,
471
+ ) -> str:
472
+ """
473
+ Convert a URL's page to Markdown and use a language model (Chatterer) to filter out unimportant lines.
474
+
475
+ The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
476
+ to select the important line ranges. It then reconstructs the filtered Markdown.
477
+
478
+ Args:
479
+ url (str): URL of the page.
480
+ chunk_size (Optional[int]): Number of lines per chunk. Defaults to the full content.
481
+ wait (float): Time to wait after navigation (in seconds).
482
+ scrolldown (bool): If True, scroll down the page.
483
+ sleep (int): Time to wait after scrolling (in seconds).
484
+ reload (bool): If True, reload the page.
485
+ timeout (float | int): Navigation timeout (in seconds).
486
+ keep_page (bool): If True, do not close the page after processing.
487
+ referer (Optional[str]): Referer URL to set.
488
+ describe_images (bool): If True, describe images in the Markdown text.
489
+ filter (bool): If True, filter the important lines using the language model.
490
+
491
+ Returns:
492
+ str: Filtered Markdown containing only the important lines.
493
+ """
494
+ if self.chatterer is None:
495
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
496
+ markdown_content = self.url_to_md(
497
+ url,
498
+ wait=wait,
499
+ scrolldown=scrolldown,
500
+ sleep=sleep,
501
+ reload=reload,
502
+ timeout=timeout,
503
+ keep_page=keep_page,
504
+ referer=referer,
505
+ )
506
+ if describe_images:
507
+ markdown_content = self.describe_images(markdown_text=markdown_content, referer_url=url)
508
+ if not filter:
509
+ return markdown_content
510
+ lines = markdown_content.split("\n")
511
+ line_length = len(lines)
512
+ important_lines: set[int] = set()
513
+
514
+ def _into_safe_range(value: int) -> int:
515
+ """Ensure the line index stays within bounds."""
516
+ return min(max(value, 0), line_length - 1)
517
+
518
+ if chunk_size is None:
519
+ chunk_size = line_length
520
+
521
+ # Process the markdown in chunks.
522
+ for i in range(0, len(lines), chunk_size):
523
+ chunk_lines = lines[i : i + chunk_size]
524
+ # Prepend line numbers to each line.
525
+ numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
526
+ # Use the language model synchronously to get the line ranges.
527
+ result: SelectedLineRanges = self.chatterer.generate_pydantic(
528
+ response_model=SelectedLineRanges,
529
+ messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
530
+ )
531
+ for range_str in result.line_ranges:
532
+ if "-" in range_str:
533
+ start, end = map(int, range_str.split("-"))
534
+ important_lines.update(range(_into_safe_range(start + i - 1), _into_safe_range(end + i)))
535
+ else:
536
+ important_lines.add(_into_safe_range(int(range_str) + i - 1))
537
+ # Reconstruct the filtered markdown.
538
+ return "\n".join(lines[line_no] for line_no in sorted(important_lines))
539
+
540
+ async def aurl_to_md_with_llm(
541
+ self,
542
+ url: str,
543
+ chunk_size: Optional[int] = None,
544
+ wait: float = 0.2,
545
+ scrolldown: bool = False,
546
+ sleep: int = 0,
547
+ reload: bool = True,
548
+ timeout: Union[float, int] = 8,
549
+ keep_page: bool = False,
550
+ referer: Optional[str] = None,
551
+ describe_images: bool = True,
552
+ filter: bool = True,
553
+ ) -> str:
554
+ """
555
+ Asynchronously convert a URL's page to Markdown and use the language model (Chatterer)
556
+ to filter out unimportant lines.
557
+
558
+ The method splits the Markdown text into chunks, prepends line numbers, and prompts the LLM
559
+ to select the important line ranges. It then reconstructs the filtered Markdown.
560
+
561
+ Args:
562
+ url (str): URL of the page.
563
+ chunk_size (Optional[int]): Number of lines per chunk; defaults to the full content.
564
+ wait (float): Time to wait after navigation (in seconds).
565
+ scrolldown (bool): If True, scroll the page.
566
+ sleep (int): Time to wait after scrolling (in seconds).
567
+ reload (bool): If True, reload the page.
568
+ timeout (float | int): Navigation timeout (in seconds).
569
+ keep_page (bool): If True, do not close the page after processing.
570
+ referer (Optional[str]): Referer URL to set.
571
+ describe_images (bool): If True, describe images in the Markdown text.
572
+ filter (bool): If True, filter the important lines using the language model.
573
+
574
+ Returns:
575
+ str: Filtered Markdown containing only the important lines.
576
+ """
577
+ if self.chatterer is None:
578
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
579
+ markdown_content = await self.aurl_to_md(
580
+ url,
581
+ wait=wait,
582
+ scrolldown=scrolldown,
583
+ sleep=sleep,
584
+ reload=reload,
585
+ timeout=timeout,
586
+ keep_page=keep_page,
587
+ referer=referer,
588
+ )
589
+ if describe_images:
590
+ markdown_content = await self.adescribe_images(markdown_text=markdown_content, referer_url=url)
591
+ if not filter:
592
+ return markdown_content
593
+ lines = markdown_content.split("\n")
594
+ line_length = len(lines)
595
+ important_lines: set[int] = set()
596
+
597
+ def _into_safe_range(value: int) -> int:
598
+ """Ensure the line index is within valid bounds."""
599
+ return min(max(value, 0), line_length - 1)
600
+
601
+ if chunk_size is None:
602
+ chunk_size = line_length
603
+
604
+ for i in range(0, len(lines), chunk_size):
605
+ chunk_lines = lines[i : i + chunk_size]
606
+ numbered_markdown = "\n".join(f"[Ln {line_no}] {line}" for line_no, line in enumerate(chunk_lines, start=1))
607
+ # Use the asynchronous language model method.
608
+ result: SelectedLineRanges = await self.chatterer.agenerate_pydantic(
609
+ response_model=SelectedLineRanges,
610
+ messages=f"{self.markdown_filtering_instruction}\n{numbered_markdown}",
611
+ )
612
+ for range_str in result.line_ranges:
613
+ if "-" in range_str:
614
+ start, end = map(int, range_str.split("-"))
615
+ important_lines.update(range(_into_safe_range(start + i - 1), _into_safe_range(end + i)))
616
+ else:
617
+ important_lines.add(_into_safe_range(int(range_str) + i - 1))
618
+ return "\n".join(lines[line_no] for line_no in sorted(important_lines))
619
+
620
+ def describe_images(self, markdown_text: str, referer_url: str) -> str:
621
+ """
622
+ Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
623
+ Images are fetched with Playwright to bypass CDN protections.
624
+ """
625
+ if self.chatterer is None:
626
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
627
+ return caption_markdown_images(
628
+ markdown_text=markdown_text,
629
+ headers=self.headers | {"Referer": referer_url},
630
+ description_format=self.description_format,
631
+ image_description_instruction=self.image_description_instruction,
632
+ chatterer=self.chatterer,
633
+ image_processing_config=self.image_processing_config,
634
+ img_bytes_fetcher=self._playwright_fetch_image_bytes,
635
+ )
636
+
637
+ # The existing adescribe_images method is modified as follows.
638
+ async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
639
+ """
640
+ Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
641
+ Images are fetched with Playwright to bypass CDN protections.
642
+ """
643
+ if self.chatterer is None:
644
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
645
+ return await acaption_markdown_images(
646
+ markdown_text=markdown_text,
647
+ headers=self.headers | {"Referer": referer_url},
648
+ description_format=self.description_format,
649
+ image_description_instruction=self.image_description_instruction,
650
+ chatterer=self.chatterer,
651
+ image_processing_config=self.image_processing_config,
652
+ img_bytes_fetcher=self._aplaywright_fetch_image_bytes,
653
+ )
654
+
655
+ def _playwright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
656
+ """Playwright를 사용하여 동기적으로 이미지 바이트를 가져옵니다."""
657
+ page: Optional[playwright.sync_api.Page] = None
658
+ try:
659
+ # Get the existing synchronous browser context.
660
+ page = self.get_sync_browser().new_page()
661
+
662
+ # Set the provided headers as extra HTTP headers for the page.
663
+ # This will apply to all subsequent requests made by the page.
664
+ page.set_extra_http_headers(headers)
665
+ response = page.goto(image_url, wait_until="load", timeout=15000)
666
+ if response and response.ok:
667
+ return response.body()
668
+ else:
669
+ return b""
670
+ except Exception as e:
671
+ print(f"Playwright exception fetching image: {image_url}, Error: {e}")
672
+ return b""
673
+ finally:
674
+ if page:
675
+ page.close()
676
+
677
+ async def _aplaywright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
678
+ """Playwright를 사용하여 비동기적으로 이미지 바이트를 가져옵니다."""
679
+ page: Optional[playwright.async_api.Page] = None
680
+ try:
681
+ # Get the existing asynchronous browser context.
682
+ page = await (await self.get_async_browser()).new_page()
683
+
684
+ # Set the provided headers as extra HTTP headers for the page.
685
+ # This will apply to all subsequent requests made by the page.
686
+ await page.set_extra_http_headers(headers)
687
+ response = await page.goto(image_url, wait_until="load", timeout=15000)
688
+ if response and response.ok:
689
+ return await response.body()
690
+ else:
691
+ # On failure, you could log the error or return None instead.
692
+ print(
693
+ f"Playwright failed to fetch image: {image_url}, Status: {response.status if response else 'No Response'}"
694
+ )
695
+ return b""
696
+ except Exception as e:
697
+ # Log the exception if one occurs.
698
+ print(f"Playwright exception fetching image: {image_url}, Error: {e}")
699
+ return b""
700
+ finally:
701
+ # Always close the page to release resources.
702
+ if page:
703
+ await page.close()
704
+
705
+ def __enter__(self) -> Self:
706
+ return self
707
+
708
+ async def __aenter__(self) -> Self:
709
+ return self
710
+
711
+ def __exit__(
712
+ self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
713
+ ) -> None:
714
+ """
715
+ Exit the synchronous context.
716
+
717
+ Closes the browser and stops Playwright.
718
+ """
719
+ if self.sync_browser_context is not None:
720
+ self.sync_browser_context.close()
721
+ self.sync_browser_context = None
722
+ if self.sync_playwright:
723
+ self.sync_playwright.stop()
724
+ self.sync_playwright = None
725
+
726
+ async def __aexit__(
727
+ self, exc_type: Optional[Type[BaseException]], exc_val: Optional[BaseException], exc_tb: Optional[TracebackType]
728
+ ) -> None:
729
+ """
730
+ Asynchronously exit the context.
731
+
732
+ Closes the asynchronous browser and stops Playwright.
733
+ """
734
+ if self.async_browser_context is not None:
735
+ await self.async_browser_context.close()
736
+ self.async_browser_context = None
737
+ if self.async_playwright:
738
+ await self.async_playwright.stop()
739
+ self.async_playwright = None
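
For orientation, below is a minimal usage sketch of the PlayWrightBot class from chatterer/tools/webpage_to_markdown.py as it appears in this diff. It only calls methods visible above; the example URL is illustrative, and the snippet assumes the chatterer package and a Playwright browser are installed and that credentials for the default Chatterer.openai() model are configured.

import asyncio

from chatterer.tools.webpage_to_markdown import PlayWrightBot


def sync_demo() -> None:
    # Synchronous usage: the context manager closes the browser and stops Playwright on exit.
    with PlayWrightBot() as bot:
        markdown = bot.url_to_md("https://example.com")
        headings = bot.select_and_extract("https://example.com", "h2")
        filtered = bot.url_to_md_with_llm("https://example.com")  # LLM keeps only the article lines
        print(markdown[:200], headings, filtered[:200], sep="\n---\n")


async def async_demo() -> None:
    # Asynchronous usage: the same methods are available with an "a" prefix.
    async with PlayWrightBot(engine="chromium") as bot:
        markdown = await bot.aurl_to_md("https://example.com")
        print(markdown[:200])


if __name__ == "__main__":
    sync_demo()
    asyncio.run(async_demo())

If no language model is available, construct the bot with chatterer=None and stick to url_to_md / select_and_extract; url_to_md_with_llm and the describe_images helpers raise a ValueError when no Chatterer instance is set.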