chatterer 0.1.13__py3-none-any.whl → 0.1.16__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -13,41 +13,77 @@ Use the synchronous methods (without the "a" prefix) in a normal context manager
13
13
  or use the asynchronous methods (prefixed with "a") within an async context manager.
14
14
  """
15
15
 
16
- import asyncio
16
+ from __future__ import annotations
17
+
17
18
  from dataclasses import dataclass, field
18
- from traceback import format_exception_only, print_exc
19
+ from pathlib import Path
19
20
  from types import TracebackType
20
21
  from typing import (
21
- Awaitable,
22
+ TYPE_CHECKING,
23
+ Literal,
24
+ NotRequired,
22
25
  Optional,
23
26
  Self,
27
+ Sequence,
24
28
  Type,
25
- TypeGuard,
29
+ TypeAlias,
30
+ TypedDict,
26
31
  Union,
27
32
  )
28
33
 
29
- import playwright.async_api
30
- import playwright.sync_api
31
-
32
- from ...language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
33
- from ...utils.image import Base64Image, get_default_image_processing_config
34
- from ..convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
35
- from .utils import (
36
- DEFAULT_UA,
37
- ImageDescriptionAndReferences,
38
- ImageProcessingConfig,
39
- MarkdownLink,
40
- PlaywrightLaunchOptions,
41
- PlaywrightPersistencyOptions,
42
- SelectedLineRanges,
43
- WaitUntil,
44
- aget_image_url_and_markdown_links,
45
- get_default_playwright_launch_options,
46
- get_image_url_and_markdown_links,
47
- replace_images,
34
+ from pydantic import BaseModel, Field
35
+
36
+ from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
37
+ from ..utils.base64_image import ImageProcessingConfig, get_default_image_processing_config, is_remote_url
38
+ from .caption_markdown_images import acaption_markdown_images, caption_markdown_images
39
+ from .convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
40
+
41
+ if TYPE_CHECKING:
42
+ import playwright.async_api
43
+ import playwright.sync_api
44
+
45
+ WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
46
+ DEFAULT_UA: str = (
47
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
48
48
  )
49
49
 
50
50
 
51
+ class SelectedLineRanges(BaseModel):
52
+ line_ranges: list[str] = Field(description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']")
53
+
54
+
55
+ class PlaywrightLaunchOptions(TypedDict):
56
+ executable_path: NotRequired[str | Path]
57
+ channel: NotRequired[str]
58
+ args: NotRequired[Sequence[str]]
59
+ ignore_default_args: NotRequired[bool | Sequence[str]]
60
+ handle_sigint: NotRequired[bool]
61
+ handle_sigterm: NotRequired[bool]
62
+ handle_sighup: NotRequired[bool]
63
+ timeout: NotRequired[float]
64
+ env: NotRequired[dict[str, str | float | bool]]
65
+ headless: NotRequired[bool]
66
+ devtools: NotRequired[bool]
67
+ proxy: NotRequired[playwright.sync_api.ProxySettings]
68
+ downloads_path: NotRequired[str | Path]
69
+ slow_mo: NotRequired[float]
70
+ traces_dir: NotRequired[str | Path]
71
+ chromium_sandbox: NotRequired[bool]
72
+ firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
73
+
74
+
75
+ class PlaywrightPersistencyOptions(TypedDict):
76
+ user_data_dir: NotRequired[str | Path]
77
+ storage_state: NotRequired[playwright.sync_api.StorageState]
78
+
79
+
80
+ class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions): ...
81
+
82
+
83
+ def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
84
+ return {"headless": True}
85
+
86
+
51
87
  @dataclass
52
88
  class PlayWrightBot:
53
89
  """
@@ -73,7 +109,8 @@ class PlayWrightBot:
73
109
  chatterer (Chatterer): An instance of the language model interface for processing text.
74
110
  """
75
111
 
76
- chatterer: Chatterer = field(default_factory=Chatterer.openai)
112
+ engine: Literal["firefox", "chromium", "webkit"] = "firefox"
113
+ chatterer: Optional[Chatterer] = field(default_factory=Chatterer.openai)
77
114
  playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
78
115
  playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
79
116
  html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
@@ -109,28 +146,43 @@ Markdown-formatted webpage content is provided below for your reference:
109
146
 
110
147
  def get_sync_playwright(self) -> playwright.sync_api.Playwright:
111
148
  if self.sync_playwright is None:
112
- self.sync_playwright = playwright.sync_api.sync_playwright().start()
149
+ from playwright.sync_api import sync_playwright
150
+
151
+ self.sync_playwright = sync_playwright().start()
113
152
  return self.sync_playwright
114
153
 
115
154
  async def get_async_playwright(self) -> playwright.async_api.Playwright:
116
155
  if self.async_playwright is None:
117
- self.async_playwright = await playwright.async_api.async_playwright().start()
156
+ from playwright.async_api import async_playwright
157
+
158
+ self.async_playwright = await async_playwright().start()
118
159
  return self.async_playwright
119
160
 
120
161
  def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
121
162
  if self.sync_browser_context is not None:
122
163
  return self.sync_browser_context
123
164
 
165
+ def get_browser() -> playwright.sync_api.BrowserType:
166
+ playwright = self.get_sync_playwright()
167
+ if self.engine == "firefox":
168
+ return playwright.firefox
169
+ elif self.engine == "chromium":
170
+ return playwright.chromium
171
+ elif self.engine == "webkit":
172
+ return playwright.webkit
173
+ else:
174
+ raise ValueError(f"Unsupported engine: {self.engine}")
175
+
124
176
  user_data_dir = self.playwright_persistency_options.get("user_data_dir")
125
177
  if user_data_dir:
126
178
  # Use persistent context if user_data_dir is provided
127
- self.sync_browser_context = self.get_sync_playwright().chromium.launch_persistent_context(
179
+ self.sync_browser_context = get_browser().launch_persistent_context(
128
180
  user_data_dir=user_data_dir, **self.playwright_launch_options
129
181
  )
130
182
  return self.sync_browser_context
131
183
 
132
184
  # Otherwise, launch a new context
133
- browser = self.get_sync_playwright().chromium.launch(**self.playwright_launch_options)
185
+ browser = get_browser().launch(**self.playwright_launch_options)
134
186
  storage_state = self.playwright_persistency_options.get("storage_state")
135
187
  if storage_state:
136
188
  self.sync_browser_context = browser.new_context(storage_state=storage_state)
@@ -142,16 +194,27 @@ Markdown-formatted webpage content is provided below for your reference:
142
194
  if self.async_browser_context is not None:
143
195
  return self.async_browser_context
144
196
 
197
+ async def get_browser() -> playwright.async_api.BrowserType:
198
+ playwright = await self.get_async_playwright()
199
+ if self.engine == "firefox":
200
+ return playwright.firefox
201
+ elif self.engine == "chromium":
202
+ return playwright.chromium
203
+ elif self.engine == "webkit":
204
+ return playwright.webkit
205
+ else:
206
+ raise ValueError(f"Unsupported engine: {self.engine}")
207
+
145
208
  user_data_dir = self.playwright_persistency_options.get("user_data_dir")
146
209
  if user_data_dir:
147
210
  # Use persistent context if user_data_dir is provided
148
- self.async_browser_context = await (await self.get_async_playwright()).chromium.launch_persistent_context(
211
+ self.async_browser_context = await (await get_browser()).launch_persistent_context(
149
212
  user_data_dir=user_data_dir, **self.playwright_launch_options
150
213
  )
151
214
  return self.async_browser_context
152
215
 
153
216
  # Otherwise, launch a new context
154
- browser = await (await self.get_async_playwright()).chromium.launch(**self.playwright_launch_options)
217
+ browser = await (await get_browser()).launch(**self.playwright_launch_options)
155
218
  storage_state = self.playwright_persistency_options.get("storage_state")
156
219
  if storage_state:
157
220
  self.async_browser_context = await browser.new_context(storage_state=storage_state)
@@ -232,18 +295,24 @@ Markdown-formatted webpage content is provided below for your reference:
232
295
  Returns:
233
296
  str: The page content converted to Markdown.
234
297
  """
235
- page = self.get_page(url, timeout=timeout, referer=referer)
236
- if wait:
237
- page.wait_for_timeout(wait * 1000)
238
- if scrolldown:
239
- page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
240
- if sleep:
241
- page.wait_for_timeout(sleep * 1000)
242
- if reload:
243
- page.reload(timeout=int(timeout * 1000))
244
- html = page.content()
298
+ page: Optional[playwright.sync_api.Page] = None
299
+ if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
300
+ with open(url, "r", encoding="utf-8") as f:
301
+ html = f.read()
302
+ else:
303
+ page = self.get_page(url, timeout=timeout, referer=referer)
304
+ if wait:
305
+ page.wait_for_timeout(wait * 1000)
306
+ if scrolldown:
307
+ page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
308
+ if sleep:
309
+ page.wait_for_timeout(sleep * 1000)
310
+ if reload:
311
+ page.reload(timeout=int(timeout * 1000))
312
+ html = page.content()
313
+
245
314
  md = html_to_markdown(html=html, options=self.html_to_markdown_options)
246
- if not keep_page:
315
+ if not keep_page and page is not None:
247
316
  page.close()
248
317
  return md
249
318
 
@@ -275,18 +344,23 @@ Markdown-formatted webpage content is provided below for your reference:
275
344
  Returns:
276
345
  str: The page content converted to Markdown.
277
346
  """
278
- page = await self.aget_page(url, timeout=timeout, referer=referer)
279
- if wait:
280
- await page.wait_for_timeout(wait * 1000)
281
- if scrolldown:
282
- await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
283
- if sleep:
284
- await page.wait_for_timeout(sleep * 1000)
285
- if reload:
286
- await page.reload(timeout=int(timeout * 1000))
287
- html = await page.content()
347
+ page: Optional[playwright.async_api.Page] = None
348
+ if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
349
+ with open(url, "r", encoding="utf-8") as f:
350
+ html = f.read()
351
+ else:
352
+ page = await self.aget_page(url, timeout=timeout, referer=referer)
353
+ if wait:
354
+ await page.wait_for_timeout(wait * 1000)
355
+ if scrolldown:
356
+ await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
357
+ if sleep:
358
+ await page.wait_for_timeout(sleep * 1000)
359
+ if reload:
360
+ await page.reload(timeout=int(timeout * 1000))
361
+ html = await page.content()
288
362
  md = html_to_markdown(html=html, options=self.html_to_markdown_options)
289
- if not keep_page:
363
+ if not keep_page and page is not None:
290
364
  await page.close()
291
365
  return md
292
366
 
@@ -417,6 +491,8 @@ Markdown-formatted webpage content is provided below for your reference:
417
491
  Returns:
418
492
  str: Filtered Markdown containing only the important lines.
419
493
  """
494
+ if self.chatterer is None:
495
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
420
496
  markdown_content = self.url_to_md(
421
497
  url,
422
498
  wait=wait,
@@ -498,6 +574,8 @@ Markdown-formatted webpage content is provided below for your reference:
498
574
  Returns:
499
575
  str: Filtered Markdown containing only the important lines.
500
576
  """
577
+ if self.chatterer is None:
578
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
501
579
  markdown_content = await self.aurl_to_md(
502
580
  url,
503
581
  wait=wait,
@@ -542,75 +620,87 @@ Markdown-formatted webpage content is provided below for your reference:
542
620
  def describe_images(self, markdown_text: str, referer_url: str) -> str:
543
621
  """
544
622
  Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
623
+ Using Playwright for fetching images to bypass CDN protections.
545
624
  """
546
- image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = (
547
- get_image_url_and_markdown_links(
548
- markdown_text=markdown_text,
549
- headers=self.headers | {"Referer": referer_url},
550
- config=self.image_processing_config,
551
- )
552
- )
553
-
554
- image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
555
- for image_url, markdown_links in image_url_and_markdown_links.items():
556
- if image_url is not None:
557
- try:
558
- image_summary: str = self.chatterer.describe_image(
559
- image_url=image_url.data_uri,
560
- instruction=self.image_description_instruction,
561
- )
562
- except Exception:
563
- print_exc()
564
- continue
565
- image_description_and_references[image_summary] = markdown_links
566
- else:
567
- image_description_and_references[None] = markdown_links
568
-
569
- return replace_images(
625
+ if self.chatterer is None:
626
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
627
+ return caption_markdown_images(
570
628
  markdown_text=markdown_text,
571
- image_description_and_references=image_description_and_references,
629
+ headers=self.headers | {"Referer": referer_url},
572
630
  description_format=self.description_format,
631
+ image_description_instruction=self.image_description_instruction,
632
+ chatterer=self.chatterer,
633
+ image_processing_config=self.image_processing_config,
634
+ img_bytes_fetcher=self._playwright_fetch_image_bytes,
573
635
  )
574
636
 
637
+ # 기존 adescribe_images 메서드를 다음과 같이 수정합니다.
575
638
  async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
576
639
  """
577
640
  Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
641
+ Using Playwright for fetching images to bypass CDN protections.
578
642
  """
579
- image_url_and_markdown_links: dict[
580
- Optional[Base64Image], list[MarkdownLink]
581
- ] = await aget_image_url_and_markdown_links(
643
+ if self.chatterer is None:
644
+ raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
645
+ return await acaption_markdown_images(
582
646
  markdown_text=markdown_text,
583
647
  headers=self.headers | {"Referer": referer_url},
584
- config=self.image_processing_config,
648
+ description_format=self.description_format,
649
+ image_description_instruction=self.image_description_instruction,
650
+ chatterer=self.chatterer,
651
+ image_processing_config=self.image_processing_config,
652
+ img_bytes_fetcher=self._aplaywright_fetch_image_bytes,
585
653
  )
586
654
 
587
- async def dummy() -> None:
588
- pass
589
-
590
- def _handle_exception(e: Optional[str | BaseException]) -> TypeGuard[Optional[str]]:
591
- if isinstance(e, BaseException):
592
- print(format_exception_only(type(e), e))
593
- return False
594
- return True
595
-
596
- coros: list[Awaitable[Optional[str]]] = [
597
- self.chatterer.adescribe_image(image_url=image_url.data_uri, instruction=self.image_description_instruction)
598
- if image_url is not None
599
- else dummy()
600
- for image_url in image_url_and_markdown_links.keys()
601
- ]
602
-
603
- return replace_images(
604
- markdown_text=markdown_text,
605
- image_description_and_references=ImageDescriptionAndReferences({
606
- image_summary: markdown_links
607
- for markdown_links, image_summary in zip(
608
- image_url_and_markdown_links.values(), await asyncio.gather(*coros, return_exceptions=True)
655
+ def _playwright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
656
+ """Playwright를 사용하여 동기적으로 이미지 바이트를 가져옵니다."""
657
+ page: Optional[playwright.sync_api.Page] = None
658
+ try:
659
+ # Get the existing synchronous browser context.
660
+ page = self.get_sync_browser().new_page()
661
+
662
+ # Set the provided headers as extra HTTP headers for the page.
663
+ # This will apply to all subsequent requests made by the page.
664
+ page.set_extra_http_headers(headers)
665
+ response = page.goto(image_url, wait_until="load", timeout=15000)
666
+ if response and response.ok:
667
+ return response.body()
668
+ else:
669
+ return b""
670
+ except Exception as e:
671
+ print(f"Playwright exception fetching image: {image_url}, Error: {e}")
672
+ return b""
673
+ finally:
674
+ if page:
675
+ page.close()
676
+
677
+ async def _aplaywright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
678
+ """Playwright를 사용하여 비동기적으로 이미지 바이트를 가져옵니다."""
679
+ page: Optional[playwright.async_api.Page] = None
680
+ try:
681
+ # Get the existing asynchronous browser context.
682
+ page = await (await self.get_async_browser()).new_page()
683
+
684
+ # Set the provided headers as extra HTTP headers for the page.
685
+ # This will apply to all subsequent requests made by the page.
686
+ await page.set_extra_http_headers(headers)
687
+ response = await page.goto(image_url, wait_until="load", timeout=15000)
688
+ if response and response.ok:
689
+ return await response.body()
690
+ else:
691
+ # 실패 시 로그를 남기거나 None을 반환할 수 있습니다.
692
+ print(
693
+ f"Playwright failed to fetch image: {image_url}, Status: {response.status if response else 'No Response'}"
609
694
  )
610
- if _handle_exception(image_summary)
611
- }),
612
- description_format=self.description_format,
613
- )
695
+ return b""
696
+ except Exception as e:
697
+ # 예외 발생 시 로그를 남깁니다.
698
+ print(f"Playwright exception fetching image: {image_url}, Error: {e}")
699
+ return b""
700
+ finally:
701
+ # 페이지를 항상 닫아 리소스를 정리합니다.
702
+ if page:
703
+ await page.close()
614
704
 
615
705
  def __enter__(self) -> Self:
616
706
  return self
@@ -29,7 +29,7 @@ def get_youtube_video_details(
29
29
  def get_youtube_video_subtitle(video_id: str) -> str:
30
30
  """Get the transcript of a YouTube video using the given video ID."""
31
31
 
32
- from youtube_transcript_api._api import YouTubeTranscriptApi
32
+ from youtube_transcript_api import YouTubeTranscriptApi # pyright: ignore[reportPrivateImportUsage]
33
33
 
34
34
  get_transcript = YouTubeTranscriptApi.get_transcript # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
35
35
  list_transcripts = YouTubeTranscriptApi.list_transcripts # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
@@ -144,3 +144,4 @@ class YoutubeSearchResult:
144
144
  if __name__ == "__main__":
145
145
  print(get_youtube_video_details("BTS"))
146
146
  # print(get_youtube_transcript("y7jrpS8GHxs"))
147
+
@@ -1,10 +1,10 @@
1
+ from .base64_image import Base64Image
1
2
  from .code_agent import (
2
3
  CodeExecutionResult,
3
4
  FunctionSignature,
4
5
  get_default_repl_tool,
5
6
  insert_callables_into_global,
6
7
  )
7
- from .image import Base64Image
8
8
 
9
9
  __all__ = [
10
10
  "Base64Image",
@@ -7,6 +7,7 @@ from logging import getLogger
7
7
  from pathlib import Path
8
8
  from typing import (
9
9
  Awaitable,
10
+ Callable,
10
11
  ClassVar,
11
12
  Literal,
12
13
  NotRequired,
@@ -18,7 +19,6 @@ from typing import (
18
19
  TypeGuard,
19
20
  cast,
20
21
  get_args,
21
- overload,
22
22
  )
23
23
  from urllib.parse import urlparse
24
24
 
@@ -55,10 +55,11 @@ def get_default_image_processing_config() -> ImageProcessingConfig:
55
55
  "min_largest_side": 200,
56
56
  "resize_if_min_side_exceeds": 2000,
57
57
  "resize_target_for_min_side": 1000,
58
- "formats": ["png", "jpeg", "gif", "bmp", "webp"],
58
+ "formats": ["png", "jpeg", "jpg", "gif", "bmp", "webp"],
59
59
  }
60
60
 
61
61
 
62
+ # image_url: str, headers: dict[str, str]) -> Optional[bytes]:
62
63
  class Base64Image(BaseModel):
63
64
  ext: ImageType
64
65
  data: str
@@ -86,44 +87,51 @@ class Base64Image(BaseModel):
86
87
  def from_bytes(cls, data: bytes, ext: ImageType) -> Self:
87
88
  return cls(ext=ext, data=b64encode(data).decode("utf-8"))
88
89
 
89
- @overload
90
90
  @classmethod
91
91
  def from_url_or_path(
92
92
  cls,
93
93
  url_or_path: str,
94
94
  *,
95
- headers: dict[str, str] = ...,
96
- config: ImageProcessingConfig = ...,
97
- return_coro: Literal[True],
98
- ) -> Awaitable[Optional[Self]]: ...
99
-
100
- @overload
101
- @classmethod
102
- def from_url_or_path(
103
- cls,
104
- url_or_path: str,
105
- *,
106
- headers: dict[str, str] = ...,
107
- config: ImageProcessingConfig = ...,
108
- return_coro: Literal[False] = False,
109
- ) -> Optional[Self]: ...
95
+ headers: dict[str, str] = {},
96
+ config: ImageProcessingConfig = get_default_image_processing_config(),
97
+ img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
98
+ ) -> Optional[Self]:
99
+ """Return a Base64Image instance from a URL or local file path."""
100
+ if maybe_base64 := cls.from_string(url_or_path):
101
+ return maybe_base64
102
+ elif is_remote_url(url_or_path):
103
+ if img_bytes_fetcher:
104
+ img_bytes = img_bytes_fetcher(url_or_path, headers)
105
+ else:
106
+ img_bytes = cls._fetch_remote_image(url_or_path, headers)
107
+ if not img_bytes:
108
+ return None
109
+ return cls._convert_image_into_base64(img_bytes, config)
110
+ try:
111
+ return cls._process_local_image(Path(url_or_path), config)
112
+ except Exception:
113
+ return None
110
114
 
111
115
  @classmethod
112
- def from_url_or_path(
116
+ async def afrom_url_or_path(
113
117
  cls,
114
118
  url_or_path: str,
115
119
  *,
116
120
  headers: dict[str, str] = {},
117
121
  config: ImageProcessingConfig = get_default_image_processing_config(),
118
- return_coro: bool = False,
119
- ) -> Optional[Self] | Awaitable[Optional[Self]]:
122
+ img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
123
+ ) -> Optional[Self]:
120
124
  """Return a Base64Image instance from a URL or local file path."""
121
125
  if maybe_base64 := cls.from_string(url_or_path):
122
126
  return maybe_base64
123
- elif _is_remote_url(url_or_path):
124
- if return_coro:
125
- return cls._afetch_remote_image(url_or_path, headers, config)
126
- return cls._fetch_remote_image(url_or_path, headers, config)
127
+ elif is_remote_url(url_or_path):
128
+ if img_bytes_fetcher:
129
+ img_bytes = await img_bytes_fetcher(url_or_path, headers)
130
+ else:
131
+ img_bytes = await cls._afetch_remote_image(url_or_path, headers)
132
+ if not img_bytes:
133
+ return None
134
+ return cls._convert_image_into_base64(img_bytes, config)
127
135
  try:
128
136
  return cls._process_local_image(Path(url_or_path), config)
129
137
  except Exception:
@@ -142,20 +150,27 @@ class Base64Image(BaseModel):
142
150
  return ext in allowed_types
143
151
 
144
152
  @classmethod
145
- def _fetch_remote_image(cls, url: str, headers: dict[str, str], config: ImageProcessingConfig) -> Optional[Self]:
146
- image_bytes = _get_image_bytes(image_url=url.strip(), headers=headers)
147
- if not image_bytes:
148
- return None
149
- return cls._convert_image_into_base64(image_bytes, config)
153
+ def _fetch_remote_image(cls, url: str, headers: dict[str, str]) -> bytes:
154
+ try:
155
+ with requests.Session() as session:
156
+ response = session.get(url.strip(), headers={k: str(v) for k, v in headers.items()})
157
+ response.raise_for_status()
158
+ image_bytes = bytes(response.content or b"")
159
+ if not image_bytes:
160
+ return b""
161
+ return image_bytes
162
+ except Exception:
163
+ return b""
150
164
 
151
165
  @classmethod
152
- async def _afetch_remote_image(
153
- cls, url: str, headers: dict[str, str], config: ImageProcessingConfig
154
- ) -> Optional[Self]:
155
- image_bytes = await _aget_image_bytes(image_url=url.strip(), headers=headers)
156
- if not image_bytes:
157
- return None
158
- return cls._convert_image_into_base64(image_bytes, config)
166
+ async def _afetch_remote_image(cls, url: str, headers: dict[str, str]) -> bytes:
167
+ try:
168
+ async with ClientSession() as session:
169
+ async with session.get(url.strip(), headers={k: str(v) for k, v in headers.items()}) as response:
170
+ response.raise_for_status()
171
+ return await response.read()
172
+ except Exception:
173
+ return b""
159
174
 
160
175
  @classmethod
161
176
  def _convert_image_into_base64(cls, image_data: bytes, config: Optional[ImageProcessingConfig]) -> Optional[Self]:
@@ -163,6 +178,7 @@ class Base64Image(BaseModel):
163
178
  Retrieve an image in bytes and return a base64-encoded data URL,
164
179
  applying dynamic rules from 'config'.
165
180
  """
181
+
166
182
  if not config:
167
183
  # config 없으면 그냥 기존 헤더만 보고 돌려주는 간단 로직
168
184
  return cls._simple_base64_encode(image_data)
@@ -225,7 +241,7 @@ class Base64Image(BaseModel):
225
241
  """
226
242
  Retrieve an image URL and return a base64-encoded data URL.
227
243
  """
228
- ext = _detect_image_type(image_data)
244
+ ext = detect_image_type(image_data)
229
245
  if not ext:
230
246
  return
231
247
  return cls(ext=ext, data=b64encode(image_data).decode("utf-8"))
@@ -241,12 +257,12 @@ class Base64Image(BaseModel):
241
257
  return cls(ext=ext, data=b64encode(path.read_bytes()).decode("ascii"))
242
258
 
243
259
 
244
- def _is_remote_url(path: str) -> bool:
260
+ def is_remote_url(path: str) -> bool:
245
261
  parsed = urlparse(path)
246
262
  return bool(parsed.scheme and parsed.netloc)
247
263
 
248
264
 
249
- def _detect_image_type(image_data: bytes) -> Optional[ImageType]:
265
+ def detect_image_type(image_data: bytes) -> Optional[ImageType]:
250
266
  """
251
267
  Detect the image format based on the image binary signature (header).
252
268
  Only JPEG, PNG, GIF, WEBP, and BMP are handled as examples.
@@ -267,25 +283,3 @@ def _detect_image_type(image_data: bytes) -> Optional[ImageType]:
267
283
  # BMP: 시작 바이트가 BM
268
284
  elif image_data.startswith(b"BM"):
269
285
  return "bmp"
270
-
271
-
272
- def _get_image_bytes(image_url: str, headers: dict[str, str]) -> Optional[bytes]:
273
- try:
274
- with requests.Session() as session:
275
- response = session.get(image_url, headers={k: str(v) for k, v in headers.items()})
276
- if not response.ok:
277
- return
278
- return bytes(response.content or b"")
279
- except Exception:
280
- return
281
-
282
-
283
- async def _aget_image_bytes(image_url: str, headers: dict[str, str]) -> Optional[bytes]:
284
- try:
285
- async with ClientSession() as session:
286
- async with session.get(image_url, headers={k: str(v) for k, v in headers.items()}) as response:
287
- if not response.ok:
288
- return
289
- return await response.read()
290
- except Exception:
291
- return