chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +41 -4
- chatterer/common_types/__init__.py +21 -0
- chatterer/common_types/io.py +19 -0
- chatterer/interactive.py +353 -0
- chatterer/language_model.py +129 -252
- chatterer/messages.py +13 -1
- chatterer/tools/__init__.py +27 -9
- chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
- chatterer/tools/convert_pdf_to_markdown.py +302 -0
- chatterer/tools/convert_to_text.py +49 -65
- chatterer/tools/upstage_document_parser.py +705 -0
- chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
- chatterer/tools/youtube.py +2 -1
- chatterer/utils/__init__.py +4 -1
- chatterer/utils/{image.py → base64_image.py} +56 -62
- chatterer/utils/bytesio.py +59 -0
- chatterer/utils/cli.py +476 -0
- chatterer/utils/code_agent.py +137 -38
- chatterer/utils/imghdr.py +148 -0
- chatterer-0.1.14.dist-info/METADATA +387 -0
- chatterer-0.1.14.dist-info/RECORD +34 -0
- chatterer/tools/webpage_to_markdown/__init__.py +0 -4
- chatterer-0.1.12.dist-info/METADATA +0 -170
- chatterer-0.1.12.dist-info/RECORD +0 -27
- {chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/WHEEL +0 -0
- {chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/top_level.txt +0 -0
@@ -13,41 +13,77 @@ Use the synchronous methods (without the "a" prefix) in a normal context manager
|
|
13
13
|
or use the asynchronous methods (prefixed with "a") within an async context manager.
|
14
14
|
"""
|
15
15
|
|
16
|
-
import
|
16
|
+
from __future__ import annotations
|
17
|
+
|
17
18
|
from dataclasses import dataclass, field
|
18
|
-
from
|
19
|
+
from pathlib import Path
|
19
20
|
from types import TracebackType
|
20
21
|
from typing import (
|
21
|
-
|
22
|
+
TYPE_CHECKING,
|
23
|
+
Literal,
|
24
|
+
NotRequired,
|
22
25
|
Optional,
|
23
26
|
Self,
|
27
|
+
Sequence,
|
24
28
|
Type,
|
25
|
-
|
29
|
+
TypeAlias,
|
30
|
+
TypedDict,
|
26
31
|
Union,
|
27
32
|
)
|
28
33
|
|
29
|
-
import
|
30
|
-
|
31
|
-
|
32
|
-
from
|
33
|
-
from
|
34
|
-
from
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
WaitUntil,
|
44
|
-
aget_image_url_and_markdown_links,
|
45
|
-
get_default_playwright_launch_options,
|
46
|
-
get_image_url_and_markdown_links,
|
47
|
-
replace_images,
|
34
|
+
from pydantic import BaseModel, Field
|
35
|
+
|
36
|
+
from ..language_model import DEFAULT_IMAGE_DESCRIPTION_INSTRUCTION, Chatterer
|
37
|
+
from ..utils.base64_image import ImageProcessingConfig, get_default_image_processing_config, is_remote_url
|
38
|
+
from .caption_markdown_images import acaption_markdown_images, caption_markdown_images
|
39
|
+
from .convert_to_text import HtmlToMarkdownOptions, get_default_html_to_markdown_options, html_to_markdown
|
40
|
+
|
41
|
+
if TYPE_CHECKING:
|
42
|
+
import playwright.async_api
|
43
|
+
import playwright.sync_api
|
44
|
+
|
45
|
+
WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
46
|
+
DEFAULT_UA: str = (
|
47
|
+
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
|
48
48
|
)
|
49
49
|
|
50
50
|
|
51
|
+
# Structured model carrying inclusive line ranges encoded as "start-end" strings.
# NOTE: documented with a comment rather than a docstring so the model's
# generated schema stays exactly as before.
class SelectedLineRanges(BaseModel):
    line_ranges: list[str] = Field(
        description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']",
    )
|
53
|
+
|
54
|
+
|
55
|
+
class PlaywrightLaunchOptions(TypedDict):
|
56
|
+
executable_path: NotRequired[str | Path]
|
57
|
+
channel: NotRequired[str]
|
58
|
+
args: NotRequired[Sequence[str]]
|
59
|
+
ignore_default_args: NotRequired[bool | Sequence[str]]
|
60
|
+
handle_sigint: NotRequired[bool]
|
61
|
+
handle_sigterm: NotRequired[bool]
|
62
|
+
handle_sighup: NotRequired[bool]
|
63
|
+
timeout: NotRequired[float]
|
64
|
+
env: NotRequired[dict[str, str | float | bool]]
|
65
|
+
headless: NotRequired[bool]
|
66
|
+
devtools: NotRequired[bool]
|
67
|
+
proxy: NotRequired[playwright.sync_api.ProxySettings]
|
68
|
+
downloads_path: NotRequired[str | Path]
|
69
|
+
slow_mo: NotRequired[float]
|
70
|
+
traces_dir: NotRequired[str | Path]
|
71
|
+
chromium_sandbox: NotRequired[bool]
|
72
|
+
firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
|
73
|
+
|
74
|
+
|
75
|
+
class PlaywrightPersistencyOptions(TypedDict):
|
76
|
+
user_data_dir: NotRequired[str | Path]
|
77
|
+
storage_state: NotRequired[playwright.sync_api.StorageState]
|
78
|
+
|
79
|
+
|
80
|
+
class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions):
    """Combined TypedDict exposing both the launch keys and the persistency keys."""
|
81
|
+
|
82
|
+
|
83
|
+
def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
    """Return a fresh launch-options dictionary that only requests a headless browser."""
    defaults: PlaywrightLaunchOptions = {"headless": True}
    return defaults
|
85
|
+
|
86
|
+
|
51
87
|
@dataclass
|
52
88
|
class PlayWrightBot:
|
53
89
|
"""
|
@@ -73,7 +109,8 @@ class PlayWrightBot:
|
|
73
109
|
chatterer (Chatterer): An instance of the language model interface for processing text.
|
74
110
|
"""
|
75
111
|
|
76
|
-
|
112
|
+
engine: Literal["firefox", "chromium", "webkit"] = "firefox"
|
113
|
+
chatterer: Optional[Chatterer] = field(default_factory=Chatterer.openai)
|
77
114
|
playwright_launch_options: PlaywrightLaunchOptions = field(default_factory=get_default_playwright_launch_options)
|
78
115
|
playwright_persistency_options: PlaywrightPersistencyOptions = field(default_factory=PlaywrightPersistencyOptions)
|
79
116
|
html_to_markdown_options: HtmlToMarkdownOptions = field(default_factory=get_default_html_to_markdown_options)
|
@@ -109,28 +146,43 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
109
146
|
|
110
147
|
def get_sync_playwright(self) -> playwright.sync_api.Playwright:
|
111
148
|
if self.sync_playwright is None:
|
112
|
-
|
149
|
+
from playwright.sync_api import sync_playwright
|
150
|
+
|
151
|
+
self.sync_playwright = sync_playwright().start()
|
113
152
|
return self.sync_playwright
|
114
153
|
|
115
154
|
async def get_async_playwright(self) -> playwright.async_api.Playwright:
|
116
155
|
if self.async_playwright is None:
|
117
|
-
|
156
|
+
from playwright.async_api import async_playwright
|
157
|
+
|
158
|
+
self.async_playwright = await async_playwright().start()
|
118
159
|
return self.async_playwright
|
119
160
|
|
120
161
|
def get_sync_browser(self) -> playwright.sync_api.BrowserContext:
|
121
162
|
if self.sync_browser_context is not None:
|
122
163
|
return self.sync_browser_context
|
123
164
|
|
165
|
+
def get_browser() -> playwright.sync_api.BrowserType:
|
166
|
+
playwright = self.get_sync_playwright()
|
167
|
+
if self.engine == "firefox":
|
168
|
+
return playwright.firefox
|
169
|
+
elif self.engine == "chromium":
|
170
|
+
return playwright.chromium
|
171
|
+
elif self.engine == "webkit":
|
172
|
+
return playwright.webkit
|
173
|
+
else:
|
174
|
+
raise ValueError(f"Unsupported engine: {self.engine}")
|
175
|
+
|
124
176
|
user_data_dir = self.playwright_persistency_options.get("user_data_dir")
|
125
177
|
if user_data_dir:
|
126
178
|
# Use persistent context if user_data_dir is provided
|
127
|
-
self.sync_browser_context =
|
179
|
+
self.sync_browser_context = get_browser().launch_persistent_context(
|
128
180
|
user_data_dir=user_data_dir, **self.playwright_launch_options
|
129
181
|
)
|
130
182
|
return self.sync_browser_context
|
131
183
|
|
132
184
|
# Otherwise, launch a new context
|
133
|
-
browser =
|
185
|
+
browser = get_browser().launch(**self.playwright_launch_options)
|
134
186
|
storage_state = self.playwright_persistency_options.get("storage_state")
|
135
187
|
if storage_state:
|
136
188
|
self.sync_browser_context = browser.new_context(storage_state=storage_state)
|
@@ -142,16 +194,27 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
142
194
|
if self.async_browser_context is not None:
|
143
195
|
return self.async_browser_context
|
144
196
|
|
197
|
+
async def get_browser() -> playwright.async_api.BrowserType:
|
198
|
+
playwright = await self.get_async_playwright()
|
199
|
+
if self.engine == "firefox":
|
200
|
+
return playwright.firefox
|
201
|
+
elif self.engine == "chromium":
|
202
|
+
return playwright.chromium
|
203
|
+
elif self.engine == "webkit":
|
204
|
+
return playwright.webkit
|
205
|
+
else:
|
206
|
+
raise ValueError(f"Unsupported engine: {self.engine}")
|
207
|
+
|
145
208
|
user_data_dir = self.playwright_persistency_options.get("user_data_dir")
|
146
209
|
if user_data_dir:
|
147
210
|
# Use persistent context if user_data_dir is provided
|
148
|
-
self.async_browser_context = await (await
|
211
|
+
self.async_browser_context = await (await get_browser()).launch_persistent_context(
|
149
212
|
user_data_dir=user_data_dir, **self.playwright_launch_options
|
150
213
|
)
|
151
214
|
return self.async_browser_context
|
152
215
|
|
153
216
|
# Otherwise, launch a new context
|
154
|
-
browser = await (await
|
217
|
+
browser = await (await get_browser()).launch(**self.playwright_launch_options)
|
155
218
|
storage_state = self.playwright_persistency_options.get("storage_state")
|
156
219
|
if storage_state:
|
157
220
|
self.async_browser_context = await browser.new_context(storage_state=storage_state)
|
@@ -232,18 +295,24 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
232
295
|
Returns:
|
233
296
|
str: The page content converted to Markdown.
|
234
297
|
"""
|
235
|
-
page
|
236
|
-
if
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
298
|
+
page: Optional[playwright.sync_api.Page] = None
|
299
|
+
if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
|
300
|
+
with open(url, "r", encoding="utf-8") as f:
|
301
|
+
html = f.read()
|
302
|
+
else:
|
303
|
+
page = self.get_page(url, timeout=timeout, referer=referer)
|
304
|
+
if wait:
|
305
|
+
page.wait_for_timeout(wait * 1000)
|
306
|
+
if scrolldown:
|
307
|
+
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
308
|
+
if sleep:
|
309
|
+
page.wait_for_timeout(sleep * 1000)
|
310
|
+
if reload:
|
311
|
+
page.reload(timeout=int(timeout * 1000))
|
312
|
+
html = page.content()
|
313
|
+
|
245
314
|
md = html_to_markdown(html=html, options=self.html_to_markdown_options)
|
246
|
-
if not keep_page:
|
315
|
+
if not keep_page and page is not None:
|
247
316
|
page.close()
|
248
317
|
return md
|
249
318
|
|
@@ -275,18 +344,23 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
275
344
|
Returns:
|
276
345
|
str: The page content converted to Markdown.
|
277
346
|
"""
|
278
|
-
page
|
279
|
-
if
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
347
|
+
page: Optional[playwright.async_api.Page] = None
|
348
|
+
if not is_remote_url(url) and Path(url).is_file() and Path(url).suffix.lower() == ".html":
|
349
|
+
with open(url, "r", encoding="utf-8") as f:
|
350
|
+
html = f.read()
|
351
|
+
else:
|
352
|
+
page = await self.aget_page(url, timeout=timeout, referer=referer)
|
353
|
+
if wait:
|
354
|
+
await page.wait_for_timeout(wait * 1000)
|
355
|
+
if scrolldown:
|
356
|
+
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
|
357
|
+
if sleep:
|
358
|
+
await page.wait_for_timeout(sleep * 1000)
|
359
|
+
if reload:
|
360
|
+
await page.reload(timeout=int(timeout * 1000))
|
361
|
+
html = await page.content()
|
288
362
|
md = html_to_markdown(html=html, options=self.html_to_markdown_options)
|
289
|
-
if not keep_page:
|
363
|
+
if not keep_page and page is not None:
|
290
364
|
await page.close()
|
291
365
|
return md
|
292
366
|
|
@@ -417,6 +491,8 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
417
491
|
Returns:
|
418
492
|
str: Filtered Markdown containing only the important lines.
|
419
493
|
"""
|
494
|
+
if self.chatterer is None:
|
495
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
420
496
|
markdown_content = self.url_to_md(
|
421
497
|
url,
|
422
498
|
wait=wait,
|
@@ -498,6 +574,8 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
498
574
|
Returns:
|
499
575
|
str: Filtered Markdown containing only the important lines.
|
500
576
|
"""
|
577
|
+
if self.chatterer is None:
|
578
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
501
579
|
markdown_content = await self.aurl_to_md(
|
502
580
|
url,
|
503
581
|
wait=wait,
|
@@ -542,75 +620,87 @@ Markdown-formatted webpage content is provided below for your reference:
|
|
542
620
|
def describe_images(self, markdown_text: str, referer_url: str) -> str:
|
543
621
|
"""
|
544
622
|
Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
|
623
|
+
Using Playwright for fetching images to bypass CDN protections.
|
545
624
|
"""
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
headers=self.headers | {"Referer": referer_url},
|
550
|
-
config=self.image_processing_config,
|
551
|
-
)
|
552
|
-
)
|
553
|
-
|
554
|
-
image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
|
555
|
-
for image_url, markdown_links in image_url_and_markdown_links.items():
|
556
|
-
if image_url is not None:
|
557
|
-
try:
|
558
|
-
image_summary: str = self.chatterer.describe_image(
|
559
|
-
image_url=image_url.data_uri,
|
560
|
-
instruction=self.image_description_instruction,
|
561
|
-
)
|
562
|
-
except Exception:
|
563
|
-
print_exc()
|
564
|
-
continue
|
565
|
-
image_description_and_references[image_summary] = markdown_links
|
566
|
-
else:
|
567
|
-
image_description_and_references[None] = markdown_links
|
568
|
-
|
569
|
-
return replace_images(
|
625
|
+
if self.chatterer is None:
|
626
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
627
|
+
return caption_markdown_images(
|
570
628
|
markdown_text=markdown_text,
|
571
|
-
|
629
|
+
headers=self.headers | {"Referer": referer_url},
|
572
630
|
description_format=self.description_format,
|
631
|
+
image_description_instruction=self.image_description_instruction,
|
632
|
+
chatterer=self.chatterer,
|
633
|
+
image_processing_config=self.image_processing_config,
|
634
|
+
img_bytes_fetcher=self._playwright_fetch_image_bytes,
|
573
635
|
)
|
574
636
|
|
637
|
+
# Asynchronous counterpart of describe_images (revised to fetch images through Playwright).
|
575
638
|
async def adescribe_images(self, markdown_text: str, referer_url: str) -> str:
|
576
639
|
"""
|
577
640
|
Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
|
641
|
+
Using Playwright for fetching images to bypass CDN protections.
|
578
642
|
"""
|
579
|
-
|
580
|
-
|
581
|
-
|
643
|
+
if self.chatterer is None:
|
644
|
+
raise ValueError("Chatterer instance is not set. Please provide a valid Chatterer instance.")
|
645
|
+
return await acaption_markdown_images(
|
582
646
|
markdown_text=markdown_text,
|
583
647
|
headers=self.headers | {"Referer": referer_url},
|
584
|
-
|
648
|
+
description_format=self.description_format,
|
649
|
+
image_description_instruction=self.image_description_instruction,
|
650
|
+
chatterer=self.chatterer,
|
651
|
+
image_processing_config=self.image_processing_config,
|
652
|
+
img_bytes_fetcher=self._aplaywright_fetch_image_bytes,
|
585
653
|
)
|
586
654
|
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
if
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
604
|
-
|
605
|
-
|
606
|
-
|
607
|
-
|
608
|
-
|
655
|
+
def _playwright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
    """Fetch image bytes synchronously by navigating a Playwright page to the image URL.

    Args:
        image_url: URL of the image to load.
        headers: Extra HTTP headers applied to the requests made by the temporary page.

    Returns:
        The response body when the navigation yields an OK response; ``b""`` when
        there is no response, the response is not OK, or any exception occurs.
        Failures are printed, never raised (best-effort fetch).
    """
    page: Optional[playwright.sync_api.Page] = None
    try:
        # Open a throwaway page on the bot's synchronous browser context.
        page = self.get_sync_browser().new_page()

        # Extra headers apply to all subsequent requests made by this page.
        page.set_extra_http_headers(headers)
        response = page.goto(image_url, wait_until="load", timeout=15000)
        if response and response.ok:
            return response.body()

        # Report the failure like the async variant does, instead of
        # silently returning empty bytes.
        print(
            f"Playwright failed to fetch image: {image_url}, Status: {response.status if response else 'No Response'}"
        )
        return b""
    except Exception as e:
        print(f"Playwright exception fetching image: {image_url}, Error: {e}")
        return b""
    finally:
        # Always close the temporary page so pages do not accumulate.
        if page:
            page.close()
|
676
|
+
|
677
|
+
async def _aplaywright_fetch_image_bytes(self, image_url: str, headers: dict[str, str]) -> bytes:
    """Fetch image bytes asynchronously by navigating a Playwright page to the image URL.

    Args:
        image_url: URL of the image to load.
        headers: Extra HTTP headers applied to the requests made by the temporary page.

    Returns:
        The response body when the navigation yields an OK response; ``b""`` when
        there is no response, the response is not OK, or any exception occurs.
        Failures are printed, never raised (best-effort fetch).
    """
    page: Optional[playwright.async_api.Page] = None
    try:
        # Open a throwaway page on the bot's asynchronous browser context.
        context = await self.get_async_browser()
        page = await context.new_page()

        # Extra headers apply to all subsequent requests made by this page.
        await page.set_extra_http_headers(headers)
        response = await page.goto(image_url, wait_until="load", timeout=15000)
        if not (response and response.ok):
            # Report the failed fetch and fall back to empty bytes.
            print(
                f"Playwright failed to fetch image: {image_url}, Status: {response.status if response else 'No Response'}"
            )
            return b""
        return await response.body()
    except Exception as e:
        # Report the exception and fall back to empty bytes.
        print(f"Playwright exception fetching image: {image_url}, Error: {e}")
        return b""
    finally:
        # Always close the temporary page to release its resources.
        if page:
            await page.close()
|
614
704
|
|
615
705
|
def __enter__(self) -> Self:
|
616
706
|
return self
|
chatterer/tools/youtube.py
CHANGED
@@ -29,7 +29,7 @@ def get_youtube_video_details(
|
|
29
29
|
def get_youtube_video_subtitle(video_id: str) -> str:
|
30
30
|
"""Get the transcript of a YouTube video using the given video ID."""
|
31
31
|
|
32
|
-
from youtube_transcript_api
|
32
|
+
from youtube_transcript_api import YouTubeTranscriptApi # pyright: ignore[reportPrivateImportUsage]
|
33
33
|
|
34
34
|
get_transcript = YouTubeTranscriptApi.get_transcript # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
35
35
|
list_transcripts = YouTubeTranscriptApi.list_transcripts # pyright: ignore[reportUnknownMemberType, reportUnknownVariableType]
|
@@ -144,3 +144,4 @@ class YoutubeSearchResult:
|
|
144
144
|
if __name__ == "__main__":
|
145
145
|
print(get_youtube_video_details("BTS"))
|
146
146
|
# print(get_youtube_transcript("y7jrpS8GHxs"))
|
147
|
+
|
chatterer/utils/__init__.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1
|
+
from .base64_image import Base64Image
|
2
|
+
from .cli import ArgumentSpec, BaseArguments
|
1
3
|
from .code_agent import (
|
2
4
|
CodeExecutionResult,
|
3
5
|
FunctionSignature,
|
4
6
|
get_default_repl_tool,
|
5
7
|
insert_callables_into_global,
|
6
8
|
)
|
7
|
-
from .image import Base64Image
|
8
9
|
|
9
10
|
__all__ = [
|
10
11
|
"Base64Image",
|
@@ -12,4 +13,6 @@ __all__ = [
|
|
12
13
|
"CodeExecutionResult",
|
13
14
|
"get_default_repl_tool",
|
14
15
|
"insert_callables_into_global",
|
16
|
+
"BaseArguments",
|
17
|
+
"ArgumentSpec",
|
15
18
|
]
|
@@ -7,6 +7,7 @@ from logging import getLogger
|
|
7
7
|
from pathlib import Path
|
8
8
|
from typing import (
|
9
9
|
Awaitable,
|
10
|
+
Callable,
|
10
11
|
ClassVar,
|
11
12
|
Literal,
|
12
13
|
NotRequired,
|
@@ -18,7 +19,6 @@ from typing import (
|
|
18
19
|
TypeGuard,
|
19
20
|
cast,
|
20
21
|
get_args,
|
21
|
-
overload,
|
22
22
|
)
|
23
23
|
from urllib.parse import urlparse
|
24
24
|
|
@@ -55,10 +55,11 @@ def get_default_image_processing_config() -> ImageProcessingConfig:
|
|
55
55
|
"min_largest_side": 200,
|
56
56
|
"resize_if_min_side_exceeds": 2000,
|
57
57
|
"resize_target_for_min_side": 1000,
|
58
|
-
"formats": ["png", "jpeg", "gif", "bmp", "webp"],
|
58
|
+
"formats": ["png", "jpeg", "jpg", "gif", "bmp", "webp"],
|
59
59
|
}
|
60
60
|
|
61
61
|
|
62
|
+
# Custom image-byte fetchers are called as fetcher(image_url: str, headers: dict[str, str]) and return bytes.
|
62
63
|
class Base64Image(BaseModel):
|
63
64
|
ext: ImageType
|
64
65
|
data: str
|
@@ -86,44 +87,51 @@ class Base64Image(BaseModel):
|
|
86
87
|
def from_bytes(cls, data: bytes, ext: ImageType) -> Self:
    """Build an instance from raw image bytes by base64-encoding them.

    Args:
        data: Raw image bytes.
        ext: Image type stored alongside the encoded payload.
    """
    encoded = b64encode(data).decode("utf-8")
    return cls(ext=ext, data=encoded)
|
88
89
|
|
89
|
-
@overload
|
90
90
|
@classmethod
def from_url_or_path(
    cls,
    url_or_path: str,
    *,
    headers: dict[str, str] = {},
    config: ImageProcessingConfig = get_default_image_processing_config(),
    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
) -> Optional[Self]:
    """Build a Base64Image from an inline image string, a remote URL, or a local path.

    Resolution order:
      1. ``cls.from_string`` - if it yields a value, that value is returned.
      2. Remote URL - bytes come from ``img_bytes_fetcher`` when given, otherwise
         from ``cls._fetch_remote_image``; they are then converted with ``config``.
      3. Anything else is treated as a local file path.

    Args:
        url_or_path: Inline image string, remote URL, or local file path.
        headers: HTTP headers handed to the fetcher for remote URLs.
            NOTE: the default dict is shared between calls; this method only reads it.
        config: Image processing rules applied during conversion.
        img_bytes_fetcher: Optional replacement for the built-in remote fetcher.

    Returns:
        The image, or ``None`` when the remote fetch returns no bytes or the
        local file cannot be processed.
    """
    inline_image = cls.from_string(url_or_path)
    if inline_image:
        return inline_image

    if not is_remote_url(url_or_path):
        # Local file: any processing failure is reported as "no image".
        try:
            return cls._process_local_image(Path(url_or_path), config)
        except Exception:
            return None

    fetch = img_bytes_fetcher or cls._fetch_remote_image
    raw_bytes = fetch(url_or_path, headers)
    if not raw_bytes:
        return None
    return cls._convert_image_into_base64(raw_bytes, config)
|
110
114
|
|
111
115
|
@classmethod
|
112
|
-
def
|
116
|
+
async def afrom_url_or_path(
|
113
117
|
cls,
|
114
118
|
url_or_path: str,
|
115
119
|
*,
|
116
120
|
headers: dict[str, str] = {},
|
117
121
|
config: ImageProcessingConfig = get_default_image_processing_config(),
|
118
|
-
|
119
|
-
) -> Optional[Self]
|
122
|
+
img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
|
123
|
+
) -> Optional[Self]:
|
120
124
|
"""Return a Base64Image instance from a URL or local file path."""
|
121
125
|
if maybe_base64 := cls.from_string(url_or_path):
|
122
126
|
return maybe_base64
|
123
|
-
elif
|
124
|
-
if
|
125
|
-
|
126
|
-
|
127
|
+
elif is_remote_url(url_or_path):
|
128
|
+
if img_bytes_fetcher:
|
129
|
+
img_bytes = await img_bytes_fetcher(url_or_path, headers)
|
130
|
+
else:
|
131
|
+
img_bytes = await cls._afetch_remote_image(url_or_path, headers)
|
132
|
+
if not img_bytes:
|
133
|
+
return None
|
134
|
+
return cls._convert_image_into_base64(img_bytes, config)
|
127
135
|
try:
|
128
136
|
return cls._process_local_image(Path(url_or_path), config)
|
129
137
|
except Exception:
|
@@ -142,20 +150,27 @@ class Base64Image(BaseModel):
|
|
142
150
|
return ext in allowed_types
|
143
151
|
|
144
152
|
@classmethod
def _fetch_remote_image(cls, url: str, headers: dict[str, str], *, timeout: float = 15.0) -> bytes:
    """Download an image over HTTP with ``requests`` (best effort).

    Args:
        url: Remote image URL; surrounding whitespace is stripped before the request.
        headers: HTTP headers to send; values are coerced to ``str``.
        timeout: Seconds to wait for the server before giving up. Requests applies
            no timeout unless one is passed, so without it a stalled server would
            block the caller indefinitely.

    Returns:
        The response body, or ``b""`` on an HTTP error status, a timeout, or any
        other exception.
    """
    try:
        with requests.Session() as session:
            response = session.get(
                url.strip(),
                headers={k: str(v) for k, v in headers.items()},
                timeout=timeout,
            )
            response.raise_for_status()
            # An empty body already evaluates to b"", so no separate empty check is needed.
            return bytes(response.content or b"")
    except Exception:
        # Deliberate best-effort: callers treat empty bytes as "image unavailable".
        return b""
|
150
164
|
|
151
165
|
@classmethod
async def _afetch_remote_image(cls, url: str, headers: dict[str, str]) -> bytes:
    """Download an image over HTTP with an aiohttp ``ClientSession`` (best effort).

    Args:
        url: Remote image URL; surrounding whitespace is stripped before the request.
        headers: HTTP headers to send; values are coerced to ``str``.

    Returns:
        The response body, or ``b""`` on an HTTP error status or any other exception.
    """
    try:
        target = url.strip()
        request_headers = {name: str(value) for name, value in headers.items()}
        async with ClientSession() as session, session.get(target, headers=request_headers) as response:
            response.raise_for_status()
            payload = await response.read()
            return payload
    except Exception:
        # Deliberate best-effort: callers treat empty bytes as "image unavailable".
        return b""
|
159
174
|
|
160
175
|
@classmethod
|
161
176
|
def _convert_image_into_base64(cls, image_data: bytes, config: Optional[ImageProcessingConfig]) -> Optional[Self]:
|
@@ -163,6 +178,7 @@ class Base64Image(BaseModel):
|
|
163
178
|
Retrieve an image in bytes and return a base64-encoded data URL,
|
164
179
|
applying dynamic rules from 'config'.
|
165
180
|
"""
|
181
|
+
|
166
182
|
if not config:
|
167
183
|
# config 없으면 그냥 기존 헤더만 보고 돌려주는 간단 로직
|
168
184
|
return cls._simple_base64_encode(image_data)
|
@@ -225,7 +241,7 @@ class Base64Image(BaseModel):
|
|
225
241
|
"""
|
226
242
|
Retrieve an image URL and return a base64-encoded data URL.
|
227
243
|
"""
|
228
|
-
ext =
|
244
|
+
ext = detect_image_type(image_data)
|
229
245
|
if not ext:
|
230
246
|
return
|
231
247
|
return cls(ext=ext, data=b64encode(image_data).decode("utf-8"))
|
@@ -241,12 +257,12 @@ class Base64Image(BaseModel):
|
|
241
257
|
return cls(ext=ext, data=b64encode(path.read_bytes()).decode("ascii"))
|
242
258
|
|
243
259
|
|
244
|
-
def
|
260
|
+
def is_remote_url(path: str) -> bool:
    """Return True when *path* parses as a URL with both a scheme and a network location.

    Plain relative/absolute file paths and Windows drive paths have no network
    location and therefore yield False.

    Args:
        path: Candidate URL or filesystem path.

    Returns:
        True if ``urlparse`` finds a non-empty scheme and netloc, else False.
        Strings that ``urlparse`` rejects outright are reported as not remote.
    """
    try:
        parsed = urlparse(path)
    except ValueError:
        # urlparse raises on malformed netlocs (e.g. an unclosed IPv6 bracket);
        # a predicate should answer False rather than propagate that.
        return False
    return bool(parsed.scheme and parsed.netloc)
|
247
263
|
|
248
264
|
|
249
|
-
def
|
265
|
+
def detect_image_type(image_data: bytes) -> Optional[ImageType]:
|
250
266
|
"""
|
251
267
|
Detect the image format based on the image binary signature (header).
|
252
268
|
Only JPEG, PNG, GIF, WEBP, and BMP are handled as examples.
|
@@ -267,25 +283,3 @@ def _detect_image_type(image_data: bytes) -> Optional[ImageType]:
|
|
267
283
|
# BMP: 시작 바이트가 BM
|
268
284
|
elif image_data.startswith(b"BM"):
|
269
285
|
return "bmp"
|
270
|
-
|
271
|
-
|
272
|
-
def _get_image_bytes(image_url: str, headers: dict[str, str]) -> Optional[bytes]:
|
273
|
-
try:
|
274
|
-
with requests.Session() as session:
|
275
|
-
response = session.get(image_url, headers={k: str(v) for k, v in headers.items()})
|
276
|
-
if not response.ok:
|
277
|
-
return
|
278
|
-
return bytes(response.content or b"")
|
279
|
-
except Exception:
|
280
|
-
return
|
281
|
-
|
282
|
-
|
283
|
-
async def _aget_image_bytes(image_url: str, headers: dict[str, str]) -> Optional[bytes]:
|
284
|
-
try:
|
285
|
-
async with ClientSession() as session:
|
286
|
-
async with session.get(image_url, headers={k: str(v) for k, v in headers.items()}) as response:
|
287
|
-
if not response.ok:
|
288
|
-
return
|
289
|
-
return await response.read()
|
290
|
-
except Exception:
|
291
|
-
return
|