chatterer 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,556 +1,329 @@
1
- from __future__ import annotations
2
-
3
- import os.path
4
- import re
5
- from base64 import b64encode
6
- from io import BytesIO
7
- from pathlib import Path
8
- from traceback import print_exc
9
- from typing import (
10
- ClassVar,
11
- Literal,
12
- NamedTuple,
13
- NewType,
14
- NotRequired,
15
- Optional,
16
- Self,
17
- Sequence,
18
- TypeAlias,
19
- TypedDict,
20
- TypeGuard,
21
- cast,
22
- )
23
- from urllib.parse import urljoin, urlparse
24
-
25
- import mistune
26
- import playwright.sync_api
27
- import requests
28
- from aiohttp import ClientSession
29
- from PIL.Image import Resampling
30
- from PIL.Image import open as image_open
31
- from pydantic import BaseModel, Field
32
-
33
-
34
class SelectedLineRanges(BaseModel):
    """Pydantic model for the selected line ranges returned by the LLM."""

    # Each entry is an inclusive "start-end" range string.
    line_ranges: list[str] = Field(
        description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']",
    )
37
-
38
-
39
class PlaywrightLaunchOptions(TypedDict):
    """Optional browser-launch settings; every key may be omitted.

    NOTE(review): the keys presumably mirror Playwright's browser launch
    keyword arguments -- confirm against the Playwright version in use.
    """

    # --- binary selection / command line ---
    executable_path: NotRequired[str | Path]
    channel: NotRequired[str]
    args: NotRequired[Sequence[str]]
    ignore_default_args: NotRequired[bool | Sequence[str]]
    # --- signal handling ---
    handle_sigint: NotRequired[bool]
    handle_sigterm: NotRequired[bool]
    handle_sighup: NotRequired[bool]
    # --- runtime behaviour ---
    timeout: NotRequired[float]
    env: NotRequired[dict[str, str | float | bool]]
    headless: NotRequired[bool]
    devtools: NotRequired[bool]
    proxy: NotRequired[playwright.sync_api.ProxySettings]
    downloads_path: NotRequired[str | Path]
    slow_mo: NotRequired[float]
    traces_dir: NotRequired[str | Path]
    # --- browser-specific ---
    chromium_sandbox: NotRequired[bool]
    firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
57
-
58
-
59
class PlaywrightPersistencyOptions(TypedDict):
    """Optional settings describing persisted browser state; both keys may be omitted."""

    # Directory holding a browser profile.
    user_data_dir: NotRequired[str | Path]
    # Playwright storage-state object.
    storage_state: NotRequired[playwright.sync_api.StorageState]
62
-
63
-
64
class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions):
    """Union of the launch options and the persistency options; adds no keys of its own."""
65
-
66
-
67
def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
    """Return a new options dict containing only ``headless=True``."""
    defaults: PlaywrightLaunchOptions = {"headless": True}
    return defaults
69
-
70
-
71
- class ImageProcessingConfig(TypedDict):
72
- """
73
- 이미지 필터링/변환 시 사용할 설정.
74
- - formats: (Sequence[str]) 허용할 이미지 포맷(소문자, 예: ["jpeg", "png", "webp"]).
75
- - max_size_mb: (float) 이미지 용량 상한(MB). 초과 시 제외.
76
- - min_largest_side: (int) 가로나 세로 중 가장 큰 변의 최소 크기. 미만 시 제외.
77
- - resize_if_min_side_exceeds: (int) 가로나 세로 중 작은 변이 이 값 이상이면 리스케일.
78
- - resize_target_for_min_side: (int) 리스케일시, '가장 작은 변'을 이 값으로 줄임(비율 유지는 Lanczos).
79
- """
80
-
81
- formats: Sequence[str]
82
- max_size_mb: NotRequired[float]
83
- min_largest_side: NotRequired[int]
84
- resize_if_min_side_exceeds: NotRequired[int]
85
- resize_target_for_min_side: NotRequired[int]
86
-
87
-
88
def get_default_image_processing_config() -> ImageProcessingConfig:
    """Return a new config dict populated with the module's default thresholds and formats."""
    accepted_formats = ["png", "jpg", "jpeg", "gif", "bmp", "webp"]
    defaults: ImageProcessingConfig = {
        "max_size_mb": 5,
        "min_largest_side": 200,
        "resize_if_min_side_exceeds": 2000,
        "resize_target_for_min_side": 1000,
        "formats": accepted_formats,
    }
    return defaults
96
-
97
-
98
class _TrackingInlineState(mistune.InlineState):
    """Inline state that additionally carries ``meta_offset``."""

    # Where in the original text does self.src start?
    meta_offset: int = 0

    def copy(self) -> Self:
        """Return a new state with the same env, src, flags and offset but no tokens."""
        clone = type(self)(self.env)
        clone.tokens = []
        for attr in ("src", "in_image", "in_link", "in_emphasis", "in_strong", "meta_offset"):
            setattr(clone, attr, getattr(self, attr))
        return clone
111
-
112
-
113
- class MarkdownLink(NamedTuple):
114
- type: Literal["link", "image"]
115
- url: str
116
- text: str
117
- title: Optional[str]
118
- pos: int
119
- end_pos: int
120
-
121
- @classmethod
122
- def from_markdown(cls, markdown_text: str, referer_url: Optional[str]) -> list[Self]:
123
- """
124
- The main function that returns the list of MarkdownLink for the input text.
125
- For simplicity, we do a "pure inline parse" of the entire text
126
- instead of letting the block parser break it up. That ensures that
127
- link tokens cover the global positions of the entire input.
128
- """
129
- md = mistune.Markdown(inline=_TrackingInlineParser())
130
- # Create an inline state that references the full text.
131
- state = _TrackingInlineState({})
132
- state.src = markdown_text
133
-
134
- # Instead of calling md.parse, we can directly run the inline parser on
135
- # the entire text, so that positions match the entire input:
136
- md.inline.parse(state)
137
-
138
- # Now gather all the link info from the tokens.
139
- return cls._extract_links(tokens=state.tokens, referer_url=referer_url)
140
-
141
- @property
142
- def inline_text(self) -> str:
143
- return self.text.replace("\n", " ").strip()
144
-
145
- @property
146
- def inline_title(self) -> str:
147
- return self.title.replace("\n", " ").strip() if self.title else ""
148
-
149
- @property
150
- def link_markdown(self) -> str:
151
- if self.title:
152
- return f'[{self.inline_text}]({self.url} "{self.inline_title}")'
153
- return f"[{self.inline_text}]({self.url})"
154
-
155
- @classmethod
156
- def replace(cls, text: str, replacements: list[tuple[Self, str]]) -> str:
157
- for self, replacement in sorted(replacements, key=lambda x: x[0].pos, reverse=True):
158
- text = text[: self.pos] + replacement + text[self.end_pos :]
159
- return text
160
-
161
- @classmethod
162
- def _extract_links(cls, tokens: list[dict[str, object]], referer_url: Optional[str]) -> list[Self]:
163
- results: list[Self] = []
164
- for token in tokens:
165
- if (
166
- (type := token.get("type")) in ("link", "image")
167
- and "global_pos" in token
168
- and "attrs" in token
169
- and _attrs_typeguard(attrs := token["attrs"])
170
- and "url" in attrs
171
- and _url_typeguard(url := attrs["url"])
172
- and _global_pos_typeguard(global_pos := token["global_pos"])
173
- ):
174
- if referer_url:
175
- url = _to_absolute_path(path=url, referer=referer_url)
176
- children: object | None = token.get("children")
177
- if _children_typeguard(children):
178
- text = _extract_text(children)
179
- else:
180
- text = ""
181
-
182
- if "title" in attrs:
183
- title = str(attrs["title"])
184
- else:
185
- title = None
186
-
187
- start, end = global_pos
188
- results.append(cls(type, url, text, title, start, end))
189
- if "children" in token and _children_typeguard(children := token["children"]):
190
- results.extend(cls._extract_links(children, referer_url))
191
-
192
- return results
193
-
194
-
195
class _TrackingInlineParser(mistune.InlineParser):
    """Inline parser that tags each parsed link token with a ``global_pos`` span."""

    state_cls: ClassVar = _TrackingInlineState

    def parse_link(  # pyright: ignore[reportIncompatibleMethodOverride]
        self, m: re.Match[str], state: _TrackingInlineState
    ) -> Optional[int]:
        """
        Delegate to mistune's ``parse_link`` and record where the link sits.

        The parent returns either None or an int (the new position within the
        state's source). Only on an int result, and only if the state holds at
        least one token, is the last token annotated with
        ``(offset + m.start(), offset + new_pos)``, where ``offset`` is
        ``state.meta_offset`` as read before delegating.
        """
        base = state.meta_offset
        new_pos: int | None = super().parse_link(m, state)
        if new_pos is None or not state.tokens:
            return new_pos
        # The link token that was just added is expected to be the last one.
        state.tokens[-1]["global_pos"] = (base + m.start(), base + new_pos)
        return new_pos
221
-
222
-
223
- # --------------------------------------------------------------------
224
- # Type Guards & Helper to gather plain text from nested tokens (for the link text).
225
- # --------------------------------------------------------------------
226
- def _children_typeguard(obj: object) -> TypeGuard[list[dict[str, object]]]:
227
- if not isinstance(obj, list):
228
- return False
229
- return all(isinstance(i, dict) for i in cast(list[object], obj))
230
-
231
-
232
- def _attrs_typeguard(obj: object) -> TypeGuard[dict[str, object]]:
233
- if not isinstance(obj, dict):
234
- return False
235
- return all(isinstance(k, str) for k in cast(dict[object, object], obj))
236
-
237
-
238
- def _global_pos_typeguard(obj: object) -> TypeGuard[tuple[int, int]]:
239
- if not isinstance(obj, tuple):
240
- return False
241
- obj = cast(tuple[object, ...], obj)
242
- if len(obj) != 2:
243
- return False
244
- return all(isinstance(i, int) for i in obj)
245
-
246
-
247
- def _url_typeguard(obj: object) -> TypeGuard[str]:
248
- return isinstance(obj, str)
249
-
250
-
251
def _extract_text(tokens: list[dict[str, object]]) -> str:
    """Concatenate the ``raw`` value of every ``text`` token, descending into ``children``."""

    def _pieces():
        for token in tokens:
            if token.get("type") == "text":
                yield str(token.get("raw", ""))
                continue
            # A missing or malformed "children" entry is skipped.
            nested = token.get("children")
            if _children_typeguard(nested):
                yield _extract_text(nested)

    return "".join(_pieces())
262
-
263
-
264
- def _is_url(path: str) -> bool:
265
- """
266
- path가 절대 URL 형태인지 여부를 bool로 반환
267
- (scheme과 netloc이 모두 존재하면 URL로 간주)
268
- """
269
- parsed = urlparse(path)
270
- return bool(parsed.scheme and parsed.netloc)
271
-
272
-
273
- def _to_absolute_path(path: str, referer: str) -> str:
274
- """
275
- path : 변환할 경로(상대/절대 경로 혹은 URL일 수도 있음)
276
- referer : 기준이 되는 절대경로(혹은 URL)
277
- """
278
- # referer가 URL인지 파일 경로인지 먼저 판별
279
- ref_parsed = urlparse(referer)
280
- is_referer_url = bool(ref_parsed.scheme and ref_parsed.netloc)
281
-
282
- if is_referer_url:
283
- # referer가 URL이라면,
284
- # 1) path 자체가 이미 절대 URL인지 확인
285
- parsed = urlparse(path)
286
- if parsed.scheme and parsed.netloc:
287
- # path가 이미 완전한 URL (예: http://, https:// 등)
288
- return path
289
- else:
290
- # 그렇지 않다면(슬래시로 시작 포함), urljoin을 써서 referer + path 로 합침
291
- return urljoin(referer, path)
292
- else:
293
- # referer가 로컬 경로라면,
294
- # path가 로컬 파일 시스템에서의 절대경로인지 판단
295
- if os.path.isabs(path):
296
- return path
297
- else:
298
- # 파일이면 referer의 디렉토리만 추출
299
- if not os.path.isdir(referer):
300
- referer_dir = os.path.dirname(referer)
301
- else:
302
- referer_dir = referer
303
-
304
- combined = os.path.join(referer_dir, path)
305
- return os.path.abspath(combined)
306
-
307
-
308
- # =======================
309
-
310
-
311
def _get_image_bytes(image_url: str, headers: dict[str, str], timeout: float = 30.0) -> Optional[bytes]:
    """
    Download ``image_url`` synchronously and return the response body.

    Args:
        image_url: URL to fetch.
        headers: request headers; every value is coerced with ``str``.
        timeout: seconds passed to ``requests`` as the request timeout. Without
            a timeout a stalled server would block the caller indefinitely.

    Returns:
        The body as ``bytes`` (possibly empty), or None when the response is
        not OK or any exception occurs. This is deliberately best-effort: a
        failed download just means "no image".
    """
    try:
        with requests.Session() as session:
            response = session.get(
                image_url,
                headers={name: str(value) for name, value in headers.items()},
                timeout=timeout,
            )
            if not response.ok:
                return None
            return bytes(response.content or b"")
    except Exception:
        return None
320
-
321
-
322
async def _aget_image_bytes(image_url: str, headers: dict[str, str]) -> Optional[bytes]:
    """
    Download ``image_url`` with aiohttp and return the response body.

    Returns None when the response is not OK or when any exception occurs
    (best-effort: a failed download just means "no image").
    """
    try:
        async with ClientSession() as session, session.get(
            image_url, headers={name: str(value) for name, value in headers.items()}
        ) as response:
            if response.ok:
                return await response.read()
            return None
    except Exception:
        return None
331
-
332
-
333
- # =======================
334
-
335
-
336
def _fetch_remote_image(url: str, headers: dict[str, str], config: ImageProcessingConfig) -> Optional[str]:
    """Fetch ``url`` (whitespace-stripped) and convert the bytes to a Base64 data URL, or None."""
    raw = _get_image_bytes(image_url=url.strip(), headers=headers)
    # Both a failed download (None) and an empty body yield None.
    return _convert_image_into_base64(raw, config) if raw else None
341
-
342
-
343
async def _afetch_remote_image(url: str, headers: dict[str, str], config: ImageProcessingConfig) -> Optional[str]:
    """Async variant: fetch ``url`` (whitespace-stripped) and convert it to a Base64 data URL, or None."""
    raw = await _aget_image_bytes(image_url=url.strip(), headers=headers)
    # Both a failed download (None) and an empty body yield None.
    return _convert_image_into_base64(raw, config) if raw else None
348
-
349
-
350
- # =======================
351
-
352
-
353
def _process_markdown_image(
    markdown_link: MarkdownLink, headers: dict[str, str], config: ImageProcessingConfig
) -> Optional[str]:
    """Process a single Markdown image link and return its Base64 data URL (synchronous)."""
    if markdown_link.type != "image":
        return None

    target: str = markdown_link.url
    # Already an inline data URL: pass through untouched.
    if target.startswith("data:image/"):
        return target
    if _is_url(target):
        return _fetch_remote_image(target, headers, config)
    # Anything else is treated as a local file path.
    return _process_local_image(Path(target), config)
365
-
366
-
367
async def _aprocess_markdown_image(
    markdown_link: MarkdownLink, headers: dict[str, str], config: ImageProcessingConfig
) -> Optional[str]:
    """Process a single Markdown image link and return its Base64 data URL (asynchronous)."""
    if markdown_link.type != "image":
        return None

    target: str = markdown_link.url
    # Already an inline data URL: pass through untouched.
    if target.startswith("data:image/"):
        return target
    if _is_url(target):
        return await _afetch_remote_image(target, headers, config)
    # Anything else is treated as a local file path (read synchronously).
    return _process_local_image(Path(target), config)
379
-
380
-
381
- # =======================
382
-
383
-
384
def get_image_url_and_markdown_links(
    markdown_text: str, headers: dict[str, str], config: ImageProcessingConfig
) -> dict[Optional[str], list[MarkdownLink]]:
    """
    Group the links of ``markdown_text`` by resolved image data.

    Plain links are collected under the ``None`` key. Image links are keyed by
    the Base64 data URL they resolve to; images that cannot be resolved are
    dropped. The ``Referer`` header, if present, is used to absolutize targets.
    """
    grouped: dict[Optional[str], list[MarkdownLink]] = {}
    links = MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer"))
    for link in links:
        if link.type == "link":
            key = None
        else:
            key = _process_markdown_image(link, headers, config)
            if not key:
                continue
        grouped.setdefault(key, []).append(link)
    return grouped
397
-
398
-
399
async def aget_image_url_and_markdown_links(
    markdown_text: str, headers: dict[str, str], config: ImageProcessingConfig
) -> dict[Optional[str], list[MarkdownLink]]:
    """
    Async variant: group the links of ``markdown_text`` by resolved image data.

    Plain links are collected under the ``None`` key. Image links are keyed by
    the Base64 data URL they resolve to; images that cannot be resolved are
    dropped. Images are awaited one at a time, in document order.
    """
    grouped: dict[Optional[str], list[MarkdownLink]] = {}
    links = MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer"))
    for link in links:
        if link.type == "link":
            key = None
        else:
            key = await _aprocess_markdown_image(link, headers, config)
            if not key:
                continue
        grouped.setdefault(key, []).append(link)
    return grouped
412
-
413
-
414
- # =======================
415
-
416
-
417
def _simple_base64_encode(image_data: bytes) -> Optional[str]:
    """
    Encode raw image bytes as a base64 data URL.

    The image subtype comes from the byte signature; None is returned when
    the signature is not recognised.
    """
    detected = _detect_image_type(image_data)
    if not detected:
        return None
    payload = b64encode(image_data).decode("utf-8")
    return "data:image/" + detected + ";base64," + payload
426
-
427
-
428
def _convert_image_into_base64(image_data: bytes, config: Optional[ImageProcessingConfig]) -> Optional[str]:
    """
    Turn raw image bytes into a base64 data URL, applying the rules in ``config``.

    Without a config the bytes are encoded as-is, based on their signature.
    With a config, the image is rejected (None, with a message printed) when it
    exceeds ``max_size_mb``, when its largest side is below
    ``min_largest_side``, or when its format is not in ``formats``. If the
    smallest side is at least ``resize_if_min_side_exceeds``, the image is
    scaled so that side becomes ``resize_target_for_min_side``. The image is
    then re-saved in its source format and encoded.

    Any exception while decoding or re-encoding is printed and yields None.
    """
    if not config:
        # No config: just encode based on the byte signature.
        return _simple_base64_encode(image_data)

    # 1) Size check.
    max_size_mb = config.get("max_size_mb", float("inf"))
    image_size_mb = len(image_data) / (1024 * 1024)
    if image_size_mb > max_size_mb:
        print(f"Image too large: {image_size_mb:.2f} MB > {max_size_mb} MB")
        return None

    # 2) Open with Pillow.
    try:
        with image_open(BytesIO(image_data)) as im:
            # Capture the format now: the image returned by resize() is not
            # read from a file, so its .format is None. Reading it after the
            # resize rejected every resized image as "Invalid format".
            # Pillow reports upper-case names (e.g. JPEG), hence lower().
            pil_format = (im.format or "").lower()

            w, h = im.size
            largest_side = max(w, h)
            smallest_side = min(w, h)

            min_largest_side = config.get("min_largest_side", 1)
            if largest_side < min_largest_side:
                print(f"Image too small: {largest_side} < {min_largest_side}")
                return None

            # Downscale so the smallest side hits the target, keeping aspect ratio.
            output_image = im
            resize_if_min_side_exceeds = config.get("resize_if_min_side_exceeds", float("inf"))
            if smallest_side >= resize_if_min_side_exceeds:
                resize_target = config.get("resize_target_for_min_side", 1000)
                ratio = resize_target / float(smallest_side)
                # Clamp to 1 px so extreme ratios cannot produce a zero-sized image.
                new_w = max(1, int(w * ratio))
                new_h = max(1, int(h * ratio))
                output_image = im.resize((new_w, new_h), Resampling.LANCZOS)

            # Format whitelist.
            allowed_formats = config.get("formats", [])
            if pil_format not in allowed_formats:
                print(f"Invalid format: {pil_format} not in {allowed_formats}")
                return None

            # Normalise JPG -> JPEG.
            if pil_format == "jpg":
                pil_format = "jpeg"

            # Serialise back to bytes; Pillow expects the upper-case format name.
            output_buffer = BytesIO()
            output_image.save(output_buffer, format=pil_format.upper())
            final_bytes = output_buffer.getvalue()

    except Exception:
        print_exc()
        return None

    # Final base64 encoding.
    encoded_data = b64encode(final_bytes).decode("utf-8")
    return f"data:image/{pil_format};base64,{encoded_data}"
494
-
495
-
496
- def _detect_image_type(image_data: bytes) -> Optional[str]:
497
- """
498
- Detect the image format based on the image binary signature (header).
499
- Only JPEG, PNG, GIF, WEBP, and BMP are handled as examples.
500
- If the format is not recognized, return None.
501
- """
502
- # JPEG: 시작 바이트가 FF D8 FF
503
- if image_data.startswith(b"\xff\xd8\xff"):
504
- return "jpeg"
505
- # PNG: 시작 바이트가 89 50 4E 47 0D 0A 1A 0A
506
- elif image_data.startswith(b"\x89PNG\r\n\x1a\n"):
507
- return "png"
508
- # GIF: 시작 바이트가 GIF87a 또는 GIF89a
509
- elif image_data.startswith(b"GIF87a") or image_data.startswith(b"GIF89a"):
510
- return "gif"
511
- # WEBP: 시작 바이트가 RIFF....WEBP
512
- elif image_data.startswith(b"RIFF") and image_data[8:12] == b"WEBP":
513
- return "webp"
514
- # BMP: 시작 바이트가 BM
515
- elif image_data.startswith(b"BM"):
516
- return "bmp"
517
-
518
-
519
- def _process_local_image(path: Path, config: ImageProcessingConfig) -> Optional[str]:
520
- """로컬 파일이 존재하고 유효한 이미지 포맷이면 Base64 데이터 URL을 반환, 아니면 None."""
521
- if not path.is_file():
522
- return None
523
- lowered_suffix = path.suffix.lower()
524
- if not lowered_suffix or (lowered_suffix_without_dot := lowered_suffix[1:]) not in config["formats"]:
525
- return None
526
- return f"data:image/{lowered_suffix_without_dot};base64,{path.read_bytes().hex()}"
527
-
528
-
529
def replace_images(
    markdown_text: str, image_description_and_references: ImageDescriptionAndReferences, description_format: str
) -> str:
    """
    Rewrite the links of ``markdown_text`` using the given descriptions.

    Links grouped under a ``None`` description are re-rendered via
    ``link_markdown``. Links grouped under a description are replaced by
    ``description_format`` formatted with ``image_summary`` (the description
    with newlines turned into spaces), ``inline_text`` and every MarkdownLink
    field.
    """
    substitutions: list[tuple[MarkdownLink, str]] = []
    for description, links in image_description_and_references.items():
        for link in links:
            if description is None:
                rendered = link.link_markdown
            else:
                rendered = description_format.format(
                    image_summary=description.replace("\n", " "),
                    inline_text=link.inline_text,
                    **link._asdict(),
                )
            substitutions.append((link, rendered))

    return MarkdownLink.replace(markdown_text, substitutions)
548
-
549
-
550
# Maps image data (a base64 data URL, or None for plain links) to the links that reference it.
ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
# Same shape, but keyed by an image description instead of image data.
ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
# Page-load milestones accepted as a "wait until" value.
WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]

# Default User-Agent string.
DEFAULT_UA: str = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
)
1
+ from __future__ import annotations
2
+
3
+ import os.path
4
+ import re
5
+ from pathlib import Path
6
+ from typing import (
7
+ ClassVar,
8
+ Literal,
9
+ NamedTuple,
10
+ NewType,
11
+ NotRequired,
12
+ Optional,
13
+ Self,
14
+ Sequence,
15
+ TypeAlias,
16
+ TypedDict,
17
+ TypeGuard,
18
+ cast,
19
+ )
20
+ from urllib.parse import urljoin, urlparse
21
+
22
+ import mistune
23
+ import playwright.sync_api
24
+ from pydantic import BaseModel, Field
25
+
26
+ from ...utils.image import Base64Image, ImageProcessingConfig
27
+
28
+
29
class SelectedLineRanges(BaseModel):
    """Pydantic model holding a list of inclusive line-range strings."""

    # Each entry is an inclusive "start-end" range string.
    line_ranges: list[str] = Field(
        description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']",
    )
31
+
32
+
33
class PlaywrightLaunchOptions(TypedDict):
    """Optional browser-launch settings; every key may be omitted.

    NOTE(review): the keys presumably mirror Playwright's browser launch
    keyword arguments -- confirm against the Playwright version in use.
    """

    # --- binary selection / command line ---
    executable_path: NotRequired[str | Path]
    channel: NotRequired[str]
    args: NotRequired[Sequence[str]]
    ignore_default_args: NotRequired[bool | Sequence[str]]
    # --- signal handling ---
    handle_sigint: NotRequired[bool]
    handle_sigterm: NotRequired[bool]
    handle_sighup: NotRequired[bool]
    # --- runtime behaviour ---
    timeout: NotRequired[float]
    env: NotRequired[dict[str, str | float | bool]]
    headless: NotRequired[bool]
    devtools: NotRequired[bool]
    proxy: NotRequired[playwright.sync_api.ProxySettings]
    downloads_path: NotRequired[str | Path]
    slow_mo: NotRequired[float]
    traces_dir: NotRequired[str | Path]
    # --- browser-specific ---
    chromium_sandbox: NotRequired[bool]
    firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
51
+
52
+
53
class PlaywrightPersistencyOptions(TypedDict):
    """Optional settings describing persisted browser state; both keys may be omitted."""

    # Directory holding a browser profile.
    user_data_dir: NotRequired[str | Path]
    # Playwright storage-state object.
    storage_state: NotRequired[playwright.sync_api.StorageState]
56
+
57
+
58
class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions):
    """Union of the launch options and the persistency options; adds no keys of its own."""
59
+
60
+
61
def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
    """Return a new options dict containing only ``headless=True``."""
    defaults: PlaywrightLaunchOptions = {"headless": True}
    return defaults
63
+
64
+
65
class _TrackingInlineState(mistune.InlineState):
    """Inline state that additionally carries ``meta_offset``."""

    # Where in the original text does self.src start?
    meta_offset: int = 0

    def copy(self) -> Self:
        """Return a new state with the same env, src, flags and offset but no tokens."""
        clone = type(self)(self.env)
        clone.tokens = []
        for attr in ("src", "in_image", "in_link", "in_emphasis", "in_strong", "meta_offset"):
            setattr(clone, attr, getattr(self, attr))
        return clone
78
+
79
+
80
+ class MarkdownLink(NamedTuple):
81
+ type: Literal["link", "image"]
82
+ url: str
83
+ text: str
84
+ title: Optional[str]
85
+ pos: int
86
+ end_pos: int
87
+
88
+ @classmethod
89
+ def from_markdown(cls, markdown_text: str, referer_url: Optional[str]) -> list[Self]:
90
+ """
91
+ The main function that returns the list of MarkdownLink for the input text.
92
+ For simplicity, we do a "pure inline parse" of the entire text
93
+ instead of letting the block parser break it up. That ensures that
94
+ link tokens cover the global positions of the entire input.
95
+ """
96
+ md = mistune.Markdown(inline=_TrackingInlineParser())
97
+ # Create an inline state that references the full text.
98
+ state = _TrackingInlineState({})
99
+ state.src = markdown_text
100
+
101
+ # Instead of calling md.parse, we can directly run the inline parser on
102
+ # the entire text, so that positions match the entire input:
103
+ md.inline.parse(state)
104
+
105
+ # Now gather all the link info from the tokens.
106
+ return cls._extract_links(tokens=state.tokens, referer_url=referer_url)
107
+
108
+ @property
109
+ def inline_text(self) -> str:
110
+ return self.text.replace("\n", " ").strip()
111
+
112
+ @property
113
+ def inline_title(self) -> str:
114
+ return self.title.replace("\n", " ").strip() if self.title else ""
115
+
116
+ @property
117
+ def link_markdown(self) -> str:
118
+ if self.title:
119
+ return f'[{self.inline_text}]({self.url} "{self.inline_title}")'
120
+ return f"[{self.inline_text}]({self.url})"
121
+
122
+ @classmethod
123
+ def replace(cls, text: str, replacements: list[tuple[Self, str]]) -> str:
124
+ for self, replacement in sorted(replacements, key=lambda x: x[0].pos, reverse=True):
125
+ text = text[: self.pos] + replacement + text[self.end_pos :]
126
+ return text
127
+
128
+ @classmethod
129
+ def _extract_links(cls, tokens: list[dict[str, object]], referer_url: Optional[str]) -> list[Self]:
130
+ results: list[Self] = []
131
+ for token in tokens:
132
+ if (
133
+ (type := token.get("type")) in ("link", "image")
134
+ and "global_pos" in token
135
+ and "attrs" in token
136
+ and _attrs_typeguard(attrs := token["attrs"])
137
+ and "url" in attrs
138
+ and _url_typeguard(url := attrs["url"])
139
+ and _global_pos_typeguard(global_pos := token["global_pos"])
140
+ ):
141
+ if referer_url:
142
+ url = _to_absolute_path(path=url, referer=referer_url)
143
+ children: object | None = token.get("children")
144
+ if _children_typeguard(children):
145
+ text = _extract_text(children)
146
+ else:
147
+ text = ""
148
+
149
+ if "title" in attrs:
150
+ title = str(attrs["title"])
151
+ else:
152
+ title = None
153
+
154
+ start, end = global_pos
155
+ results.append(cls(type, url, text, title, start, end))
156
+ if "children" in token and _children_typeguard(children := token["children"]):
157
+ results.extend(cls._extract_links(children, referer_url))
158
+
159
+ return results
160
+
161
+
162
class _TrackingInlineParser(mistune.InlineParser):
    """Inline parser that tags each parsed link token with a ``global_pos`` span."""

    state_cls: ClassVar = _TrackingInlineState

    def parse_link(  # pyright: ignore[reportIncompatibleMethodOverride]
        self, m: re.Match[str], state: _TrackingInlineState
    ) -> Optional[int]:
        """
        Delegate to mistune's ``parse_link`` and record where the link sits.

        The parent returns either None or an int (the new position within the
        state's source). Only on an int result, and only if the state holds at
        least one token, is the last token annotated with
        ``(offset + m.start(), offset + new_pos)``, where ``offset`` is
        ``state.meta_offset`` as read before delegating.
        """
        base = state.meta_offset
        new_pos: int | None = super().parse_link(m, state)
        if new_pos is None or not state.tokens:
            return new_pos
        # The link token that was just added is expected to be the last one.
        state.tokens[-1]["global_pos"] = (base + m.start(), base + new_pos)
        return new_pos
188
+
189
+
190
+ # --------------------------------------------------------------------
191
+ # Type Guards & Helper to gather plain text from nested tokens (for the link text).
192
+ # --------------------------------------------------------------------
193
+ def _children_typeguard(obj: object) -> TypeGuard[list[dict[str, object]]]:
194
+ if not isinstance(obj, list):
195
+ return False
196
+ return all(isinstance(i, dict) for i in cast(list[object], obj))
197
+
198
+
199
+ def _attrs_typeguard(obj: object) -> TypeGuard[dict[str, object]]:
200
+ if not isinstance(obj, dict):
201
+ return False
202
+ return all(isinstance(k, str) for k in cast(dict[object, object], obj))
203
+
204
+
205
+ def _global_pos_typeguard(obj: object) -> TypeGuard[tuple[int, int]]:
206
+ if not isinstance(obj, tuple):
207
+ return False
208
+ obj = cast(tuple[object, ...], obj)
209
+ if len(obj) != 2:
210
+ return False
211
+ return all(isinstance(i, int) for i in obj)
212
+
213
+
214
+ def _url_typeguard(obj: object) -> TypeGuard[str]:
215
+ return isinstance(obj, str)
216
+
217
+
218
def _extract_text(tokens: list[dict[str, object]]) -> str:
    """Concatenate the ``raw`` value of every ``text`` token, descending into ``children``."""

    def _pieces():
        for token in tokens:
            if token.get("type") == "text":
                yield str(token.get("raw", ""))
                continue
            # A missing or malformed "children" entry is skipped.
            nested = token.get("children")
            if _children_typeguard(nested):
                yield _extract_text(nested)

    return "".join(_pieces())
229
+
230
+
231
+ def _to_absolute_path(path: str, referer: str) -> str:
232
+ """
233
+ path : 변환할 경로(상대/절대 경로 혹은 URL일 수도 있음)
234
+ referer : 기준이 되는 절대경로(혹은 URL)
235
+ """
236
+ # referer가 URL인지 파일 경로인지 먼저 판별
237
+ ref_parsed = urlparse(referer)
238
+ is_referer_url = bool(ref_parsed.scheme and ref_parsed.netloc)
239
+
240
+ if is_referer_url:
241
+ # referer가 URL이라면,
242
+ # 1) path 자체가 이미 절대 URL인지 확인
243
+ parsed = urlparse(path)
244
+ if parsed.scheme and parsed.netloc:
245
+ # path가 이미 완전한 URL (예: http://, https:// 등)
246
+ return path
247
+ else:
248
+ # 그렇지 않다면(슬래시로 시작 포함), urljoin을 써서 referer + path 로 합침
249
+ return urljoin(referer, path)
250
+ else:
251
+ # referer가 로컬 경로라면,
252
+ # path가 로컬 파일 시스템에서의 절대경로인지 판단
253
+ if os.path.isabs(path):
254
+ return path
255
+ else:
256
+ # 파일이면 referer의 디렉토리만 추출
257
+ if not os.path.isdir(referer):
258
+ referer_dir = os.path.dirname(referer)
259
+ else:
260
+ referer_dir = referer
261
+
262
+ combined = os.path.join(referer_dir, path)
263
+ return os.path.abspath(combined)
264
+
265
+
266
+ # =======================
267
+
268
+
269
def get_image_url_and_markdown_links(
    markdown_text: str, headers: dict[str, str], config: ImageProcessingConfig
) -> dict[Optional[Base64Image], list[MarkdownLink]]:
    """
    Group the links of ``markdown_text`` by resolved image.

    Plain links are collected under the ``None`` key. Image links are keyed by
    the ``Base64Image`` obtained from ``Base64Image.from_url_or_path``; images
    for which that call returns a falsy value are dropped. The ``Referer``
    header, if present, is used to absolutize link targets.
    """
    grouped: dict[Optional[Base64Image], list[MarkdownLink]] = {}
    links = MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer"))
    for link in links:
        if link.type == "link":
            key = None
        else:
            key = Base64Image.from_url_or_path(link.url, headers=headers, config=config)
            if not key:
                continue
        grouped.setdefault(key, []).append(link)
    return grouped
283
+
284
+
285
async def aget_image_url_and_markdown_links(
    markdown_text: str, headers: dict[str, str], config: ImageProcessingConfig
) -> dict[Optional[Base64Image], list[MarkdownLink]]:
    """
    Async variant: group the links of ``markdown_text`` by resolved image.

    Plain links are collected under the ``None`` key. Image links are keyed by
    the awaited result of ``Base64Image.from_url_or_path(..., return_coro=True)``;
    falsy results are dropped. Images are awaited one at a time, in document order.
    """
    grouped: dict[Optional[Base64Image], list[MarkdownLink]] = {}
    links = MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer"))
    for link in links:
        if link.type == "link":
            key = None
        else:
            key = await Base64Image.from_url_or_path(
                link.url, headers=headers, config=config, return_coro=True
            )
            if not key:
                continue
        grouped.setdefault(key, []).append(link)
    return grouped
300
+
301
+
302
def replace_images(
    markdown_text: str, image_description_and_references: ImageDescriptionAndReferences, description_format: str
) -> str:
    """
    Rewrite the links of ``markdown_text`` using the given descriptions.

    Links grouped under a ``None`` description are re-rendered via
    ``link_markdown``. Links grouped under a description are replaced by
    ``description_format`` formatted with ``image_summary`` (the description
    with newlines turned into spaces), ``inline_text`` and every MarkdownLink
    field.
    """
    substitutions: list[tuple[MarkdownLink, str]] = []
    for description, links in image_description_and_references.items():
        for link in links:
            if description is None:
                rendered = link.link_markdown
            else:
                rendered = description_format.format(
                    image_summary=description.replace("\n", " "),
                    inline_text=link.inline_text,
                    **link._asdict(),
                )
            substitutions.append((link, rendered))

    return MarkdownLink.replace(markdown_text, substitutions)
321
+
322
+
323
# NOTE(review): this alias is still keyed by Optional[str], while
# get_image_url_and_markdown_links / aget_image_url_and_markdown_links in this
# module return dicts keyed by Optional[Base64Image] -- confirm whether the
# alias should follow, keeping a str-keyed base for the description type below.
ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
# Same shape, keyed by an image description (None for plain links).
ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
# Page-load milestones accepted as a "wait until" value.
WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]

# Default User-Agent string.
DEFAULT_UA: str = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
)