chatterer 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +93 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/examples/anything_to_markdown.py +91 -0
- chatterer/examples/get_code_snippets.py +62 -0
- chatterer/examples/login_with_playwright.py +167 -0
- chatterer/examples/make_ppt.py +497 -0
- chatterer/examples/pdf_to_markdown.py +107 -0
- chatterer/examples/pdf_to_text.py +56 -0
- chatterer/examples/transcription_api.py +123 -0
- chatterer/examples/upstage_parser.py +100 -0
- chatterer/examples/webpage_to_markdown.py +79 -0
- chatterer/interactive.py +354 -692
- chatterer/language_model.py +533 -533
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +302 -302
- chatterer/tools/convert_to_text.py +447 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.16.dist-info → chatterer-0.1.18.dist-info}/METADATA +392 -392
- chatterer-0.1.18.dist-info/RECORD +42 -0
- {chatterer-0.1.16.dist-info → chatterer-0.1.18.dist-info}/WHEEL +1 -1
- chatterer-0.1.16.dist-info/RECORD +0 -33
- {chatterer-0.1.16.dist-info → chatterer-0.1.18.dist-info}/top_level.txt +0 -0
chatterer/tools/caption_markdown_images.py
@@ -1,384 +1,384 @@
-import os.path
-import re
-from asyncio import gather
-from traceback import format_exception_only, print_exc
-from typing import (
-    Awaitable,
-    Callable,
-    ClassVar,
-    Literal,
-    NamedTuple,
-    NewType,
-    Optional,
-    Self,
-    TypeGuard,
-    cast,
-)
-from urllib.parse import urljoin, urlparse
-
-from chatterer.language_model import Chatterer
-
-from ..utils.base64_image import Base64Image, ImageProcessingConfig
-
-
-class MarkdownLink(NamedTuple):
-    type: Literal["link", "image"]
-    url: str
-    text: str
-    title: Optional[str]
-    pos: int
-    end_pos: int
-
-    @classmethod
-    def from_markdown(cls, markdown_text: str, referer_url: Optional[str]) -> list[Self]:
-        """
-        The main function that returns the list of MarkdownLink for the input text.
-        For simplicity, we do a "pure inline parse" of the entire text
-        instead of letting the block parser break it up. That ensures that
-        link tokens cover the global positions of the entire input.
-        """
-
-        from mistune import InlineParser, InlineState, Markdown
-
-        class _TrackingInlineState(InlineState):
-            meta_offset: int = 0  # Where in the original text does self.src start?
-
-            def copy(self) -> Self:
-                new_state = self.__class__(self.env)
-                new_state.src = self.src
-                new_state.tokens = []
-                new_state.in_image = self.in_image
-                new_state.in_link = self.in_link
-                new_state.in_emphasis = self.in_emphasis
-                new_state.in_strong = self.in_strong
-                new_state.meta_offset = self.meta_offset
-                return new_state
-
-        class _TrackingInlineParser(InlineParser):
-            state_cls: ClassVar = _TrackingInlineState
-
-            def parse_link(  # pyright: ignore[reportIncompatibleMethodOverride]
-                self, m: re.Match[str], state: _TrackingInlineState
-            ) -> Optional[int]:
-                """
-                Mistune calls parse_link with a match object for the link syntax
-                and the current inline state. If we successfully parse the link,
-                super().parse_link(...) returns the new position *within self.src*.
-                We add that to state.meta_offset for the global position.
-
-                Because parse_link in mistune might return None or an int, we only
-                record positions if we get an int back (meaning success).
-                """
-                offset = state.meta_offset
-                new_pos: int | None = super().parse_link(m, state)
-                if new_pos is not None:
-                    # We have successfully parsed a link.
-                    # The link token we just added should be the last token in state.tokens:
-                    if state.tokens:
-                        token = state.tokens[-1]
-                        # The local end is new_pos in the substring.
-                        # So the global start/end in the *original* text is offset + local positions.
-                        token["global_pos"] = (offset + m.start(), offset + new_pos)
-                return new_pos
-
-        md = Markdown(inline=_TrackingInlineParser())
-        # Create an inline state that references the full text.
-        state = _TrackingInlineState({})
-        state.src = markdown_text
-
-        # Instead of calling md.parse, we can directly run the inline parser on
-        # the entire text, so that positions match the entire input:
-        md.inline.parse(state)
-
-        # Now gather all the link info from the tokens.
-        return cls._extract_links(tokens=state.tokens, referer_url=referer_url)
-
-    @property
-    def inline_text(self) -> str:
-        return self.text.replace("\n", " ").strip()
-
-    @property
-    def inline_title(self) -> str:
-        return self.title.replace("\n", " ").strip() if self.title else ""
-
-    @property
-    def link_markdown(self) -> str:
-        if self.title:
-            return f'[{self.inline_text}]({self.url} "{self.inline_title}")'
-        return f"[{self.inline_text}]({self.url})"
-
-    @classmethod
-    def replace(cls, text: str, replacements: list[tuple[Self, str]]) -> str:
-        for self, replacement in sorted(replacements, key=lambda x: x[0].pos, reverse=True):
-            text = text[: self.pos] + replacement + text[self.end_pos :]
-        return text
-
-    @classmethod
-    def _extract_links(cls, tokens: list[dict[str, object]], referer_url: Optional[str]) -> list[Self]:
-        results: list[Self] = []
-        for token in tokens:
-            if (
-                (type := token.get("type")) in ("link", "image")
-                and "global_pos" in token
-                and "attrs" in token
-                and _attrs_typeguard(attrs := token["attrs"])
-                and "url" in attrs
-                and _url_typeguard(url := attrs["url"])
-                and _global_pos_typeguard(global_pos := token["global_pos"])
-            ):
-                if referer_url:
-                    url = _to_absolute_path(path=url, referer=referer_url)
-                children: object | None = token.get("children")
-                if _children_typeguard(children):
-                    text = _extract_text(children)
-                else:
-                    text = ""
-
-                if "title" in attrs:
-                    title = str(attrs["title"])
-                else:
-                    title = None
-
-                start, end = global_pos
-                results.append(cls(type, url, text, title, start, end))
-            if "children" in token and _children_typeguard(children := token["children"]):
-                results.extend(cls._extract_links(children, referer_url))
-        return results
-
-
-ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
-ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
-
-
-def caption_markdown_images(
-    markdown_text: str,
-    headers: dict[str, str],
-    image_processing_config: ImageProcessingConfig,
-    description_format: str,
-    image_description_instruction: str,
-    chatterer: Chatterer,
-    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
-) -> str:
-    """
-    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
-    """
-    image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = _get_image_url_and_markdown_links(
-        markdown_text=markdown_text,
-        headers=headers,
-        config=image_processing_config,
-        img_bytes_fetcher=img_bytes_fetcher,
-    )
-
-    image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
-    for image_url, markdown_links in image_url_and_markdown_links.items():
-        if image_url is not None:
-            try:
-                image_summary: str = chatterer.describe_image(
-                    image_url=image_url.data_uri,
-                    instruction=image_description_instruction,
-                )
-            except Exception:
-                print_exc()
-                continue
-            image_description_and_references[image_summary] = markdown_links
-        else:
-            image_description_and_references[None] = markdown_links
-
-    return _replace_images(
-        markdown_text=markdown_text,
-        image_description_and_references=image_description_and_references,
-        description_format=description_format,
-    )
-
-
-async def acaption_markdown_images(
-    markdown_text: str,
-    headers: dict[str, str],
-    image_processing_config: ImageProcessingConfig,
-    description_format: str,
-    image_description_instruction: str,
-    chatterer: Chatterer,
-    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
-) -> str:
-    """
-    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
-    """
-    image_url_and_markdown_links: dict[
-        Optional[Base64Image], list[MarkdownLink]
-    ] = await _aget_image_url_and_markdown_links(
-        markdown_text=markdown_text,
-        headers=headers,
-        config=image_processing_config,
-        img_bytes_fetcher=img_bytes_fetcher,
-    )
-
-    async def dummy() -> None:
-        pass
-
-    def _handle_exception(e: Optional[str | BaseException]) -> TypeGuard[Optional[str]]:
-        if isinstance(e, BaseException):
-            print(format_exception_only(type(e), e))
-            return False
-        return True
-
-    coros: list[Awaitable[Optional[str]]] = [
-        chatterer.adescribe_image(image_url=image_url.data_uri, instruction=image_description_instruction)
-        if image_url is not None
-        else dummy()
-        for image_url in image_url_and_markdown_links.keys()
-    ]
-
-    return _replace_images(
-        markdown_text=markdown_text,
-        image_description_and_references=ImageDescriptionAndReferences({
-            image_summary: markdown_links
-            for markdown_links, image_summary in zip(
-                image_url_and_markdown_links.values(), await gather(*coros, return_exceptions=True)
-            )
-            if _handle_exception(image_summary)
-        }),
-        description_format=description_format,
-    )
-
-
-# --------------------------------------------------------------------
-# Type Guards & Helper to gather plain text from nested tokens (for the link text).
-# --------------------------------------------------------------------
-def _children_typeguard(obj: object) -> TypeGuard[list[dict[str, object]]]:
-    if not isinstance(obj, list):
-        return False
-    return all(isinstance(i, dict) for i in cast(list[object], obj))
-
-
-def _attrs_typeguard(obj: object) -> TypeGuard[dict[str, object]]:
-    if not isinstance(obj, dict):
-        return False
-    return all(isinstance(k, str) for k in cast(dict[object, object], obj))
-
-
-def _global_pos_typeguard(obj: object) -> TypeGuard[tuple[int, int]]:
-    if not isinstance(obj, tuple):
-        return False
-    obj = cast(tuple[object, ...], obj)
-    if len(obj) != 2:
-        return False
-    return all(isinstance(i, int) for i in obj)
-
-
-def _url_typeguard(obj: object) -> TypeGuard[str]:
-    return isinstance(obj, str)
-
-
-def _extract_text(tokens: list[dict[str, object]]) -> str:
-    parts: list[str] = []
-    for t in tokens:
-        if t.get("type") == "text":
-            parts.append(str(t.get("raw", "")))
-        elif "children" in t:
-            children: object = t["children"]
-            if not _children_typeguard(children):
-                continue
-            parts.append(_extract_text(children))
-    return "".join(parts)
-
-
-def _to_absolute_path(path: str, referer: str) -> str:
-    """
-    path : the path to convert (may be a relative/absolute path or a URL)
-    referer : the absolute base path (or URL) used as the reference point
-    """
-    # First determine whether referer is a URL or a file path
-    ref_parsed = urlparse(referer)
-    is_referer_url = bool(ref_parsed.scheme and ref_parsed.netloc)
-
-    if is_referer_url:
-        # If referer is a URL,
-        # 1) check whether path itself is already an absolute URL
-        parsed = urlparse(path)
-        if parsed.scheme and parsed.netloc:
-            # path is already a complete URL (e.g. http://, https://, etc.)
-            return path
-        else:
-            # Otherwise (including paths starting with a slash), combine referer + path with urljoin
-            return urljoin(referer, path)
-    else:
-        # If referer is a local path,
-        # check whether path is an absolute path on the local file system
-        if os.path.isabs(path):
-            return path
-        else:
-            # If referer is a file, take only its directory
-            if not os.path.isdir(referer):
-                referer_dir = os.path.dirname(referer)
-            else:
-                referer_dir = referer
-
-            combined = os.path.join(referer_dir, path)
-            return os.path.abspath(combined)
-
-
-def _get_image_url_and_markdown_links(
-    markdown_text: str,
-    headers: dict[str, str],
-    config: ImageProcessingConfig,
-    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
-) -> dict[Optional[Base64Image], list[MarkdownLink]]:
-    image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
-    for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
-        if markdown_link.type == "link":
-            image_matches.setdefault(None, []).append(markdown_link)
-            continue
-
-        image_data = Base64Image.from_url_or_path(
-            markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
-        )
-        if not image_data:
-            image_matches.setdefault(None, []).append(markdown_link)
-            continue
-        image_matches.setdefault(image_data, []).append(markdown_link)
-    return image_matches
-
-
-async def _aget_image_url_and_markdown_links(
-    markdown_text: str,
-    headers: dict[str, str],
-    config: ImageProcessingConfig,
-    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
-) -> dict[Optional[Base64Image], list[MarkdownLink]]:
-    image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
-    for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
-        if markdown_link.type == "link":
-            image_matches.setdefault(None, []).append(markdown_link)
-            continue
-        image_data = await Base64Image.afrom_url_or_path(
-            markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
-        )
-        if not image_data:
-            image_matches.setdefault(None, []).append(markdown_link)
-            continue
-        image_matches.setdefault(image_data, []).append(markdown_link)
-    return image_matches
-
-
-def _replace_images(
-    markdown_text: str, image_description_and_references: ImageDescriptionAndReferences, description_format: str
-) -> str:
-    replacements: list[tuple[MarkdownLink, str]] = []
-    for image_description, markdown_links in image_description_and_references.items():
-        for markdown_link in markdown_links:
-            if image_description is None:
-                if markdown_link.type == "link":
-                    replacements.append((markdown_link, markdown_link.link_markdown))
-                elif markdown_link.type == "image":
-                    replacements.append((markdown_link, f""))
-            else:
-                replacements.append((
-                    markdown_link,
-                    description_format.format(
-                        image_summary=image_description.replace("\n", " "),
-                        inline_text=markdown_link.inline_text,
-                        **markdown_link._asdict(),
-                    ),
-                ))
-
-    return MarkdownLink.replace(markdown_text, replacements)
+import os.path
+import re
+from asyncio import gather
+from traceback import format_exception_only, print_exc
+from typing import (
+    Awaitable,
+    Callable,
+    ClassVar,
+    Literal,
+    NamedTuple,
+    NewType,
+    Optional,
+    Self,
+    TypeGuard,
+    cast,
+)
+from urllib.parse import urljoin, urlparse
+
+from chatterer.language_model import Chatterer
+
+from ..utils.base64_image import Base64Image, ImageProcessingConfig
+
+
+class MarkdownLink(NamedTuple):
+    type: Literal["link", "image"]
+    url: str
+    text: str
+    title: Optional[str]
+    pos: int
+    end_pos: int
+
+    @classmethod
+    def from_markdown(cls, markdown_text: str, referer_url: Optional[str]) -> list[Self]:
+        """
+        The main function that returns the list of MarkdownLink for the input text.
+        For simplicity, we do a "pure inline parse" of the entire text
+        instead of letting the block parser break it up. That ensures that
+        link tokens cover the global positions of the entire input.
+        """
+
+        from mistune import InlineParser, InlineState, Markdown
+
+        class _TrackingInlineState(InlineState):
+            meta_offset: int = 0  # Where in the original text does self.src start?
+
+            def copy(self) -> Self:
+                new_state = self.__class__(self.env)
+                new_state.src = self.src
+                new_state.tokens = []
+                new_state.in_image = self.in_image
+                new_state.in_link = self.in_link
+                new_state.in_emphasis = self.in_emphasis
+                new_state.in_strong = self.in_strong
+                new_state.meta_offset = self.meta_offset
+                return new_state
+
+        class _TrackingInlineParser(InlineParser):
+            state_cls: ClassVar = _TrackingInlineState
+
+            def parse_link(  # pyright: ignore[reportIncompatibleMethodOverride]
+                self, m: re.Match[str], state: _TrackingInlineState
+            ) -> Optional[int]:
+                """
+                Mistune calls parse_link with a match object for the link syntax
+                and the current inline state. If we successfully parse the link,
+                super().parse_link(...) returns the new position *within self.src*.
+                We add that to state.meta_offset for the global position.
+
+                Because parse_link in mistune might return None or an int, we only
+                record positions if we get an int back (meaning success).
+                """
+                offset = state.meta_offset
+                new_pos: int | None = super().parse_link(m, state)
+                if new_pos is not None:
+                    # We have successfully parsed a link.
+                    # The link token we just added should be the last token in state.tokens:
+                    if state.tokens:
+                        token = state.tokens[-1]
+                        # The local end is new_pos in the substring.
+                        # So the global start/end in the *original* text is offset + local positions.
+                        token["global_pos"] = (offset + m.start(), offset + new_pos)
+                return new_pos
+
+        md = Markdown(inline=_TrackingInlineParser())
+        # Create an inline state that references the full text.
+        state = _TrackingInlineState({})
+        state.src = markdown_text
+
+        # Instead of calling md.parse, we can directly run the inline parser on
+        # the entire text, so that positions match the entire input:
+        md.inline.parse(state)
+
+        # Now gather all the link info from the tokens.
+        return cls._extract_links(tokens=state.tokens, referer_url=referer_url)
+
+    @property
+    def inline_text(self) -> str:
+        return self.text.replace("\n", " ").strip()
+
+    @property
+    def inline_title(self) -> str:
+        return self.title.replace("\n", " ").strip() if self.title else ""
+
+    @property
+    def link_markdown(self) -> str:
+        if self.title:
+            return f'[{self.inline_text}]({self.url} "{self.inline_title}")'
+        return f"[{self.inline_text}]({self.url})"
+
+    @classmethod
+    def replace(cls, text: str, replacements: list[tuple[Self, str]]) -> str:
+        for self, replacement in sorted(replacements, key=lambda x: x[0].pos, reverse=True):
+            text = text[: self.pos] + replacement + text[self.end_pos :]
+        return text
+
+    @classmethod
+    def _extract_links(cls, tokens: list[dict[str, object]], referer_url: Optional[str]) -> list[Self]:
+        results: list[Self] = []
+        for token in tokens:
+            if (
+                (type := token.get("type")) in ("link", "image")
+                and "global_pos" in token
+                and "attrs" in token
+                and _attrs_typeguard(attrs := token["attrs"])
+                and "url" in attrs
+                and _url_typeguard(url := attrs["url"])
+                and _global_pos_typeguard(global_pos := token["global_pos"])
+            ):
+                if referer_url:
+                    url = _to_absolute_path(path=url, referer=referer_url)
+                children: object | None = token.get("children")
+                if _children_typeguard(children):
+                    text = _extract_text(children)
+                else:
+                    text = ""
+
+                if "title" in attrs:
+                    title = str(attrs["title"])
+                else:
+                    title = None
+
+                start, end = global_pos
+                results.append(cls(type, url, text, title, start, end))
+            if "children" in token and _children_typeguard(children := token["children"]):
+                results.extend(cls._extract_links(children, referer_url))
+        return results
+
+
+ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
+ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
+
+
+def caption_markdown_images(
+    markdown_text: str,
+    headers: dict[str, str],
+    image_processing_config: ImageProcessingConfig,
+    description_format: str,
+    image_description_instruction: str,
+    chatterer: Chatterer,
+    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
+) -> str:
+    """
+    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
+    """
+    image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = _get_image_url_and_markdown_links(
+        markdown_text=markdown_text,
+        headers=headers,
+        config=image_processing_config,
+        img_bytes_fetcher=img_bytes_fetcher,
+    )
+
+    image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
+    for image_url, markdown_links in image_url_and_markdown_links.items():
+        if image_url is not None:
+            try:
+                image_summary: str = chatterer.describe_image(
+                    image_url=image_url.data_uri,
+                    instruction=image_description_instruction,
+                )
+            except Exception:
+                print_exc()
+                continue
+            image_description_and_references[image_summary] = markdown_links
+        else:
+            image_description_and_references[None] = markdown_links
+
+    return _replace_images(
+        markdown_text=markdown_text,
+        image_description_and_references=image_description_and_references,
+        description_format=description_format,
+    )
+
+
+async def acaption_markdown_images(
+    markdown_text: str,
+    headers: dict[str, str],
+    image_processing_config: ImageProcessingConfig,
+    description_format: str,
+    image_description_instruction: str,
+    chatterer: Chatterer,
+    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
+) -> str:
+    """
+    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
+    """
+    image_url_and_markdown_links: dict[
+        Optional[Base64Image], list[MarkdownLink]
+    ] = await _aget_image_url_and_markdown_links(
+        markdown_text=markdown_text,
+        headers=headers,
+        config=image_processing_config,
+        img_bytes_fetcher=img_bytes_fetcher,
+    )
+
+    async def dummy() -> None:
+        pass
+
+    def _handle_exception(e: Optional[str | BaseException]) -> TypeGuard[Optional[str]]:
+        if isinstance(e, BaseException):
+            print(format_exception_only(type(e), e))
+            return False
+        return True
+
+    coros: list[Awaitable[Optional[str]]] = [
+        chatterer.adescribe_image(image_url=image_url.data_uri, instruction=image_description_instruction)
+        if image_url is not None
+        else dummy()
+        for image_url in image_url_and_markdown_links.keys()
+    ]
+
+    return _replace_images(
+        markdown_text=markdown_text,
+        image_description_and_references=ImageDescriptionAndReferences({
+            image_summary: markdown_links
+            for markdown_links, image_summary in zip(
+                image_url_and_markdown_links.values(), await gather(*coros, return_exceptions=True)
+            )
+            if _handle_exception(image_summary)
+        }),
+        description_format=description_format,
+    )
+
+
+# --------------------------------------------------------------------
+# Type Guards & Helper to gather plain text from nested tokens (for the link text).
+# --------------------------------------------------------------------
+def _children_typeguard(obj: object) -> TypeGuard[list[dict[str, object]]]:
+    if not isinstance(obj, list):
+        return False
+    return all(isinstance(i, dict) for i in cast(list[object], obj))
+
+
+def _attrs_typeguard(obj: object) -> TypeGuard[dict[str, object]]:
+    if not isinstance(obj, dict):
+        return False
+    return all(isinstance(k, str) for k in cast(dict[object, object], obj))
+
+
+def _global_pos_typeguard(obj: object) -> TypeGuard[tuple[int, int]]:
+    if not isinstance(obj, tuple):
+        return False
+    obj = cast(tuple[object, ...], obj)
+    if len(obj) != 2:
+        return False
+    return all(isinstance(i, int) for i in obj)
+
+
+def _url_typeguard(obj: object) -> TypeGuard[str]:
+    return isinstance(obj, str)
+
+
+def _extract_text(tokens: list[dict[str, object]]) -> str:
+    parts: list[str] = []
+    for t in tokens:
+        if t.get("type") == "text":
+            parts.append(str(t.get("raw", "")))
+        elif "children" in t:
+            children: object = t["children"]
+            if not _children_typeguard(children):
+                continue
+            parts.append(_extract_text(children))
+    return "".join(parts)
+
+
+def _to_absolute_path(path: str, referer: str) -> str:
+    """
+    path : the path to convert (may be a relative/absolute path or a URL)
+    referer : the absolute base path (or URL) used as the reference point
+    """
+    # First determine whether referer is a URL or a file path
+    ref_parsed = urlparse(referer)
+    is_referer_url = bool(ref_parsed.scheme and ref_parsed.netloc)
+
+    if is_referer_url:
+        # If referer is a URL,
+        # 1) check whether path itself is already an absolute URL
+        parsed = urlparse(path)
+        if parsed.scheme and parsed.netloc:
+            # path is already a complete URL (e.g. http://, https://, etc.)
+            return path
+        else:
+            # Otherwise (including paths starting with a slash), combine referer + path with urljoin
+            return urljoin(referer, path)
+    else:
+        # If referer is a local path,
+        # check whether path is an absolute path on the local file system
+        if os.path.isabs(path):
+            return path
+        else:
+            # If referer is a file, take only its directory
+            if not os.path.isdir(referer):
+                referer_dir = os.path.dirname(referer)
+            else:
+                referer_dir = referer
+
+            combined = os.path.join(referer_dir, path)
+            return os.path.abspath(combined)
+
+
+def _get_image_url_and_markdown_links(
+    markdown_text: str,
+    headers: dict[str, str],
+    config: ImageProcessingConfig,
+    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
+) -> dict[Optional[Base64Image], list[MarkdownLink]]:
+    image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
+    for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
+        if markdown_link.type == "link":
+            image_matches.setdefault(None, []).append(markdown_link)
+            continue
+
+        image_data = Base64Image.from_url_or_path(
+            markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
+        )
+        if not image_data:
+            image_matches.setdefault(None, []).append(markdown_link)
+            continue
+        image_matches.setdefault(image_data, []).append(markdown_link)
+    return image_matches
+
+
+async def _aget_image_url_and_markdown_links(
+    markdown_text: str,
+    headers: dict[str, str],
+    config: ImageProcessingConfig,
+    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
+) -> dict[Optional[Base64Image], list[MarkdownLink]]:
+    image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
+    for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
+        if markdown_link.type == "link":
+            image_matches.setdefault(None, []).append(markdown_link)
+            continue
+        image_data = await Base64Image.afrom_url_or_path(
+            markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
+        )
+        if not image_data:
+            image_matches.setdefault(None, []).append(markdown_link)
+            continue
+        image_matches.setdefault(image_data, []).append(markdown_link)
+    return image_matches
+
+
+def _replace_images(
+    markdown_text: str, image_description_and_references: ImageDescriptionAndReferences, description_format: str
+) -> str:
+    replacements: list[tuple[MarkdownLink, str]] = []
+    for image_description, markdown_links in image_description_and_references.items():
+        for markdown_link in markdown_links:
+            if image_description is None:
+                if markdown_link.type == "link":
+                    replacements.append((markdown_link, markdown_link.link_markdown))
+                elif markdown_link.type == "image":
+                    replacements.append((markdown_link, f""))
+            else:
+                replacements.append((
+                    markdown_link,
+                    description_format.format(
+                        image_summary=image_description.replace("\n", " "),
+                        inline_text=markdown_link.inline_text,
+                        **markdown_link._asdict(),
+                    ),
+                ))
+
+    return MarkdownLink.replace(markdown_text, replacements)