chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +41 -4
- chatterer/common_types/__init__.py +21 -0
- chatterer/common_types/io.py +19 -0
- chatterer/interactive.py +353 -0
- chatterer/language_model.py +129 -252
- chatterer/messages.py +13 -1
- chatterer/tools/__init__.py +27 -9
- chatterer/tools/{webpage_to_markdown/utils.py → caption_markdown_images.py} +158 -108
- chatterer/tools/convert_pdf_to_markdown.py +302 -0
- chatterer/tools/convert_to_text.py +49 -65
- chatterer/tools/upstage_document_parser.py +705 -0
- chatterer/tools/{webpage_to_markdown/playwright_bot.py → webpage_to_markdown.py} +197 -107
- chatterer/tools/youtube.py +2 -1
- chatterer/utils/__init__.py +4 -1
- chatterer/utils/{image.py → base64_image.py} +56 -62
- chatterer/utils/bytesio.py +59 -0
- chatterer/utils/cli.py +476 -0
- chatterer/utils/code_agent.py +137 -38
- chatterer/utils/imghdr.py +148 -0
- chatterer-0.1.14.dist-info/METADATA +387 -0
- chatterer-0.1.14.dist-info/RECORD +34 -0
- chatterer/tools/webpage_to_markdown/__init__.py +0 -4
- chatterer-0.1.12.dist-info/METADATA +0 -170
- chatterer-0.1.12.dist-info/RECORD +0 -27
- {chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/WHEEL +0 -0
- {chatterer-0.1.12.dist-info → chatterer-0.1.14.dist-info}/top_level.txt +0 -0
@@ -1,80 +1,24 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
1
|
import os.path
|
4
2
|
import re
|
5
|
-
from
|
3
|
+
from asyncio import gather
|
4
|
+
from traceback import format_exception_only, print_exc
|
6
5
|
from typing import (
|
6
|
+
Awaitable,
|
7
|
+
Callable,
|
7
8
|
ClassVar,
|
8
9
|
Literal,
|
9
10
|
NamedTuple,
|
10
11
|
NewType,
|
11
|
-
NotRequired,
|
12
12
|
Optional,
|
13
13
|
Self,
|
14
|
-
Sequence,
|
15
|
-
TypeAlias,
|
16
|
-
TypedDict,
|
17
14
|
TypeGuard,
|
18
15
|
cast,
|
19
16
|
)
|
20
17
|
from urllib.parse import urljoin, urlparse
|
21
18
|
|
22
|
-
import
|
23
|
-
import playwright.sync_api
|
24
|
-
from pydantic import BaseModel, Field
|
25
|
-
|
26
|
-
from ...utils.image import Base64Image, ImageProcessingConfig
|
27
|
-
|
28
|
-
|
29
|
-
class SelectedLineRanges(BaseModel):
|
30
|
-
line_ranges: list[str] = Field(description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']")
|
31
|
-
|
32
|
-
|
33
|
-
class PlaywrightLaunchOptions(TypedDict):
|
34
|
-
executable_path: NotRequired[str | Path]
|
35
|
-
channel: NotRequired[str]
|
36
|
-
args: NotRequired[Sequence[str]]
|
37
|
-
ignore_default_args: NotRequired[bool | Sequence[str]]
|
38
|
-
handle_sigint: NotRequired[bool]
|
39
|
-
handle_sigterm: NotRequired[bool]
|
40
|
-
handle_sighup: NotRequired[bool]
|
41
|
-
timeout: NotRequired[float]
|
42
|
-
env: NotRequired[dict[str, str | float | bool]]
|
43
|
-
headless: NotRequired[bool]
|
44
|
-
devtools: NotRequired[bool]
|
45
|
-
proxy: NotRequired[playwright.sync_api.ProxySettings]
|
46
|
-
downloads_path: NotRequired[str | Path]
|
47
|
-
slow_mo: NotRequired[float]
|
48
|
-
traces_dir: NotRequired[str | Path]
|
49
|
-
chromium_sandbox: NotRequired[bool]
|
50
|
-
firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
|
51
|
-
|
52
|
-
|
53
|
-
class PlaywrightPersistencyOptions(TypedDict):
|
54
|
-
user_data_dir: NotRequired[str | Path]
|
55
|
-
storage_state: NotRequired[playwright.sync_api.StorageState]
|
56
|
-
|
57
|
-
|
58
|
-
class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions): ...
|
19
|
+
from chatterer.language_model import Chatterer
|
59
20
|
|
60
|
-
|
61
|
-
def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
|
62
|
-
return {"headless": True}
|
63
|
-
|
64
|
-
|
65
|
-
class _TrackingInlineState(mistune.InlineState):
|
66
|
-
meta_offset: int = 0 # Where in the original text does self.src start?
|
67
|
-
|
68
|
-
def copy(self) -> Self:
|
69
|
-
new_state = self.__class__(self.env)
|
70
|
-
new_state.src = self.src
|
71
|
-
new_state.tokens = []
|
72
|
-
new_state.in_image = self.in_image
|
73
|
-
new_state.in_link = self.in_link
|
74
|
-
new_state.in_emphasis = self.in_emphasis
|
75
|
-
new_state.in_strong = self.in_strong
|
76
|
-
new_state.meta_offset = self.meta_offset
|
77
|
-
return new_state
|
21
|
+
from ..utils.base64_image import Base64Image, ImageProcessingConfig
|
78
22
|
|
79
23
|
|
80
24
|
class MarkdownLink(NamedTuple):
|
@@ -93,7 +37,51 @@ class MarkdownLink(NamedTuple):
|
|
93
37
|
instead of letting the block parser break it up. That ensures that
|
94
38
|
link tokens cover the global positions of the entire input.
|
95
39
|
"""
|
96
|
-
|
40
|
+
|
41
|
+
from mistune import InlineParser, InlineState, Markdown
|
42
|
+
|
43
|
+
class _TrackingInlineState(InlineState):
|
44
|
+
meta_offset: int = 0 # Where in the original text does self.src start?
|
45
|
+
|
46
|
+
def copy(self) -> Self:
|
47
|
+
new_state = self.__class__(self.env)
|
48
|
+
new_state.src = self.src
|
49
|
+
new_state.tokens = []
|
50
|
+
new_state.in_image = self.in_image
|
51
|
+
new_state.in_link = self.in_link
|
52
|
+
new_state.in_emphasis = self.in_emphasis
|
53
|
+
new_state.in_strong = self.in_strong
|
54
|
+
new_state.meta_offset = self.meta_offset
|
55
|
+
return new_state
|
56
|
+
|
57
|
+
class _TrackingInlineParser(InlineParser):
|
58
|
+
state_cls: ClassVar = _TrackingInlineState
|
59
|
+
|
60
|
+
def parse_link( # pyright: ignore[reportIncompatibleMethodOverride]
|
61
|
+
self, m: re.Match[str], state: _TrackingInlineState
|
62
|
+
) -> Optional[int]:
|
63
|
+
"""
|
64
|
+
Mistune calls parse_link with a match object for the link syntax
|
65
|
+
and the current inline state. If we successfully parse the link,
|
66
|
+
super().parse_link(...) returns the new position *within self.src*.
|
67
|
+
We add that to state.meta_offset for the global position.
|
68
|
+
|
69
|
+
Because parse_link in mistune might return None or an int, we only
|
70
|
+
record positions if we get an int back (meaning success).
|
71
|
+
"""
|
72
|
+
offset = state.meta_offset
|
73
|
+
new_pos: int | None = super().parse_link(m, state)
|
74
|
+
if new_pos is not None:
|
75
|
+
# We have successfully parsed a link.
|
76
|
+
# The link token we just added should be the last token in state.tokens:
|
77
|
+
if state.tokens:
|
78
|
+
token = state.tokens[-1]
|
79
|
+
# The local end is new_pos in the substring.
|
80
|
+
# So the global start/end in the *original* text is offset + local positions.
|
81
|
+
token["global_pos"] = (offset + m.start(), offset + new_pos)
|
82
|
+
return new_pos
|
83
|
+
|
84
|
+
md = Markdown(inline=_TrackingInlineParser())
|
97
85
|
# Create an inline state that references the full text.
|
98
86
|
state = _TrackingInlineState({})
|
99
87
|
state.src = markdown_text
|
@@ -155,36 +143,102 @@ class MarkdownLink(NamedTuple):
|
|
155
143
|
results.append(cls(type, url, text, title, start, end))
|
156
144
|
if "children" in token and _children_typeguard(children := token["children"]):
|
157
145
|
results.extend(cls._extract_links(children, referer_url))
|
158
|
-
|
159
146
|
return results
|
160
147
|
|
161
148
|
|
162
|
-
|
163
|
-
|
149
|
+
# Mapping from an image description (or None) to the Markdown links it applies to.
# In this file the None key is used for links that have no generated description.
ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
# Distinct NewType over the same mapping; this is the type `_replace_images` accepts.
ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
|
164
151
|
|
165
|
-
def parse_link( # pyright: ignore[reportIncompatibleMethodOverride]
|
166
|
-
self, m: re.Match[str], state: _TrackingInlineState
|
167
|
-
) -> Optional[int]:
|
168
|
-
"""
|
169
|
-
Mistune calls parse_link with a match object for the link syntax
|
170
|
-
and the current inline state. If we successfully parse the link,
|
171
|
-
super().parse_link(...) returns the new position *within self.src*.
|
172
|
-
We add that to state.meta_offset for the global position.
|
173
152
|
|
174
|
-
|
175
|
-
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
153
|
+
def caption_markdown_images(
    markdown_text: str,
    headers: dict[str, str],
    image_processing_config: ImageProcessingConfig,
    description_format: str,
    image_description_instruction: str,
    chatterer: Chatterer,
    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
) -> str:
    """
    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

    Args:
        markdown_text: The Markdown source to rewrite.
        headers: Headers forwarded to the image loader; the "Referer" entry is also
            used by the link collector to resolve relative URLs.
        image_processing_config: Forwarded to the image loader.
        description_format: Forwarded unchanged to `_replace_images`.
        image_description_instruction: Instruction passed to `chatterer.describe_image`.
        chatterer: Language model wrapper used to describe each image.
        img_bytes_fetcher: Optional custom fetcher, forwarded to the image loader.

    Returns:
        The Markdown text produced by `_replace_images`.

    Notes:
        Describing an image is best-effort: if the model call raises, the traceback
        is printed and that image's links are left out of the replacement mapping.
    """
    image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = _get_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=headers,
        config=image_processing_config,
        img_bytes_fetcher=img_bytes_fetcher,
    )

    image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
    for image_url, markdown_links in image_url_and_markdown_links.items():
        if image_url is None:
            # Links without image data are carried through under the None key.
            image_description_and_references.setdefault(None, []).extend(markdown_links)
            continue

        try:
            image_summary: str = chatterer.describe_image(
                image_url=image_url.data_uri,
                instruction=image_description_instruction,
            )
        except Exception:
            print_exc()
            continue

        # Merge instead of assign: two different images may receive the same
        # description, and plain assignment would drop the earlier image's links.
        image_description_and_references.setdefault(image_summary, []).extend(markdown_links)

    return _replace_images(
        markdown_text=markdown_text,
        image_description_and_references=image_description_and_references,
        description_format=description_format,
    )
|
192
|
+
|
193
|
+
|
194
|
+
async def acaption_markdown_images(
    markdown_text: str,
    headers: dict[str, str],
    image_processing_config: ImageProcessingConfig,
    description_format: str,
    image_description_instruction: str,
    chatterer: Chatterer,
    img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
) -> str:
    """
    Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.

    Asynchronous counterpart of `caption_markdown_images`: all image descriptions
    are requested concurrently with `asyncio.gather`.

    Args:
        markdown_text: The Markdown source to rewrite.
        headers: Headers forwarded to the image loader; the "Referer" entry is also
            used by the link collector to resolve relative URLs.
        image_processing_config: Forwarded to the image loader.
        description_format: Forwarded unchanged to `_replace_images`.
        image_description_instruction: Instruction passed to `chatterer.adescribe_image`.
        chatterer: Language model wrapper used to describe each image.
        img_bytes_fetcher: Optional custom async fetcher, forwarded to the image loader.

    Returns:
        The Markdown text produced by `_replace_images`.

    Notes:
        Describing an image is best-effort: if a description request raises, the
        exception is printed and that image's links are left out of the mapping.
    """
    image_url_and_markdown_links: dict[
        Optional[Base64Image], list[MarkdownLink]
    ] = await _aget_image_url_and_markdown_links(
        markdown_text=markdown_text,
        headers=headers,
        config=image_processing_config,
        img_bytes_fetcher=img_bytes_fetcher,
    )

    async def _no_description() -> None:
        # Placeholder awaitable for the None key so results stay aligned with the dict order.
        return None

    coros: list[Awaitable[Optional[str]]] = [
        chatterer.adescribe_image(image_url=image_url.data_uri, instruction=image_description_instruction)
        if image_url is not None
        else _no_description()
        for image_url in image_url_and_markdown_links
    ]
    # return_exceptions=True keeps one failed description from cancelling the others.
    outcomes = await gather(*coros, return_exceptions=True)

    image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
    for markdown_links, image_summary in zip(image_url_and_markdown_links.values(), outcomes):
        if isinstance(image_summary, BaseException):
            # format_exception_only returns a list of newline-terminated strings;
            # join them so the message is printed as text rather than as a list repr.
            print("".join(format_exception_only(type(image_summary), image_summary)), end="")
            continue
        # Merge instead of assign: two different images may receive the same
        # description, and a dict comprehension would drop the earlier image's links.
        image_description_and_references.setdefault(image_summary, []).extend(markdown_links)

    return _replace_images(
        markdown_text=markdown_text,
        image_description_and_references=image_description_and_references,
        description_format=description_format,
    )
|
188
242
|
|
189
243
|
|
190
244
|
# --------------------------------------------------------------------
|
@@ -263,11 +317,11 @@ def _to_absolute_path(path: str, referer: str) -> str:
|
|
263
317
|
return os.path.abspath(combined)
|
264
318
|
|
265
319
|
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
320
|
+
def _get_image_url_and_markdown_links(
|
321
|
+
markdown_text: str,
|
322
|
+
headers: dict[str, str],
|
323
|
+
config: ImageProcessingConfig,
|
324
|
+
img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
|
271
325
|
) -> dict[Optional[Base64Image], list[MarkdownLink]]:
|
272
326
|
image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
|
273
327
|
for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
|
@@ -275,7 +329,9 @@ def get_image_url_and_markdown_links(
|
|
275
329
|
image_matches.setdefault(None, []).append(markdown_link)
|
276
330
|
continue
|
277
331
|
|
278
|
-
image_data = Base64Image.from_url_or_path(
|
332
|
+
image_data = Base64Image.from_url_or_path(
|
333
|
+
markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
|
334
|
+
)
|
279
335
|
if not image_data:
|
280
336
|
image_matches.setdefault(None, []).append(markdown_link)
|
281
337
|
continue
|
@@ -283,16 +339,19 @@ def get_image_url_and_markdown_links(
|
|
283
339
|
return image_matches
|
284
340
|
|
285
341
|
|
286
|
-
async def
|
287
|
-
markdown_text: str,
|
342
|
+
async def _aget_image_url_and_markdown_links(
|
343
|
+
markdown_text: str,
|
344
|
+
headers: dict[str, str],
|
345
|
+
config: ImageProcessingConfig,
|
346
|
+
img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
|
288
347
|
) -> dict[Optional[Base64Image], list[MarkdownLink]]:
|
289
348
|
image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
|
290
349
|
for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
|
291
350
|
if markdown_link.type == "link":
|
292
351
|
image_matches.setdefault(None, []).append(markdown_link)
|
293
352
|
continue
|
294
|
-
image_data = await Base64Image.
|
295
|
-
markdown_link.url, headers=headers, config=config,
|
353
|
+
image_data = await Base64Image.afrom_url_or_path(
|
354
|
+
markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
|
296
355
|
)
|
297
356
|
if not image_data:
|
298
357
|
image_matches.setdefault(None, []).append(markdown_link)
|
@@ -301,7 +360,7 @@ async def aget_image_url_and_markdown_links(
|
|
301
360
|
return image_matches
|
302
361
|
|
303
362
|
|
304
|
-
def
|
363
|
+
def _replace_images(
|
305
364
|
markdown_text: str, image_description_and_references: ImageDescriptionAndReferences, description_format: str
|
306
365
|
) -> str:
|
307
366
|
replacements: list[tuple[MarkdownLink, str]] = []
|
@@ -323,12 +382,3 @@ def replace_images(
|
|
323
382
|
))
|
324
383
|
|
325
384
|
return MarkdownLink.replace(markdown_text, replacements)
|
326
|
-
|
327
|
-
|
328
|
-
ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
|
329
|
-
ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
|
330
|
-
WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
|
331
|
-
|
332
|
-
DEFAULT_UA: str = (
|
333
|
-
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
|
334
|
-
)
|
@@ -0,0 +1,302 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import re
|
5
|
+
from contextlib import contextmanager
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
|
8
|
+
|
9
|
+
from ..language_model import Chatterer, HumanMessage
|
10
|
+
from ..utils.base64_image import Base64Image
|
11
|
+
from ..utils.bytesio import PathOrReadable, read_bytes_stream
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
|
15
|
+
|
16
|
+
# Setup basic logging
# NOTE: this runs at import time and configures the root logger. Per the logging
# documentation, basicConfig() does nothing if the root logger already has handlers.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Matches a triple-backtick fenced block, optionally tagged "markdown" followed by a
# newline, and captures the fence contents as group 1. DOTALL lets the capture span lines.
MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
|
20
|
+
|
21
|
+
|
22
|
+
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
    Processes PDFs page by page, providing the LLM with both the extracted raw
    text and a rendered image of the page to handle complex layouts. It maintains
    context between pages by feeding the *tail end* of the previously generated
    Markdown back into the prompt for the next page to ensure smooth transitions.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""
    # max_context_tokens: Optional[int] = None  # This can be added later if needed

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Extracts the last N lines from the given markdown text.

        Returns None when there is no text, when the text is only whitespace, or
        when `context_tail_lines` is not positive.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # Get the last N lines, or fewer if the text is shorter
        tail_lines = lines[-self.context_tail_lines :]
        return "\n".join(tail_lines)

    def _build_instruction(
        self,
        page_text: str,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> str:
        """Builds the text part of the prompt for one page.

        Args:
            page_text: Raw text extracted from the page; a placeholder sentence is
                used when it is empty.
            previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
            page_number: 0-based page index (shown to the model as 1-based).
            total_pages: Total page count shown to the model.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use ``.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
* Examine the [End of Previous Page Markdown] if provided.
* If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
* For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
* **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
* If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{page_text if page_text else "No text extracted from this page."}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # This must be an f-string: without the prefix the model receives the
            # literal text "{page_number + 1}" instead of the page number.
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
        return instruction

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,  # Renamed for clarity
        page_number: int = 0,  # For context, 0-indexed
        total_pages: int = 1,
    ) -> HumanMessage:
        """
        Formats the content list for the HumanMessage input to the LLM.
        Uses only the tail end of the previous page's markdown for context.
        """
        instruction = self._build_instruction(
            page_text=page_text,
            previous_markdown_context_tail=previous_markdown_context_tail,
            page_number=page_number,
            total_pages=total_pages,
        )
        # Structure for multimodal input
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    def _extract_markdown(self, response: str, page_idx: int) -> str:
        """Pulls the Markdown payload for one page out of the raw LLM response."""
        markdowns: list[str] = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
        if markdowns:
            return "\n".join(markdowns)

        # Fallback: assume the whole response is markdown if no ```markdown blocks found
        current_page_markdown = response.strip()
        if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
            # Basic cleanup if it just missed the 'markdown' language tag
            current_page_markdown = current_page_markdown[3:-3].strip()
        elif "```" in current_page_markdown:
            logger.warning(
                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
            )
        return current_page_markdown

    def convert(
        self,
        pdf_input: Union[str, "Document"],
        page_indices: Optional[Union[Iterable[int], int]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.
        Args:
            pdf_input: Path to the PDF file or a pymupdf.Document object.
            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
                          Can be a single int or an iterable of ints.
            progress_callback: An optional function called with
                          (1-based position of the page just attempted, total_pages_to_process)
                          after each page, whether or not that page succeeded.
        Returns:
            A single string containing the concatenated Markdown output for the processed pages.
            Pages whose conversion raised are logged and omitted from the output.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(_get_page_indices(page_indices, len(doc)))
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # --- Context Tracking ---
            previous_page_markdown: Optional[str] = None  # Full markdown of the last successful page

            # Pre-process all pages (optional optimization)
            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            # A plain for-loop guarantees the page always advances; a failing page
            # is logged and skipped instead of being retried indefinitely.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    # --- Get Context Tail ---
                    context_tail = self._get_context_tail(previous_page_markdown)

                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=context_tail,  # Pass only the tail
                        page_number=page_idx,
                        total_pages=len(doc),
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")

                    response = self.chatterer([message])
                    current_page_markdown = self._extract_markdown(response, page_idx)

                    logger.debug(f"Received response from LLM for page index {page_idx}.")

                    # --- Store result and update context ---
                    full_markdown_output.append(current_page_markdown)
                    # Update the *full* previous markdown for the *next* iteration's tail calculation
                    previous_page_markdown = current_page_markdown

                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)

                # Progress callback
                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            # Join with double newline, potentially adjust based on how well continuations work
            return "\n\n".join(full_markdown_output).strip()  # Add strip() to remove leading/trailing whitespace
|
207
|
+
|
208
|
+
|
209
|
+
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied on both axes when rasterising. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): Pages to render, resolved by
            `_get_page_indices`. None selects every page; an int selects one page.

    Returns:
        dict[int, bytes]: Encoded image bytes keyed by page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    # One shared transform: the same zoom is applied horizontally and vertically.
    scale = Matrix(zoom, zoom)
    rendered: dict[int, bytes] = {}
    for idx in _get_page_indices(page_indices, len(doc)):
        pixmap = get_pixmap(page=doc[idx], matrix=scale)
        encoded = pixmap.tobytes(output=output, jpg_quality=jpg_quality)  # pyright: ignore[reportUnknownArgumentType]
        rendered[idx] = bytes(encoded)
    return rendered
|
244
|
+
|
245
|
+
|
246
|
+
def extract_text_from_pdf(
    doc: "Document",
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): Pages to read, resolved by
            `_get_page_indices`. None selects every page; an int selects one page.

    Returns:
        dict[int, str]: Stripped page text keyed by page index.
    """
    texts: dict[int, str] = {}
    for idx in _get_page_indices(page_indices, len(doc)):
        raw_text = doc[idx].get_textpage().extractText()  # pyright: ignore[reportUnknownMemberType]
        texts[idx] = raw_text.strip()
    return texts
|
266
|
+
|
267
|
+
|
268
|
+
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Open a PDF document from a path/stream, or pass through an existing Document.

    Args:
        pdf_input (PathOrReadable | Document): The PDF source, or an already opened
            pymupdf.Document.

    Yields:
        Document: The document to work with. A document opened by this function is
        closed when the context exits, including when the body raises. A Document
        supplied by the caller is yielded as-is and never closed here.

    Raises:
        FileNotFoundError: If `read_bytes_stream` yields no stream for `pdf_input`.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller owns this document; do not close it.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    try:
        yield doc
    finally:
        # Without try/finally an exception inside the `with` body would skip the close.
        doc.close()
|
293
|
+
|
294
|
+
|
295
|
+
def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
|
296
|
+
"""Helper function to handle page indices for PDF conversion."""
|
297
|
+
if page_indices is None:
|
298
|
+
return range(max_doc_pages)
|
299
|
+
elif isinstance(page_indices, int):
|
300
|
+
return [page_indices]
|
301
|
+
else:
|
302
|
+
return [i for i in page_indices if 0 <= i < max_doc_pages]
|