chatterer 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,80 +1,24 @@
- from __future__ import annotations
-
  import os.path
  import re
- from pathlib import Path
+ from asyncio import gather
+ from traceback import format_exception_only, print_exc
  from typing import (
+     Awaitable,
+     Callable,
      ClassVar,
      Literal,
      NamedTuple,
      NewType,
-     NotRequired,
      Optional,
      Self,
-     Sequence,
-     TypeAlias,
-     TypedDict,
      TypeGuard,
      cast,
  )
  from urllib.parse import urljoin, urlparse

- import mistune
- import playwright.sync_api
- from pydantic import BaseModel, Field
-
- from ...utils.image import Base64Image, ImageProcessingConfig
-
-
- class SelectedLineRanges(BaseModel):
-     line_ranges: list[str] = Field(description="List of inclusive line ranges, e.g., ['1-3', '5-5', '7-10']")
-
-
- class PlaywrightLaunchOptions(TypedDict):
-     executable_path: NotRequired[str | Path]
-     channel: NotRequired[str]
-     args: NotRequired[Sequence[str]]
-     ignore_default_args: NotRequired[bool | Sequence[str]]
-     handle_sigint: NotRequired[bool]
-     handle_sigterm: NotRequired[bool]
-     handle_sighup: NotRequired[bool]
-     timeout: NotRequired[float]
-     env: NotRequired[dict[str, str | float | bool]]
-     headless: NotRequired[bool]
-     devtools: NotRequired[bool]
-     proxy: NotRequired[playwright.sync_api.ProxySettings]
-     downloads_path: NotRequired[str | Path]
-     slow_mo: NotRequired[float]
-     traces_dir: NotRequired[str | Path]
-     chromium_sandbox: NotRequired[bool]
-     firefox_user_prefs: NotRequired[dict[str, str | float | bool]]
-
-
- class PlaywrightPersistencyOptions(TypedDict):
-     user_data_dir: NotRequired[str | Path]
-     storage_state: NotRequired[playwright.sync_api.StorageState]
-
-
- class PlaywrightOptions(PlaywrightLaunchOptions, PlaywrightPersistencyOptions): ...
+ from chatterer.language_model import Chatterer

-
- def get_default_playwright_launch_options() -> PlaywrightLaunchOptions:
-     return {"headless": True}
-
-
- class _TrackingInlineState(mistune.InlineState):
-     meta_offset: int = 0  # Where in the original text does self.src start?
-
-     def copy(self) -> Self:
-         new_state = self.__class__(self.env)
-         new_state.src = self.src
-         new_state.tokens = []
-         new_state.in_image = self.in_image
-         new_state.in_link = self.in_link
-         new_state.in_emphasis = self.in_emphasis
-         new_state.in_strong = self.in_strong
-         new_state.meta_offset = self.meta_offset
-         return new_state
+ from ..utils.base64_image import Base64Image, ImageProcessingConfig


  class MarkdownLink(NamedTuple):
@@ -93,7 +37,51 @@ class MarkdownLink(NamedTuple):
          instead of letting the block parser break it up. That ensures that
          link tokens cover the global positions of the entire input.
          """
-         md = mistune.Markdown(inline=_TrackingInlineParser())
+
+         from mistune import InlineParser, InlineState, Markdown
+
+         class _TrackingInlineState(InlineState):
+             meta_offset: int = 0  # Where in the original text does self.src start?
+
+             def copy(self) -> Self:
+                 new_state = self.__class__(self.env)
+                 new_state.src = self.src
+                 new_state.tokens = []
+                 new_state.in_image = self.in_image
+                 new_state.in_link = self.in_link
+                 new_state.in_emphasis = self.in_emphasis
+                 new_state.in_strong = self.in_strong
+                 new_state.meta_offset = self.meta_offset
+                 return new_state
+
+         class _TrackingInlineParser(InlineParser):
+             state_cls: ClassVar = _TrackingInlineState
+
+             def parse_link(  # pyright: ignore[reportIncompatibleMethodOverride]
+                 self, m: re.Match[str], state: _TrackingInlineState
+             ) -> Optional[int]:
+                 """
+                 Mistune calls parse_link with a match object for the link syntax
+                 and the current inline state. If we successfully parse the link,
+                 super().parse_link(...) returns the new position *within self.src*.
+                 We add that to state.meta_offset for the global position.
+
+                 Because parse_link in mistune might return None or an int, we only
+                 record positions if we get an int back (meaning success).
+                 """
+                 offset = state.meta_offset
+                 new_pos: int | None = super().parse_link(m, state)
+                 if new_pos is not None:
+                     # We have successfully parsed a link.
+                     # The link token we just added should be the last token in state.tokens:
+                     if state.tokens:
+                         token = state.tokens[-1]
+                         # The local end is new_pos in the substring.
+                         # So the global start/end in the *original* text is offset + local positions.
+                         token["global_pos"] = (offset + m.start(), offset + new_pos)
+                 return new_pos
+
+         md = Markdown(inline=_TrackingInlineParser())
          # Create an inline state that references the full text.
          state = _TrackingInlineState({})
          state.src = markdown_text
@@ -155,36 +143,102 @@ class MarkdownLink(NamedTuple):
              results.append(cls(type, url, text, title, start, end))
              if "children" in token and _children_typeguard(children := token["children"]):
                  results.extend(cls._extract_links(children, referer_url))
-
          return results


- class _TrackingInlineParser(mistune.InlineParser):
-     state_cls: ClassVar = _TrackingInlineState
+ ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
+ ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)

-     def parse_link(  # pyright: ignore[reportIncompatibleMethodOverride]
-         self, m: re.Match[str], state: _TrackingInlineState
-     ) -> Optional[int]:
-         """
-         Mistune calls parse_link with a match object for the link syntax
-         and the current inline state. If we successfully parse the link,
-         super().parse_link(...) returns the new position *within self.src*.
-         We add that to state.meta_offset for the global position.

-         Because parse_link in mistune might return None or an int, we only
-         record positions if we get an int back (meaning success).
-         """
-         offset = state.meta_offset
-         new_pos: int | None = super().parse_link(m, state)
-         if new_pos is not None:
-             # We have successfully parsed a link.
-             # The link token we just added should be the last token in state.tokens:
-             if state.tokens:
-                 token = state.tokens[-1]
-                 # The local end is new_pos in the substring.
-                 # So the global start/end in the *original* text is offset + local positions.
-                 token["global_pos"] = (offset + m.start(), offset + new_pos)
-         return new_pos
+ def caption_markdown_images(
+     markdown_text: str,
+     headers: dict[str, str],
+     image_processing_config: ImageProcessingConfig,
+     description_format: str,
+     image_description_instruction: str,
+     chatterer: Chatterer,
+     img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
+ ) -> str:
+     """
+     Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
+     """
+     image_url_and_markdown_links: dict[Optional[Base64Image], list[MarkdownLink]] = _get_image_url_and_markdown_links(
+         markdown_text=markdown_text,
+         headers=headers,
+         config=image_processing_config,
+         img_bytes_fetcher=img_bytes_fetcher,
+     )
+
+     image_description_and_references: ImageDescriptionAndReferences = ImageDescriptionAndReferences({})
+     for image_url, markdown_links in image_url_and_markdown_links.items():
+         if image_url is not None:
+             try:
+                 image_summary: str = chatterer.describe_image(
+                     image_url=image_url.data_uri,
+                     instruction=image_description_instruction,
+                 )
+             except Exception:
+                 print_exc()
+                 continue
+             image_description_and_references[image_summary] = markdown_links
+         else:
+             image_description_and_references[None] = markdown_links
+
+     return _replace_images(
+         markdown_text=markdown_text,
+         image_description_and_references=image_description_and_references,
+         description_format=description_format,
+     )
+
+
+ async def acaption_markdown_images(
+     markdown_text: str,
+     headers: dict[str, str],
+     image_processing_config: ImageProcessingConfig,
+     description_format: str,
+     image_description_instruction: str,
+     chatterer: Chatterer,
+     img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
+ ) -> str:
+     """
+     Replace image URLs in Markdown text with their alt text and generate descriptions using a language model.
+     """
+     image_url_and_markdown_links: dict[
+         Optional[Base64Image], list[MarkdownLink]
+     ] = await _aget_image_url_and_markdown_links(
+         markdown_text=markdown_text,
+         headers=headers,
+         config=image_processing_config,
+         img_bytes_fetcher=img_bytes_fetcher,
+     )
+
+     async def dummy() -> None:
+         pass
+
+     def _handle_exception(e: Optional[str | BaseException]) -> TypeGuard[Optional[str]]:
+         if isinstance(e, BaseException):
+             print(format_exception_only(type(e), e))
+             return False
+         return True
+
+     coros: list[Awaitable[Optional[str]]] = [
+         chatterer.adescribe_image(image_url=image_url.data_uri, instruction=image_description_instruction)
+         if image_url is not None
+         else dummy()
+         for image_url in image_url_and_markdown_links.keys()
+     ]
+
+     return _replace_images(
+         markdown_text=markdown_text,
+         image_description_and_references=ImageDescriptionAndReferences({
+             image_summary: markdown_links
+             for markdown_links, image_summary in zip(
+                 image_url_and_markdown_links.values(), await gather(*coros, return_exceptions=True)
+             )
+             if _handle_exception(image_summary)
+         }),
+         description_format=description_format,
+     )


  # --------------------------------------------------------------------
@@ -263,11 +317,11 @@ def _to_absolute_path(path: str, referer: str) -> str:
      return os.path.abspath(combined)


- # =======================
-
-
- def get_image_url_and_markdown_links(
-     markdown_text: str, headers: dict[str, str], config: ImageProcessingConfig
+ def _get_image_url_and_markdown_links(
+     markdown_text: str,
+     headers: dict[str, str],
+     config: ImageProcessingConfig,
+     img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], bytes]] = None,
  ) -> dict[Optional[Base64Image], list[MarkdownLink]]:
      image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
      for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
@@ -275,7 +329,9 @@ def get_image_url_and_markdown_links(
              image_matches.setdefault(None, []).append(markdown_link)
              continue

-         image_data = Base64Image.from_url_or_path(markdown_link.url, headers=headers, config=config)
+         image_data = Base64Image.from_url_or_path(
+             markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
+         )
          if not image_data:
              image_matches.setdefault(None, []).append(markdown_link)
              continue
@@ -283,16 +339,19 @@ def get_image_url_and_markdown_links(
      return image_matches


- async def aget_image_url_and_markdown_links(
-     markdown_text: str, headers: dict[str, str], config: ImageProcessingConfig
+ async def _aget_image_url_and_markdown_links(
+     markdown_text: str,
+     headers: dict[str, str],
+     config: ImageProcessingConfig,
+     img_bytes_fetcher: Optional[Callable[[str, dict[str, str]], Awaitable[bytes]]] = None,
  ) -> dict[Optional[Base64Image], list[MarkdownLink]]:
      image_matches: dict[Optional[Base64Image], list[MarkdownLink]] = {}
      for markdown_link in MarkdownLink.from_markdown(markdown_text=markdown_text, referer_url=headers.get("Referer")):
          if markdown_link.type == "link":
              image_matches.setdefault(None, []).append(markdown_link)
              continue
-         image_data = await Base64Image.from_url_or_path(
-             markdown_link.url, headers=headers, config=config, return_coro=True
+         image_data = await Base64Image.afrom_url_or_path(
+             markdown_link.url, headers=headers, config=config, img_bytes_fetcher=img_bytes_fetcher
          )
          if not image_data:
              image_matches.setdefault(None, []).append(markdown_link)
@@ -301,7 +360,7 @@ async def aget_image_url_and_markdown_links(
      return image_matches


- def replace_images(
+ def _replace_images(
      markdown_text: str, image_description_and_references: ImageDescriptionAndReferences, description_format: str
  ) -> str:
      replacements: list[tuple[MarkdownLink, str]] = []
@@ -323,12 +382,3 @@ def replace_images(
          ))

      return MarkdownLink.replace(markdown_text, replacements)
-
-
- ImageDataAndReferences = dict[Optional[str], list[MarkdownLink]]
- ImageDescriptionAndReferences = NewType("ImageDescriptionAndReferences", ImageDataAndReferences)
- WaitUntil: TypeAlias = Literal["commit", "domcontentloaded", "load", "networkidle"]
-
- DEFAULT_UA: str = (
-     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Safari/537.36"
- )
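
The new module-level helpers caption_markdown_images and acaption_markdown_images take the Markdown text, request headers, an ImageProcessingConfig, a description format string, an instruction for the model, and a Chatterer instance. A minimal sketch of how the synchronous variant might be called follows; the import paths and the placeholder values for description_format and the instruction are assumptions, since neither the module's file name nor the format keys consumed by _replace_images appear in this diff:

    from chatterer.language_model import Chatterer  # path taken from the diff above
    # caption_markdown_images and ImageProcessingConfig live in the module shown above;
    # its import path is not named in this diff, so imports here are illustrative only.

    def caption_readme(markdown_text: str, chatterer: Chatterer, config: "ImageProcessingConfig") -> str:
        # Each image link is fetched, described by the LLM, and substituted
        # according to description_format (placeholder value below).
        return caption_markdown_images(
            markdown_text=markdown_text,
            headers={"User-Agent": "my-docs-bot/1.0"},  # hypothetical header set
            image_processing_config=config,
            description_format="{description}",  # assumed key; depends on _replace_images
            image_description_instruction="Describe this image in one or two sentences.",
            chatterer=chatterer,
        )

The async twin, acaption_markdown_images, has the same signature apart from an awaitable img_bytes_fetcher and is awaited instead; failed descriptions are logged via format_exception_only and the corresponding links fall back to their alt text.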
@@ -0,0 +1,302 @@
+ from __future__ import annotations
+
+ import logging
+ import re
+ from contextlib import contextmanager
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
+
+ from ..language_model import Chatterer, HumanMessage
+ from ..utils.base64_image import Base64Image
+ from ..utils.bytesio import PathOrReadable, read_bytes_stream
+
+ if TYPE_CHECKING:
+     from pymupdf import Document  # pyright: ignore[reportMissingTypeStubs]
+
+ # Setup basic logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+ MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
+
+
+ @dataclass
+ class PdfToMarkdown:
+     """
+     Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
+     Processes PDFs page by page, providing the LLM with both the extracted raw
+     text and a rendered image of the page to handle complex layouts. It maintains
+     context between pages by feeding the *tail end* of the previously generated
+     Markdown back into the prompt for the next page to ensure smooth transitions.
+     """
+
+     chatterer: Chatterer
+     """An instance of the Chatterer class configured with a vision-capable model."""
+     image_zoom: float = 2.0
+     """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
+     image_format: Literal["jpg", "jpeg", "png"] = "png"
+     """The format for the rendered image ('png', 'jpeg', or 'jpg')."""
+     image_jpg_quality: int = 95
+     """Quality for JPEG images (if used)."""
+     context_tail_lines: int = 10
+     """Number of lines from the end of the previous page's Markdown to use as context."""
+     # max_context_tokens: Optional[int] = None  # This can be added later if needed
+
+     def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
+         """Extracts the last N lines from the given markdown text."""
+         if not markdown_text or self.context_tail_lines <= 0:
+             return None
+         lines = markdown_text.strip().splitlines()
+         if not lines:
+             return None
+         # Get the last N lines, or fewer if the text is shorter
+         tail_lines = lines[-self.context_tail_lines :]
+         return "\n".join(tail_lines)
+
+     def _format_prompt_content(
+         self,
+         page_text: str,
+         page_image_b64: Base64Image,
+         previous_markdown_context_tail: Optional[str] = None,  # Renamed for clarity
+         page_number: int = 0,  # For context, 0-indexed
+         total_pages: int = 1,
+     ) -> HumanMessage:
+         """
+         Formats the content list for the HumanMessage input to the LLM.
+         Uses only the tail end of the previous page's markdown for context.
+         """
+         # Construct the main instruction prompt
+         instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
+ 1. The raw text extracted from the page ([Raw Text]).
+ 2. A rendered image of the page ([Rendered Image]) showing its visual layout.
+ 3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.
+
+ **Conversion Requirements:**
+ * **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
+ * **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
+ * **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
+ * **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
+ * **Continuity (Crucial):**
+     * Examine the [End of Previous Page Markdown] if provided.
+     * If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
+     * For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
+     * **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
+     * If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.
+
+ **Input Data:**
+ [Raw Text]
+ ```
+ {page_text if page_text else "No text extracted from this page."}
+ ```
+ [Rendered Image]
+ (See attached image)
+ """
+         if previous_markdown_context_tail:
+             instruction += f"""[End of Previous Page Markdown]
+ ```markdown
+ ... (content from previous page ends with) ...
+ {previous_markdown_context_tail}
+ ```
+ **Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
+         else:
+             instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."
+
+         instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
+
+         # Structure for multimodal input
+         return HumanMessage(content=[instruction, page_image_b64.data_uri_content])
+
+     def convert(
+         self,
+         pdf_input: Union[str, "Document"],
+         page_indices: Optional[Union[Iterable[int], int]] = None,
+         progress_callback: Optional[Callable[[int, int], None]] = None,
+     ) -> str:
+         """
+         Converts a PDF document (or specific pages) to Markdown synchronously.
+         Args:
+             pdf_input: Path to the PDF file or a pymupdf.Document object.
+             page_indices: Specific 0-based page indices to convert. If None, converts all pages.
+                 Can be a single int or an iterable of ints.
+             progress_callback: An optional function to call with (current_page_index, total_pages_to_process)
+                 after each page is processed.
+         Returns:
+             A single string containing the concatenated Markdown output for the processed pages.
+         """
+         with open_pdf(pdf_input) as doc:
+             target_page_indices = list(_get_page_indices(page_indices, len(doc)))
+             total_pages_to_process = len(target_page_indices)
+             if total_pages_to_process == 0:
+                 logger.warning("No pages selected for processing.")
+                 return ""
+
+             full_markdown_output: List[str] = []
+             # --- Context Tracking ---
+             previous_page_markdown: Optional[str] = None  # Store the full markdown of the previous page
+
+             # Pre-process all pages (optional optimization)
+             logger.info("Extracting text and rendering images for selected pages...")
+             page_text_dict = extract_text_from_pdf(doc, target_page_indices)
+             page_image_dict = render_pdf_as_image(
+                 doc,
+                 page_indices=target_page_indices,
+                 zoom=self.image_zoom,
+                 output=self.image_format,
+                 jpg_quality=self.image_jpg_quality,
+             )
+             logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")
+
+             page_idx: int = target_page_indices.pop(0)  # Get the first page index
+             i: int = 1
+             while True:
+                 logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
+                 try:
+                     # --- Get Context Tail ---
+                     context_tail = self._get_context_tail(previous_page_markdown)
+
+                     message = self._format_prompt_content(
+                         page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
+                         page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
+                         previous_markdown_context_tail=context_tail,  # Pass only the tail
+                         page_number=page_idx,
+                         total_pages=len(doc),
+                     )
+                     logger.debug(f"Sending request to LLM for page index {page_idx}...")
+
+                     response = self.chatterer([message])
+                     # Extract markdown, handling potential lack of backticks
+                     markdowns: list[str] = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
+                     if markdowns:
+                         current_page_markdown = "\n".join(markdowns)
+                     else:
+                         # Fallback: assume the whole response is markdown if no ```markdown blocks found
+                         current_page_markdown = response.strip()
+                         if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
+                             # Basic cleanup if it just missed the 'markdown' language tag
+                             current_page_markdown = current_page_markdown[3:-3].strip()
+                         elif "```" in current_page_markdown:
+                             logger.warning(
+                                 f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
+                             )
+
+                     logger.debug(f"Received response from LLM for page index {page_idx}.")
+
+                     # --- Store result and update context ---
+                     full_markdown_output.append(current_page_markdown)
+                     # Update the *full* previous markdown for the *next* iteration's tail calculation
+                     previous_page_markdown = current_page_markdown
+
+                 except Exception as e:
+                     logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
+                     # Fall through so the loop below still advances to the next page
+
+                 # Progress callback
+                 if progress_callback:
+                     try:
+                         progress_callback(i, total_pages_to_process)
+                     except Exception as cb_err:
+                         logger.warning(f"Progress callback failed: {cb_err}")
+
+                 if not target_page_indices:
+                     break
+
+                 page_idx = target_page_indices.pop(0)  # Get the next page index
+                 i += 1  # Increment the page counter
+
+         # Join with double newline, potentially adjust based on how well continuations work
+         return "\n\n".join(full_markdown_output).strip()  # Add strip() to remove leading/trailing whitespace
+
+
+ def render_pdf_as_image(
+     doc: "Document",
+     zoom: float = 2.0,
+     output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
+     jpg_quality: int = 100,
+     page_indices: Iterable[int] | int | None = None,
+ ) -> dict[int, bytes]:
+     """
+     Convert PDF pages to images in bytes.
+
+     Args:
+         doc (Document): The PDF document to convert.
+         zoom (float): Zoom factor for the image resolution. Default is 2.0.
+         output (str): Output format for the image. Default is 'png'.
+         jpg_quality (int): Quality of JPEG images (1-100). Default is 100.
+         page_indices (Iterable[int] | int | None): Specific pages to convert. If None, all pages are converted.
+             If an int is provided, only that page is converted.
+
+     Returns:
+         dict[int, bytes]: A dictionary mapping page numbers to image bytes.
+     """
+     from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
+     from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]
+
+     images_bytes: dict[int, bytes] = {}
+     matrix = Matrix(zoom, zoom)  # Control output resolution
+     for page_idx in _get_page_indices(page_indices, len(doc)):
+         img_bytes = bytes(
+             get_pixmap(
+                 page=doc[page_idx],
+                 matrix=matrix,
+             ).tobytes(output=output, jpg_quality=jpg_quality)  # pyright: ignore[reportUnknownArgumentType]
+         )
+         images_bytes[page_idx] = img_bytes
+     return images_bytes
+
+
+ def extract_text_from_pdf(
+     doc: "Document",
+     page_indices: Iterable[int] | int | None = None,
+ ) -> dict[int, str]:
+     """Convert a PDF file to plain text.
+
+     Extracts the text of each requested page of a PDF file.
+
+     Args:
+         doc (Document): The PDF document to convert.
+         page_indices (Iterable[int] | int | None): Specific pages to convert. If None, all pages are converted.
+             If an int is provided, only that page is converted.
+
+     Returns:
+         dict[int, str]: A dictionary mapping page numbers to text content.
+     """
+     return {
+         page_idx: doc[page_idx].get_textpage().extractText().strip()  # pyright: ignore[reportUnknownMemberType]
+         for page_idx in _get_page_indices(page_indices, len(doc))
+     }
+
+
+ @contextmanager
+ def open_pdf(pdf_input: PathOrReadable | Document):
+     """Open a PDF document from a file path or use an existing Document object.
+
+     Args:
+         pdf_input (PathOrReadable | Document): The PDF file path or a pymupdf.Document object.
+
+     Yields:
+         Document: The opened document; it is closed on exit if it was opened here.
+     """
+     import pymupdf  # pyright: ignore[reportMissingTypeStubs]
+
+     should_close = True
+
+     if isinstance(pdf_input, pymupdf.Document):
+         should_close = False
+         doc = pdf_input
+     else:
+         with read_bytes_stream(pdf_input) as stream:
+             if stream is None:
+                 raise FileNotFoundError(pdf_input)
+             doc = pymupdf.Document(stream=stream.read())
+     yield doc
+     if should_close:
+         doc.close()
+
+
+ def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
+     """Helper function to handle page indices for PDF conversion."""
+     if page_indices is None:
+         return range(max_doc_pages)
+     elif isinstance(page_indices, int):
+         return [page_indices]
+     else:
+         return [i for i in page_indices if 0 <= i < max_doc_pages]
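
For reference, a minimal sketch of driving the new PdfToMarkdown converter. Only the dataclass fields and the convert() signature come from the code above; the import path of PdfToMarkdown and the way the Chatterer instance is built are not shown in this diff and are assumptions here:

    from chatterer.language_model import Chatterer  # path taken from the diff
    # PdfToMarkdown is the dataclass defined in the new module above; its import
    # path is not named in this diff, so treat this as an illustrative placeholder.

    def pdf_to_markdown(pdf_path: str, chatterer: Chatterer) -> str:
        # Render pages at 2x zoom as PNG and feed each page image plus raw text to the LLM.
        converter = PdfToMarkdown(chatterer=chatterer, image_zoom=2.0, image_format="png")
        # Convert only the first three pages and report progress as pages finish.
        return converter.convert(
            pdf_input=pdf_path,
            page_indices=[0, 1, 2],
            progress_callback=lambda done, total: print(f"{done}/{total} pages converted"),
        )

Because convert() carries the tail of each page's Markdown into the next prompt, the pages are joined with a blank line at the end rather than with explicit page markers.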