chatterer 0.1.18__py3-none-any.whl → 0.1.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. chatterer/__init__.py +93 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__init__.py +0 -0
  5. chatterer/examples/anything_to_markdown.py +85 -91
  6. chatterer/examples/get_code_snippets.py +55 -62
  7. chatterer/examples/login_with_playwright.py +156 -167
  8. chatterer/examples/make_ppt.py +488 -497
  9. chatterer/examples/pdf_to_markdown.py +100 -107
  10. chatterer/examples/pdf_to_text.py +54 -56
  11. chatterer/examples/transcription_api.py +112 -123
  12. chatterer/examples/upstage_parser.py +89 -100
  13. chatterer/examples/webpage_to_markdown.py +70 -79
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +533 -533
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +393 -302
  30. chatterer/tools/convert_to_text.py +446 -447
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +285 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/METADATA +392 -392
  40. chatterer-0.1.20.dist-info/RECORD +44 -0
  41. {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/WHEEL +1 -1
  42. chatterer-0.1.20.dist-info/entry_points.txt +10 -0
  43. chatterer-0.1.18.dist-info/RECORD +0 -42
  44. {chatterer-0.1.18.dist-info → chatterer-0.1.20.dist-info}/top_level.txt +0 -0
@@ -1,302 +1,393 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- import re
5
- from contextlib import contextmanager
6
- from dataclasses import dataclass
7
- from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
8
-
9
- from ..language_model import Chatterer, HumanMessage
10
- from ..utils.base64_image import Base64Image
11
- from ..utils.bytesio import PathOrReadable, read_bytes_stream
12
-
13
- if TYPE_CHECKING:
14
- from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
15
-
16
- # Module-level setup: configure logging when this module is imported, then create the module logger.
17
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")  # NOTE(review): runs at import time from library code - confirm that configuring logging here is intended
18
- logger = logging.getLogger(__name__)
19
- MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)  # captures the text inside ``` fences, skipping an optional "markdown" tag line; DOTALL lets the body span several lines
20
-
21
-
22
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    Each selected page is sent to the model as its extracted raw text plus a
    rendered image of the page. The last ``context_tail_lines`` lines of the
    Markdown produced for the previously converted page are added to the next
    prompt so that content spanning a page break can be continued smoothly.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of *markdown_text*.

        Returns None when there is no text, when the stripped text has no
        lines, or when ``context_tail_lines`` is zero or negative.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # A negative slice start yields the whole list when the text is shorter than N lines.
        return "\n".join(lines[-self.context_tail_lines :])

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> HumanMessage:
        """Build the multimodal prompt for a single page.

        Args:
            page_text: Raw text extracted from the page; a placeholder sentence is used when empty.
            page_image_b64: Rendered page image; its ``data_uri_content`` is attached to the message.
            previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
            page_number: 0-based page index (shown to the model as ``page_number + 1``).
            total_pages: Page count shown to the model.

        Returns:
            A HumanMessage whose content is ``[instruction_text, image_content]``.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
* Examine the [End of Previous Page Markdown] if provided.
* If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
* For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
* **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
* If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{page_text if page_text else "No text extracted from this page."}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # Must be an f-string: otherwise the model receives the literal text "{page_number + 1}".
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."

        # Structure for multimodal input: text instruction followed by the page image.
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    def convert(
        self,
        pdf_input: Union[str, "Document"],
        page_indices: Optional[Union[Iterable[int], int]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.

        Args:
            pdf_input: Path to the PDF file or a pymupdf.Document object.
            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
                          Can be a single int or an iterable of ints.
            progress_callback: Optional function called as ``(n, total)`` after each page that was
                               converted successfully, where ``n`` is the 1-based position of that
                               page within the selection. Errors raised by the callback are logged
                               and ignored.

        Returns:
            The Markdown of all successfully converted pages joined by blank lines, or an
            empty string when no pages are selected. A page whose conversion raises is
            logged and skipped.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(_get_page_indices(page_indices, len(doc)))
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # Full Markdown of the most recent successfully converted page (source of the context tail).
            previous_page_markdown: Optional[str] = None

            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            # A for-loop always advances to the next page. The previous `while True` + `continue`
            # skipped the index update on failure and retried the same page forever.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    context_tail = self._get_context_tail(previous_page_markdown)

                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=context_tail,
                        page_number=page_idx,
                        total_pages=len(doc),
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")

                    response = self.chatterer([message])
                    # Prefer the contents of fenced blocks; fall back to the raw response otherwise.
                    markdowns: list[str] = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
                    if markdowns:
                        current_page_markdown = "\n".join(markdowns)
                    else:
                        current_page_markdown = response.strip()
                        if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
                            # Strip a bare pair of surrounding fences.
                            current_page_markdown = current_page_markdown[3:-3].strip()
                        elif "```" in current_page_markdown:
                            logger.warning(
                                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
                            )

                    logger.debug(f"Received response from LLM for page index {page_idx}.")

                    full_markdown_output.append(current_page_markdown)
                    previous_page_markdown = current_page_markdown

                except Exception as e:
                    # Best effort: log the failure and move on to the next page.
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                    continue

                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            return "\n\n".join(full_markdown_output).strip()
207
-
208
-
209
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render the selected pages of a PDF document to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied on both axes when rasterizing. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): Pages to render, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, bytes]: Image bytes keyed by page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Same factor on both axes controls output resolution

    def _encode_page(page_no: int) -> bytes:
        """Rasterize one page and encode it in the requested format."""
        pixmap = get_pixmap(page=doc[page_no], matrix=scale)
        return bytes(pixmap.tobytes(output=output, jpg_quality=jpg_quality))  # pyright: ignore[reportUnknownArgumentType]

    return {page_no: _encode_page(page_no) for page_no in _get_page_indices(page_indices, len(doc))}
244
-
245
-
246
def extract_text_from_pdf(
    doc: "Document",
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, str]:
    """Extract the plain text of the selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): Pages to extract, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, str]: Whitespace-stripped page text keyed by page index.
    """
    text_by_page: dict[int, str] = {}
    for page_no in _get_page_indices(page_indices, len(doc)):
        raw_text = doc[page_no].get_textpage().extractText()  # pyright: ignore[reportUnknownMemberType]
        text_by_page[page_no] = raw_text.strip()
    return text_by_page
266
-
267
-
268
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Context manager yielding a pymupdf Document for *pdf_input*.

    Args:
        pdf_input (PathOrReadable | Document): A source accepted by ``read_bytes_stream``
            or an already opened pymupdf.Document.

    Yields:
        Document: The document passed in, or a new document built from the bytes read
        from *pdf_input*.

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` yields no stream for *pdf_input*.

    A Document supplied by the caller is never closed here. A document opened by this
    function is closed on exit, including when the ``with`` body raises.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: hand it through untouched and leave closing to the caller.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    try:
        yield doc
    finally:
        # Without try/finally an exception inside the `with` body would skip the close.
        doc.close()
293
-
294
-
295
- def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
296
- """Helper function to handle page indices for PDF conversion."""
297
- if page_indices is None:
298
- return range(max_doc_pages)
299
- elif isinstance(page_indices, int):
300
- return [page_indices]
301
- else:
302
- return [i for i in page_indices if 0 <= i < max_doc_pages]
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ from contextlib import contextmanager
6
+ from dataclasses import dataclass
7
+ from types import EllipsisType
8
+ from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional
9
+
10
+ from ..language_model import Chatterer, HumanMessage
11
+ from ..utils.base64_image import Base64Image
12
+ from ..utils.bytesio import PathOrReadable, read_bytes_stream
13
+
14
+ if TYPE_CHECKING:
15
+ from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
16
+
17
+ # Module-level setup: configure logging when this module is imported, then create the module logger.
18
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")  # NOTE(review): runs at import time from library code - confirm that configuring logging here is intended
19
+ logger = logging.getLogger(__name__)
20
+ MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)  # captures the text inside ``` fences, skipping an optional "markdown" tag line; DOTALL lets the body span several lines
21
+ PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] | int | str  # page selector: one int, a string, or an iterable of ints and (start, end) pairs where either bound may be ...
22
+
23
+
24
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    Each selected page is sent to the model as its extracted raw text plus a
    rendered image of the page. The last ``context_tail_lines`` lines of the
    Markdown produced for the previously converted page are added to the next
    prompt so that content spanning a page break can be continued smoothly.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of *markdown_text*.

        Returns None when there is no text, when the stripped text has no
        lines, or when ``context_tail_lines`` is zero or negative.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # A negative slice start yields the whole list when the text is shorter than N lines.
        return "\n".join(lines[-self.context_tail_lines :])

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> HumanMessage:
        """Build the multimodal prompt for a single page.

        Args:
            page_text: Raw text extracted from the page; a placeholder sentence is used when empty.
            page_image_b64: Rendered page image; its ``data_uri_content`` is attached to the message.
            previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
            page_number: 0-based page index (shown to the model as ``page_number + 1``).
            total_pages: Page count shown to the model.

        Returns:
            A HumanMessage whose content is ``[instruction_text, image_content]``.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
* Examine the [End of Previous Page Markdown] if provided.
* If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
* For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
* **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
* If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{page_text if page_text else "No text extracted from this page."}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # Must be an f-string: otherwise the model receives the literal text "{page_number + 1}".
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."

        # Structure for multimodal input: text instruction followed by the page image.
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    def convert(
        self,
        pdf_input: "Document | PathOrReadable",
        page_indices: Optional[PageIndexType] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.

        Args:
            pdf_input: Path to the PDF file (or other readable source) or a pymupdf.Document object.
            page_indices: 0-based page selection. If None, converts all pages. May be a single
                          int, a string, or an iterable of ints and ``(start, end)`` tuples; the
                          selection is resolved by ``_get_page_indices``.
            progress_callback: Optional function called as ``(n, total)`` after each page that was
                               converted successfully, where ``n`` is the 1-based position of that
                               page within the selection. Errors raised by the callback are logged
                               and ignored.

        Returns:
            The Markdown of all successfully converted pages joined by blank lines, or an
            empty string when no pages are selected. A page whose conversion raises is
            logged and skipped.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(
                _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
            )
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # Full Markdown of the most recent successfully converted page (source of the context tail).
            previous_page_markdown: Optional[str] = None

            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            # A for-loop always advances to the next page. The previous `while True` + `continue`
            # skipped the index update on failure and retried the same page forever.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    context_tail = self._get_context_tail(previous_page_markdown)

                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=context_tail,
                        page_number=page_idx,
                        total_pages=len(doc),
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")

                    response = self.chatterer([message])
                    # Prefer the contents of fenced blocks; fall back to the raw response otherwise.
                    markdowns: list[str] = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
                    if markdowns:
                        current_page_markdown = "\n".join(markdowns)
                    else:
                        current_page_markdown = response.strip()
                        if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
                            # Strip a bare pair of surrounding fences.
                            current_page_markdown = current_page_markdown[3:-3].strip()
                        elif "```" in current_page_markdown:
                            logger.warning(
                                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
                            )

                    logger.debug(f"Received response from LLM for page index {page_idx}.")

                    full_markdown_output.append(current_page_markdown)
                    previous_page_markdown = current_page_markdown

                except Exception as e:
                    # Best effort: log the failure and move on to the next page.
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                    continue

                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            return "\n\n".join(full_markdown_output).strip()
211
+
212
+
213
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render the selected pages of a PDF document to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied on both axes when rasterizing. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): Zero-based pages to render, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, bytes]: Image bytes keyed by zero-based page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Same factor on both axes controls output resolution
    selected_pages = _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)

    def _encode_page(page_no: int) -> bytes:
        """Rasterize one page and encode it in the requested format."""
        pixmap = get_pixmap(page=doc[page_no], matrix=scale)
        return bytes(pixmap.tobytes(output=output, jpg_quality=jpg_quality))  # pyright: ignore[reportUnknownArgumentType]

    return {page_no: _encode_page(page_no) for page_no in selected_pages}
248
+
249
+
250
def extract_text_from_pdf(doc: "Document", page_indices: Optional[PageIndexType] = None) -> dict[int, str]:
    """Extract the plain text of the selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices (PageIndexType | None): Zero-based page selection, resolved by
            ``_get_page_indices``. None selects every page.

    Returns:
        dict[int, str]: Whitespace-stripped page text keyed by zero-based page index.
    """
    selected_pages = _get_page_indices(
        page_indices=page_indices,
        max_doc_pages=len(doc),
        is_input_zero_based=True,
    )
    text_by_page: dict[int, str] = {}
    for page_no in selected_pages:
        raw_text = doc[page_no].get_textpage().extractText()  # pyright: ignore[reportUnknownMemberType]
        text_by_page[page_no] = raw_text.strip()
    return text_by_page
271
+
272
+
273
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Context manager yielding a pymupdf Document for *pdf_input*.

    Args:
        pdf_input (PathOrReadable | Document): A source accepted by ``read_bytes_stream``
            or an already opened pymupdf.Document.

    Yields:
        Document: The document passed in, or a new document built from the bytes read
        from *pdf_input*.

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` yields no stream for *pdf_input*.

    A Document supplied by the caller is never closed here. A document opened by this
    function is closed on exit, including when the ``with`` body raises.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: hand it through untouched and leave closing to the caller.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    try:
        yield doc
    finally:
        # Without try/finally an exception inside the `with` body would skip the close.
        doc.close()
298
+
299
+
300
+ def _get_page_indices(
301
+ page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
302
+ ) -> list[int]:
303
+ """Helper function to handle page indices for PDF conversion."""
304
+
305
+ def _to_zero_based_int(idx: int) -> int:
306
+ """Convert a 1-based index to a 0-based index if necessary."""
307
+ if is_input_zero_based:
308
+ return idx
309
+ else:
310
+ if idx < 1 or idx > max_doc_pages:
311
+ raise ValueError(f"Index {idx} is out of bounds for document with {max_doc_pages} pages (1-based).")
312
+ return idx - 1
313
+
314
+ if page_indices is None:
315
+ return list(range(max_doc_pages)) # Convert all pages
316
+ elif isinstance(page_indices, int):
317
+ # Handle single integer input for page index
318
+ return [_to_zero_based_int(page_indices)]
319
+ elif isinstance(page_indices, str):
320
+ # Handle string input for page indices
321
+ return _interpret_index_string(
322
+ index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
323
+ )
324
+ else:
325
+ # Handle iterable input for page indices
326
+ indices: set[int] = set()
327
+ for idx in page_indices:
328
+ if isinstance(idx, int):
329
+ indices.add(_to_zero_based_int(idx))
330
+ else:
331
+ start, end = idx
332
+ if isinstance(start, EllipsisType):
333
+ start = 0
334
+ else:
335
+ start = _to_zero_based_int(start)
336
+
337
+ if isinstance(end, EllipsisType):
338
+ end = max_doc_pages - 1
339
+ else:
340
+ end = _to_zero_based_int(end)
341
+
342
+ if start > end:
343
+ raise ValueError(
344
+ f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
345
+ )
346
+ indices.update(range(start, end + 1))
347
+
348
+ return sorted(indices) # Return sorted list of indices
349
+
350
+
351
+ def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
352
+ """Interpret a string of comma-separated indices and ranges."""
353
+
354
+ def _to_zero_based_int(idx_str: str) -> int:
355
+ i = int(idx_str)
356
+ if is_input_zero_based:
357
+ if i < 0 or i >= max_doc_pages:
358
+ raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages.")
359
+ return i
360
+ else:
361
+ if i < 1 or i > max_doc_pages:
362
+ raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages (1-based).")
363
+ return i - 1 # Convert to zero-based index
364
+
365
+ indices: set[int] = set()
366
+ for part in index_str.split(","):
367
+ part: str = part.strip()
368
+ count_dash: int = part.count("-")
369
+ if count_dash == 0:
370
+ indices.add(_to_zero_based_int(part))
371
+ elif count_dash == 1:
372
+ idx_dash: int = part.index("-")
373
+ start = part[:idx_dash].strip()
374
+ end = part[idx_dash + 1 :].strip()
375
+ if not start:
376
+ start = _to_zero_based_int("0") # Default to 0 if no start index is provided
377
+ else:
378
+ start = _to_zero_based_int(start)
379
+
380
+ if not end:
381
+ end = _to_zero_based_int(str(max_doc_pages - 1)) # Default to last page if no end index is provided
382
+ else:
383
+ end = _to_zero_based_int(end)
384
+
385
+ if start > end:
386
+ raise ValueError(
387
+ f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
388
+ )
389
+ indices.update(range(start, end + 1))
390
+ else:
391
+ raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")
392
+
393
+ return sorted(indices) # Return sorted list of indices, ensuring no duplicates