chatterer 0.1.17__py3-none-any.whl → 0.1.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. chatterer/__init__.py +93 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__init__.py +0 -0
  5. chatterer/examples/anything_to_markdown.py +95 -0
  6. chatterer/examples/get_code_snippets.py +64 -0
  7. chatterer/examples/login_with_playwright.py +171 -0
  8. chatterer/examples/make_ppt.py +499 -0
  9. chatterer/examples/pdf_to_markdown.py +107 -0
  10. chatterer/examples/pdf_to_text.py +60 -0
  11. chatterer/examples/transcription_api.py +127 -0
  12. chatterer/examples/upstage_parser.py +95 -0
  13. chatterer/examples/webpage_to_markdown.py +79 -0
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +533 -533
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +302 -302
  30. chatterer/tools/convert_to_text.py +447 -447
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +285 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/METADATA +392 -392
  40. chatterer-0.1.19.dist-info/RECORD +44 -0
  41. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/WHEEL +1 -1
  42. chatterer-0.1.19.dist-info/entry_points.txt +10 -0
  43. chatterer-0.1.17.dist-info/RECORD +0 -33
  44. {chatterer-0.1.17.dist-info → chatterer-0.1.19.dist-info}/top_level.txt +0 -0
@@ -1,302 +1,302 @@
1
- from __future__ import annotations
2
-
3
- import logging
4
- import re
5
- from contextlib import contextmanager
6
- from dataclasses import dataclass
7
- from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
8
-
9
- from ..language_model import Chatterer, HumanMessage
10
- from ..utils.base64_image import Base64Image
11
- from ..utils.bytesio import PathOrReadable, read_bytes_stream
12
-
13
- if TYPE_CHECKING:
14
- from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
15
-
16
# Configure root logging once at import time and create this module's logger.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
# Matches fenced blocks (optionally tagged "markdown") and captures their body.
MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", flags=re.DOTALL)
20
-
21
-
22
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    Processes PDFs page by page, providing the LLM with both the extracted raw
    text and a rendered image of the page to handle complex layouts. It maintains
    context between pages by feeding the *tail end* of the previously generated
    Markdown back into the prompt for the next page to ensure smooth transitions.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', 'jpg')."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""
    # max_context_tokens: Optional[int] = None # This can be added later if needed

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of ``markdown_text``.

        Returns None when there is no text, the text is only whitespace, or
        ``context_tail_lines`` is not positive.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # A negative slice start yields the whole list when the text is shorter.
        return "\n".join(lines[-self.context_tail_lines :])

    def _build_instruction(
        self,
        page_text: str,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> str:
        """Build the text part of the prompt for one page.

        Args:
            page_text: Raw text extracted from the page (may be empty).
            previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
            page_number: 0-based page index; shown to the model as a 1-based number.
            total_pages: Total number of pages in the document.

        Returns:
            The complete instruction string sent alongside the page image.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
    * Examine the [End of Previous Page Markdown] if provided.
    * If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
    * For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
    * **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
    * If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{page_text if page_text else "No text extracted from this page."}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # This must be an f-string: without the prefix the model receives
            # the literal text "{page_number + 1}" instead of the page number.
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
        return instruction

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,  # For context, 0-indexed
        total_pages: int = 1,
    ) -> HumanMessage:
        """
        Formats the content list for the HumanMessage input to the LLM.

        The message carries the instruction text followed by the page image.
        Only the tail end of the previous page's markdown is used for context.
        """
        instruction = self._build_instruction(
            page_text=page_text,
            previous_markdown_context_tail=previous_markdown_context_tail,
            page_number=page_number,
            total_pages=total_pages,
        )
        # Structure for multimodal input
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    def _extract_markdown(self, response: str, page_idx: int) -> str:
        """Pull the Markdown payload out of a raw LLM response for one page."""
        fenced_blocks = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
        if fenced_blocks:
            return "\n".join(fenced_blocks)

        # Fallback: assume the whole response is markdown if no fenced blocks were found.
        markdown = response.strip()
        if markdown.startswith("```") and markdown.endswith("```"):
            # Basic cleanup if it just missed the 'markdown' language tag
            markdown = markdown[3:-3].strip()
        elif "```" in markdown:
            logger.warning(
                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
            )
        return markdown

    def convert(
        self,
        pdf_input: Union[str, "Document"],
        page_indices: Optional[Union[Iterable[int], int]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.

        Args:
            pdf_input: Path to the PDF file or a pymupdf.Document object.
            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
                          Can be a single int or an iterable of ints.
            progress_callback: An optional function called with
                          (1-based position of the page just converted, total_pages_to_process)
                          after each successfully converted page. Exceptions it raises are
                          logged and ignored.

        Returns:
            A single string containing the concatenated Markdown output for the processed pages.
            Pages whose conversion fails are logged and skipped.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(_get_page_indices(page_indices, len(doc)))
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # Full markdown of the most recent successfully converted page.
            previous_page_markdown: Optional[str] = None

            # Pre-process all pages up front so the loop only talks to the LLM.
            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            # A for-loop always advances to the next page, so `continue` after a
            # failure skips the page instead of retrying it forever.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=self._get_context_tail(previous_page_markdown),
                        page_number=page_idx,
                        total_pages=len(doc),
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")
                    response = self.chatterer([message])
                    current_page_markdown = self._extract_markdown(response, page_idx)
                    logger.debug(f"Received response from LLM for page index {page_idx}.")
                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                    continue

                # Store the result; the next page's context tail is computed from it.
                full_markdown_output.append(current_page_markdown)
                previous_page_markdown = current_page_markdown

                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            # Join with double newline, then trim leading/trailing whitespace.
            return "\n\n".join(full_markdown_output).strip()
207
-
208
-
209
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied on both axes. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): Quality of JPEG images (1-100). Default is 100.
        page_indices (Iterable[int] | int | None): Pages to render. None renders every page;
            an int renders only that page.

    Returns:
        dict[int, bytes]: Encoded image bytes keyed by page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Same zoom on x and y controls the output resolution

    def _encode(index: int) -> bytes:
        """Rasterize one page and encode it in the requested format."""
        pixmap = get_pixmap(page=doc[index], matrix=scale)
        return bytes(pixmap.tobytes(output=output, jpg_quality=jpg_quality))  # pyright: ignore[reportUnknownArgumentType]

    return {index: _encode(index) for index in _get_page_indices(page_indices, len(doc))}
244
-
245
-
246
def extract_text_from_pdf(
    doc: "Document",
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): Pages to extract. None extracts every page;
            an int extracts only that page.

    Returns:
        dict[int, str]: Whitespace-stripped page text keyed by page index.
    """
    texts: dict[int, str] = {}
    for index in _get_page_indices(page_indices, len(doc)):
        text_page = doc[index].get_textpage()  # pyright: ignore[reportUnknownMemberType]
        texts[index] = text_page.extractText().strip()
    return texts
266
-
267
-
268
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Open a PDF document from a path/readable, or reuse an existing Document.

    Args:
        pdf_input (PathOrReadable | Document): The PDF source, or an already-open pymupdf.Document.

    Yields:
        Document: The open document. A document opened here is closed when the
        context exits, even if the body raises; a Document passed in by the
        caller is left open.

    Raises:
        FileNotFoundError: If no byte stream could be obtained for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: hand it back untouched and never close it.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())
    try:
        yield doc
    finally:
        # An exception from the with-body is re-raised at the yield, so only
        # a finally block guarantees the document is released.
        doc.close()
293
-
294
-
295
- def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
296
- """Helper function to handle page indices for PDF conversion."""
297
- if page_indices is None:
298
- return range(max_doc_pages)
299
- elif isinstance(page_indices, int):
300
- return [page_indices]
301
- else:
302
- return [i for i in page_indices if 0 <= i < max_doc_pages]
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ from contextlib import contextmanager
6
+ from dataclasses import dataclass
7
+ from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
8
+
9
+ from ..language_model import Chatterer, HumanMessage
10
+ from ..utils.base64_image import Base64Image
11
+ from ..utils.bytesio import PathOrReadable, read_bytes_stream
12
+
13
+ if TYPE_CHECKING:
14
+ from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
15
+
16
# Configure root logging once at import time and create this module's logger.
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)
# Matches fenced blocks (optionally tagged "markdown") and captures their body.
MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", flags=re.DOTALL)
20
+
21
+
22
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    Processes PDFs page by page, providing the LLM with both the extracted raw
    text and a rendered image of the page to handle complex layouts. It maintains
    context between pages by feeding the *tail end* of the previously generated
    Markdown back into the prompt for the next page to ensure smooth transitions.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', 'jpg')."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""
    # max_context_tokens: Optional[int] = None # This can be added later if needed

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of ``markdown_text``.

        Returns None when there is no text, the text is only whitespace, or
        ``context_tail_lines`` is not positive.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # A negative slice start yields the whole list when the text is shorter.
        return "\n".join(lines[-self.context_tail_lines :])

    def _build_instruction(
        self,
        page_text: str,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> str:
        """Build the text part of the prompt for one page.

        Args:
            page_text: Raw text extracted from the page (may be empty).
            previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
            page_number: 0-based page index; shown to the model as a 1-based number.
            total_pages: Total number of pages in the document.

        Returns:
            The complete instruction string sent alongside the page image.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
    * Examine the [End of Previous Page Markdown] if provided.
    * If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
    * For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
    * **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
    * If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{page_text if page_text else "No text extracted from this page."}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # This must be an f-string: without the prefix the model receives
            # the literal text "{page_number + 1}" instead of the page number.
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
        return instruction

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,  # For context, 0-indexed
        total_pages: int = 1,
    ) -> HumanMessage:
        """
        Formats the content list for the HumanMessage input to the LLM.

        The message carries the instruction text followed by the page image.
        Only the tail end of the previous page's markdown is used for context.
        """
        instruction = self._build_instruction(
            page_text=page_text,
            previous_markdown_context_tail=previous_markdown_context_tail,
            page_number=page_number,
            total_pages=total_pages,
        )
        # Structure for multimodal input
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    def _extract_markdown(self, response: str, page_idx: int) -> str:
        """Pull the Markdown payload out of a raw LLM response for one page."""
        fenced_blocks = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
        if fenced_blocks:
            return "\n".join(fenced_blocks)

        # Fallback: assume the whole response is markdown if no fenced blocks were found.
        markdown = response.strip()
        if markdown.startswith("```") and markdown.endswith("```"):
            # Basic cleanup if it just missed the 'markdown' language tag
            markdown = markdown[3:-3].strip()
        elif "```" in markdown:
            logger.warning(
                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
            )
        return markdown

    def convert(
        self,
        pdf_input: Union[str, "Document"],
        page_indices: Optional[Union[Iterable[int], int]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.

        Args:
            pdf_input: Path to the PDF file or a pymupdf.Document object.
            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
                          Can be a single int or an iterable of ints.
            progress_callback: An optional function called with
                          (1-based position of the page just converted, total_pages_to_process)
                          after each successfully converted page. Exceptions it raises are
                          logged and ignored.

        Returns:
            A single string containing the concatenated Markdown output for the processed pages.
            Pages whose conversion fails are logged and skipped.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(_get_page_indices(page_indices, len(doc)))
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # Full markdown of the most recent successfully converted page.
            previous_page_markdown: Optional[str] = None

            # Pre-process all pages up front so the loop only talks to the LLM.
            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            # A for-loop always advances to the next page, so `continue` after a
            # failure skips the page instead of retrying it forever.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=self._get_context_tail(previous_page_markdown),
                        page_number=page_idx,
                        total_pages=len(doc),
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")
                    response = self.chatterer([message])
                    current_page_markdown = self._extract_markdown(response, page_idx)
                    logger.debug(f"Received response from LLM for page index {page_idx}.")
                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                    continue

                # Store the result; the next page's context tail is computed from it.
                full_markdown_output.append(current_page_markdown)
                previous_page_markdown = current_page_markdown

                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            # Join with double newline, then trim leading/trailing whitespace.
            return "\n\n".join(full_markdown_output).strip()
207
+
208
+
209
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied on both axes. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): Quality of JPEG images (1-100). Default is 100.
        page_indices (Iterable[int] | int | None): Pages to render. None renders every page;
            an int renders only that page.

    Returns:
        dict[int, bytes]: Encoded image bytes keyed by page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Same zoom on x and y controls the output resolution

    def _encode(index: int) -> bytes:
        """Rasterize one page and encode it in the requested format."""
        pixmap = get_pixmap(page=doc[index], matrix=scale)
        return bytes(pixmap.tobytes(output=output, jpg_quality=jpg_quality))  # pyright: ignore[reportUnknownArgumentType]

    return {index: _encode(index) for index in _get_page_indices(page_indices, len(doc))}
244
+
245
+
246
def extract_text_from_pdf(
    doc: "Document",
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): Pages to extract. None extracts every page;
            an int extracts only that page.

    Returns:
        dict[int, str]: Whitespace-stripped page text keyed by page index.
    """
    texts: dict[int, str] = {}
    for index in _get_page_indices(page_indices, len(doc)):
        text_page = doc[index].get_textpage()  # pyright: ignore[reportUnknownMemberType]
        texts[index] = text_page.extractText().strip()
    return texts
266
+
267
+
268
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Open a PDF document from a path/readable, or reuse an existing Document.

    Args:
        pdf_input (PathOrReadable | Document): The PDF source, or an already-open pymupdf.Document.

    Yields:
        Document: The open document. A document opened here is closed when the
        context exits, even if the body raises; a Document passed in by the
        caller is left open.

    Raises:
        FileNotFoundError: If no byte stream could be obtained for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: hand it back untouched and never close it.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())
    try:
        yield doc
    finally:
        # An exception from the with-body is re-raised at the yield, so only
        # a finally block guarantees the document is released.
        doc.close()
293
+
294
+
295
+ def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
296
+ """Helper function to handle page indices for PDF conversion."""
297
+ if page_indices is None:
298
+ return range(max_doc_pages)
299
+ elif isinstance(page_indices, int):
300
+ return [page_indices]
301
+ else:
302
+ return [i for i in page_indices if 0 <= i < max_doc_pages]