chatterer 0.1.24__py3-none-any.whl → 0.1.25__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. chatterer/__init__.py +97 -93
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/examples/__main__.py +75 -75
  5. chatterer/examples/any2md.py +85 -85
  6. chatterer/examples/pdf2md.py +338 -338
  7. chatterer/examples/pdf2txt.py +54 -54
  8. chatterer/examples/ppt.py +486 -486
  9. chatterer/examples/pw.py +143 -137
  10. chatterer/examples/snippet.py +56 -55
  11. chatterer/examples/transcribe.py +192 -112
  12. chatterer/examples/upstage.py +89 -89
  13. chatterer/examples/web2md.py +80 -66
  14. chatterer/interactive.py +354 -354
  15. chatterer/language_model.py +536 -536
  16. chatterer/messages.py +21 -21
  17. chatterer/strategies/__init__.py +13 -13
  18. chatterer/strategies/atom_of_thoughts.py +975 -975
  19. chatterer/strategies/base.py +14 -14
  20. chatterer/tools/__init__.py +46 -46
  21. chatterer/tools/caption_markdown_images.py +384 -384
  22. chatterer/tools/citation_chunking/__init__.py +3 -3
  23. chatterer/tools/citation_chunking/chunks.py +53 -53
  24. chatterer/tools/citation_chunking/citation_chunker.py +118 -118
  25. chatterer/tools/citation_chunking/citations.py +285 -285
  26. chatterer/tools/citation_chunking/prompt.py +157 -157
  27. chatterer/tools/citation_chunking/reference.py +26 -26
  28. chatterer/tools/citation_chunking/utils.py +138 -138
  29. chatterer/tools/convert_pdf_to_markdown.py +645 -625
  30. chatterer/tools/convert_to_text.py +446 -446
  31. chatterer/tools/upstage_document_parser.py +705 -705
  32. chatterer/tools/webpage_to_markdown.py +739 -739
  33. chatterer/tools/youtube.py +146 -146
  34. chatterer/utils/__init__.py +15 -15
  35. chatterer/utils/base64_image.py +293 -285
  36. chatterer/utils/bytesio.py +59 -59
  37. chatterer/utils/code_agent.py +237 -237
  38. chatterer/utils/imghdr.py +148 -148
  39. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/METADATA +390 -389
  40. chatterer-0.1.25.dist-info/RECORD +45 -0
  41. chatterer-0.1.24.dist-info/RECORD +0 -45
  42. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/WHEEL +0 -0
  43. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/entry_points.txt +0 -0
  44. {chatterer-0.1.24.dist-info → chatterer-0.1.25.dist-info}/top_level.txt +0 -0
@@ -1,625 +1,645 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import logging
5
- import re
6
- from contextlib import contextmanager
7
- from dataclasses import dataclass
8
- from types import EllipsisType
9
- from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional
10
-
11
- from ..language_model import Chatterer, HumanMessage
12
- from ..utils.base64_image import Base64Image
13
- from ..utils.bytesio import PathOrReadable, read_bytes_stream
14
-
15
- if TYPE_CHECKING:
16
- from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
17
-
18
- # Setup basic logging
19
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
- logger = logging.getLogger(__name__)
21
- MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
22
- PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] | int | str
23
-
24
-
25
- @dataclass
26
- class PdfToMarkdown:
27
- """
28
- Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
29
-
30
- This class supports both sequential and parallel processing:
31
- - Sequential processing preserves strict page continuity using previous page context
32
- - Parallel processing enables faster conversion for large documents by using
33
- previous page image and text for context instead of generated markdown
34
- """
35
-
36
- chatterer: Chatterer
37
- """An instance of the Chatterer class configured with a vision-capable model."""
38
- image_zoom: float = 2.0
39
- """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
40
- image_format: Literal["jpg", "jpeg", "png"] = "png"
41
- """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
42
- image_jpg_quality: int = 95
43
- """Quality for JPEG images (if used)."""
44
- context_tail_lines: int = 10
45
- """Number of lines from the end of the previous page's Markdown to use as context (sequential mode only)."""
46
-
47
- def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
48
- """Extracts the last N lines from the given markdown text."""
49
- if not markdown_text or self.context_tail_lines <= 0:
50
- return None
51
- lines = markdown_text.strip().splitlines()
52
- if not lines:
53
- return None
54
- tail_lines = lines[-self.context_tail_lines :]
55
- return "\n".join(tail_lines)
56
-
57
- def _format_prompt_content_sequential(
58
- self,
59
- page_text: str,
60
- page_image_b64: Base64Image,
61
- previous_markdown_context_tail: Optional[str] = None,
62
- page_number: int = 0,
63
- total_pages: int = 1,
64
- ) -> HumanMessage:
65
- """
66
- Formats the content for sequential processing using previous page's markdown context.
67
- """
68
- instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.
69
-
70
- **Input provided:**
71
- 1. **Raw Text**: Extracted text from the PDF page (may contain OCR errors)
72
- 2. **Page Image**: Visual rendering of the page showing actual layout
73
- 3. **Previous Context**: End portion of the previous page's generated Markdown (if available)
74
-
75
- **Conversion Rules:**
76
- • **Text Structure**: Use the image to understand the actual layout and fix any OCR errors in the raw text
77
- • **Headings**: Use appropriate heading levels (# ## ### etc.) based on visual hierarchy
78
- • **Lists**: Convert to proper Markdown lists (- or 1. 2. 3.) maintaining structure
79
- • **Tables**: Convert to Markdown table format using | pipes |
80
- • **Images/Diagrams**: Describe significant visual elements as: `<details><summary>Figure: Brief title</summary>Detailed description based on what you see in the image</details>`
81
- • **Code/Formulas**: Use ``` code blocks ``` or LaTeX $$ math $$ as appropriate
82
- • **Continuity**: If previous context shows incomplete content (mid-sentence, list, table), seamlessly continue from that point
83
- • **NO REPETITION**: Never repeat content from the previous context - only generate new content for this page
84
-
85
- **Raw Text:**
86
- ```
87
- {page_text if page_text else "No text extracted from this page."}
88
- ```
89
-
90
- **Page Image:** (attached)
91
- """
92
-
93
- if previous_markdown_context_tail:
94
- instruction += f"""
95
- **Previous Page Context (DO NOT REPEAT):**
96
- ```markdown
97
- ... (previous page ended with) ...
98
- {previous_markdown_context_tail}
99
- ```
100
-
101
- Continue seamlessly from the above context if the current page content flows from it.
102
- """
103
- else:
104
- instruction += "\n**Note:** This is the first page or start of a new section."
105
-
106
- instruction += "\n\n**Output only the Markdown content for the current page. Ensure proper formatting and NO repetition of previous content.**"
107
-
108
- return HumanMessage(content=[instruction, page_image_b64.data_uri_content])
109
-
110
- def _format_prompt_content_parallel(
111
- self,
112
- page_text: str,
113
- page_image_b64: Base64Image,
114
- previous_page_text: Optional[str] = None,
115
- previous_page_image_b64: Optional[Base64Image] = None,
116
- page_number: int = 0,
117
- total_pages: int = 1,
118
- ) -> HumanMessage:
119
- """
120
- Formats the content for parallel processing using previous page's raw data.
121
- """
122
- instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.
123
-
124
- **Task**: Convert the current page to Markdown while maintaining proper continuity with the previous page.
125
-
126
- **Current Page Data:**
127
- - **Raw Text**: Extracted text (may have OCR errors - use image to verify)
128
- - **Page Image**: Visual rendering showing actual layout
129
-
130
- **Previous Page Data** (for context only):
131
- - **Previous Raw Text**: Text from the previous page
132
- - **Previous Page Image**: Visual of the previous page
133
-
134
- **Conversion Instructions:**
135
- 1. **Primary Focus**: Convert the CURRENT page content accurately
136
- 2. **Continuity Check**:
137
- - Examine if the current page continues content from the previous page (sentences, paragraphs, lists, tables)
138
- - If yes, start your Markdown naturally continuing that content
139
- - If no, start fresh with proper heading/structure
140
- 3. **Format Rules**:
141
- - Use image to fix OCR errors and understand layout
142
- - Convert headings to # ## ### based on visual hierarchy
143
- - Convert lists to proper Markdown (- or 1. 2. 3.)
144
- - Convert tables to | pipe | format
145
- - Describe significant images/charts as: `<details><summary>Figure: Title</summary>Description</details>`
146
- - Use ``` for code blocks and $$ for math formulas
147
-
148
- **Current Page Raw Text:**
149
- ```
150
- {page_text if page_text else "No text extracted from this page."}
151
- ```
152
-
153
- **Current Page Image:** (see first attached image)
154
- """
155
-
156
- content = [instruction, page_image_b64.data_uri_content]
157
-
158
- if previous_page_text is not None and previous_page_image_b64 is not None:
159
- instruction += f"""
160
-
161
- **Previous Page Raw Text (for context):**
162
- ```
163
- {previous_page_text if previous_page_text else "No text from previous page."}
164
- ```
165
-
166
- **Previous Page Image:** (see second attached image)
167
- """
168
- content.append(previous_page_image_b64.data_uri_content)
169
- else:
170
- instruction += "\n**Note:** This is the first page - no previous context available."
171
-
172
- instruction += "\n\n**Generate ONLY the Markdown for the current page. Ensure proper continuity and formatting.**"
173
- content[0] = instruction
174
-
175
- return HumanMessage(content=content)
176
-
177
- def convert(
178
- self,
179
- pdf_input: "Document | PathOrReadable",
180
- page_indices: Optional[PageIndexType] = None,
181
- progress_callback: Optional[Callable[[int, int], None]] = None,
182
- mode: Literal["sequential", "parallel"] = "sequential",
183
- ) -> str:
184
- """
185
- Converts a PDF document to Markdown synchronously.
186
-
187
- Args:
188
- pdf_input: Path to PDF file or pymupdf.Document object
189
- page_indices: Specific page indices to convert (0-based). If None, converts all pages
190
- progress_callback: Optional callback function called with (current_page, total_pages)
191
- mode: "sequential" for strict continuity or "parallel" for independent page processing
192
-
193
- Returns:
194
- Concatenated Markdown string for all processed pages
195
- """
196
- if mode == "sequential":
197
- return self._convert_sequential(pdf_input, page_indices, progress_callback)
198
- else:
199
- return self._convert_parallel_sync(pdf_input, page_indices, progress_callback)
200
-
201
- async def aconvert(
202
- self,
203
- pdf_input: "Document | PathOrReadable",
204
- page_indices: Optional[PageIndexType] = None,
205
- progress_callback: Optional[Callable[[int, int], None]] = None,
206
- max_concurrent: int = 5,
207
- ) -> str:
208
- """
209
- Converts a PDF document to Markdown asynchronously with parallel processing.
210
-
211
- Args:
212
- pdf_input: Path to PDF file or pymupdf.Document object
213
- page_indices: Specific page indices to convert (0-based). If None, converts all pages
214
- progress_callback: Optional callback function called with (current_page, total_pages)
215
- max_concurrent: Maximum number of concurrent LLM requests
216
-
217
- Returns:
218
- Concatenated Markdown string for all processed pages
219
- """
220
- with open_pdf(pdf_input) as doc:
221
- target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
222
- total_pages_to_process = len(target_page_indices)
223
-
224
- if total_pages_to_process == 0:
225
- logger.warning("No pages selected for processing.")
226
- return ""
227
-
228
- logger.info(f"Starting parallel Markdown conversion for {total_pages_to_process} pages...")
229
-
230
- # Pre-process all pages
231
- page_text_dict = extract_text_from_pdf(doc, target_page_indices)
232
- page_image_dict = render_pdf_as_image(
233
- doc,
234
- page_indices=target_page_indices,
235
- zoom=self.image_zoom,
236
- output=self.image_format,
237
- jpg_quality=self.image_jpg_quality,
238
- )
239
-
240
- # Process pages in parallel with semaphore for concurrency control
241
- semaphore = asyncio.Semaphore(max_concurrent)
242
-
243
- async def process_page(i: int, page_idx: int) -> tuple[int, str]:
244
- async with semaphore:
245
- logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
246
-
247
- try:
248
- # Get previous page data for context
249
- prev_page_idx = target_page_indices[i - 1] if i > 0 else None
250
- previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
251
- previous_page_image_b64 = None
252
- if prev_page_idx is not None:
253
- previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
254
-
255
- message = self._format_prompt_content_parallel(
256
- page_text=page_text_dict.get(page_idx, ""),
257
- page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
258
- previous_page_text=previous_page_text,
259
- previous_page_image_b64=previous_page_image_b64,
260
- page_number=page_idx,
261
- total_pages=len(doc),
262
- )
263
-
264
- response = await self.chatterer.agenerate([message])
265
-
266
- # Extract markdown
267
- markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
268
- if markdowns:
269
- current_page_markdown = "\n".join(markdowns)
270
- else:
271
- current_page_markdown = response.strip()
272
- if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
273
- current_page_markdown = current_page_markdown[3:-3].strip()
274
-
275
- logger.debug(f"Completed processing page {i + 1}/{total_pages_to_process}")
276
-
277
- # Call progress callback if provided
278
- if progress_callback:
279
- try:
280
- progress_callback(i + 1, total_pages_to_process)
281
- except Exception as cb_err:
282
- logger.warning(f"Progress callback failed: {cb_err}")
283
-
284
- return (i, current_page_markdown)
285
-
286
- except Exception as e:
287
- logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
288
- return (i, f"<!-- Error processing page {page_idx + 1}: {str(e)} -->")
289
-
290
- # Execute all page processing tasks
291
-
292
- tasks = [process_page(i, page_idx) for i, page_idx in enumerate(target_page_indices)]
293
- results = await asyncio.gather(*tasks, return_exceptions=True)
294
-
295
- # Sort results by original page order and extract markdown
296
- markdown_results = [""] * total_pages_to_process
297
- for result in results:
298
- if isinstance(result, Exception):
299
- logger.error(f"Task failed with exception: {result}")
300
- continue
301
- if isinstance(result, tuple) and len(result) == 2:
302
- page_order, markdown = result
303
- markdown_results[page_order] = markdown
304
- else:
305
- logger.error(f"Unexpected result format: {result}")
306
-
307
- return "\n\n".join(markdown_results).strip()
308
-
309
- def _convert_sequential(
310
- self,
311
- pdf_input: "Document | PathOrReadable",
312
- page_indices: Optional[PageIndexType] = None,
313
- progress_callback: Optional[Callable[[int, int], None]] = None,
314
- ) -> str:
315
- """Sequential conversion maintaining strict page continuity."""
316
- with open_pdf(pdf_input) as doc:
317
- target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
318
- total_pages_to_process = len(target_page_indices)
319
- if total_pages_to_process == 0:
320
- logger.warning("No pages selected for processing.")
321
- return ""
322
-
323
- full_markdown_output: List[str] = []
324
- previous_page_markdown: Optional[str] = None
325
-
326
- # Pre-process all pages
327
- logger.info("Extracting text and rendering images for selected pages...")
328
- page_text_dict = extract_text_from_pdf(doc, target_page_indices)
329
- page_image_dict = render_pdf_as_image(
330
- doc,
331
- page_indices=target_page_indices,
332
- zoom=self.image_zoom,
333
- output=self.image_format,
334
- jpg_quality=self.image_jpg_quality,
335
- )
336
- logger.info(f"Starting sequential Markdown conversion for {total_pages_to_process} pages...")
337
-
338
- for i, page_idx in enumerate(target_page_indices):
339
- logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
340
- try:
341
- context_tail = self._get_context_tail(previous_page_markdown)
342
-
343
- message = self._format_prompt_content_sequential(
344
- page_text=page_text_dict.get(page_idx, ""),
345
- page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
346
- previous_markdown_context_tail=context_tail,
347
- page_number=page_idx,
348
- total_pages=len(doc),
349
- )
350
-
351
- response = self.chatterer.generate([message])
352
-
353
- # Extract markdown
354
- markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
355
- if markdowns:
356
- current_page_markdown = "\n".join(markdowns)
357
- else:
358
- current_page_markdown = response.strip()
359
- if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
360
- current_page_markdown = current_page_markdown[3:-3].strip()
361
-
362
- full_markdown_output.append(current_page_markdown)
363
- previous_page_markdown = current_page_markdown
364
-
365
- except Exception as e:
366
- logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
367
- continue
368
-
369
- # Progress callback
370
- if progress_callback:
371
- try:
372
- progress_callback(i + 1, total_pages_to_process)
373
- except Exception as cb_err:
374
- logger.warning(f"Progress callback failed: {cb_err}")
375
-
376
- return "\n\n".join(full_markdown_output).strip()
377
-
378
- def _convert_parallel_sync(
379
- self,
380
- pdf_input: "Document | PathOrReadable",
381
- page_indices: Optional[PageIndexType] = None,
382
- progress_callback: Optional[Callable[[int, int], None]] = None,
383
- ) -> str:
384
- """Synchronous parallel-style conversion (processes independently but sequentially)."""
385
- with open_pdf(pdf_input) as doc:
386
- target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
387
- total_pages_to_process = len(target_page_indices)
388
- if total_pages_to_process == 0:
389
- logger.warning("No pages selected for processing.")
390
- return ""
391
-
392
- logger.info(f"Starting parallel-style Markdown conversion for {total_pages_to_process} pages...")
393
-
394
- # Pre-process all pages
395
- page_text_dict = extract_text_from_pdf(doc, target_page_indices)
396
- page_image_dict = render_pdf_as_image(
397
- doc,
398
- page_indices=target_page_indices,
399
- zoom=self.image_zoom,
400
- output=self.image_format,
401
- jpg_quality=self.image_jpg_quality,
402
- )
403
-
404
- full_markdown_output: List[str] = []
405
-
406
- for i, page_idx in enumerate(target_page_indices):
407
- logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
408
-
409
- try:
410
- # Get previous page data for context
411
- prev_page_idx = target_page_indices[i - 1] if i > 0 else None
412
- previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
413
- previous_page_image_b64 = None
414
- if prev_page_idx is not None:
415
- previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
416
-
417
- message = self._format_prompt_content_parallel(
418
- page_text=page_text_dict.get(page_idx, ""),
419
- page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
420
- previous_page_text=previous_page_text,
421
- previous_page_image_b64=previous_page_image_b64,
422
- page_number=page_idx,
423
- total_pages=len(doc),
424
- )
425
-
426
- response = self.chatterer.generate([message])
427
-
428
- # Extract markdown
429
- markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
430
- if markdowns:
431
- current_page_markdown = "\n".join(markdowns)
432
- else:
433
- current_page_markdown = response.strip()
434
- if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
435
- current_page_markdown = current_page_markdown[3:-3].strip()
436
-
437
- full_markdown_output.append(current_page_markdown)
438
-
439
- except Exception as e:
440
- logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
441
- continue
442
-
443
- # Progress callback
444
- if progress_callback:
445
- try:
446
- progress_callback(i + 1, total_pages_to_process)
447
- except Exception as cb_err:
448
- logger.warning(f"Progress callback failed: {cb_err}")
449
-
450
- return "\n\n".join(full_markdown_output).strip()
451
-
452
-
453
- def render_pdf_as_image(
454
- doc: "Document",
455
- zoom: float = 2.0,
456
- output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
457
- jpg_quality: int = 100,
458
- page_indices: Iterable[int] | int | None = None,
459
- ) -> dict[int, bytes]:
460
- """
461
- Convert PDF pages to images in bytes.
462
-
463
- Args:
464
- doc (Document): The PDF document to convert.
465
- zoom (float): Zoom factor for the image resolution. Default is 2.0.
466
- output (str): Output format for the image. Default is 'png'.
467
- jpg_quality (int): Quality of JPEG images (1-100). Default is 100.
468
- page_indices (Iterable[int] | int | None): Specific pages to convert. If None, all pages are converted.
469
- If an int is provided, only that page is converted.
470
-
471
- Returns:
472
- dict[int, bytes]: A dictionary mapping page numbers to image bytes.
473
- """
474
- from pymupdf import Matrix # pyright: ignore[reportMissingTypeStubs]
475
- from pymupdf.utils import get_pixmap # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]
476
-
477
- images_bytes: dict[int, bytes] = {}
478
- matrix = Matrix(zoom, zoom) # Control output resolution
479
- for page_idx in _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True):
480
- img_bytes = bytes(
481
- get_pixmap(
482
- page=doc[page_idx],
483
- matrix=matrix,
484
- ).tobytes(output=output, jpg_quality=jpg_quality) # pyright: ignore[reportUnknownArgumentType]
485
- )
486
- images_bytes[page_idx] = img_bytes
487
- return images_bytes
488
-
489
-
490
- def extract_text_from_pdf(doc: "Document", page_indices: Optional[PageIndexType] = None) -> dict[int, str]:
491
- """Convert a PDF file to plain text.
492
-
493
- Extracts text from each page of a PDF file and formats it with page markers.
494
-
495
- Args:
496
- doc (Document): The PDF document to convert.
497
- page_indices (Iterable[int] | int | None): Specific pages to convert. If None, all pages are converted.
498
- If an int is provided, only that page is converted.
499
-
500
- Returns:
501
- dict[int, str]: A dictionary mapping page numbers to text content.
502
- """
503
- return {
504
- page_idx: doc[page_idx].get_textpage().extractText().strip() # pyright: ignore[reportUnknownMemberType]
505
- for page_idx in _get_page_indices(
506
- page_indices=page_indices,
507
- max_doc_pages=len(doc),
508
- is_input_zero_based=True,
509
- )
510
- }
511
-
512
-
513
- @contextmanager
514
- def open_pdf(pdf_input: PathOrReadable | Document):
515
- """Open a PDF document from a file path or use an existing Document object.
516
-
517
- Args:
518
- pdf_input (PathOrReadable | Document): The PDF file path or a pymupdf.Document object.
519
-
520
- Returns:
521
- tuple[Document, bool]: A tuple containing the opened Document object and a boolean indicating if it was opened internally.
522
- """
523
- import pymupdf # pyright: ignore[reportMissingTypeStubs]
524
-
525
- should_close = True
526
-
527
- if isinstance(pdf_input, pymupdf.Document):
528
- should_close = False
529
- doc = pdf_input
530
- else:
531
- with read_bytes_stream(pdf_input) as stream:
532
- if stream is None:
533
- raise FileNotFoundError(pdf_input)
534
- doc = pymupdf.Document(stream=stream.read())
535
- yield doc
536
- if should_close:
537
- doc.close()
538
-
539
-
540
- def _get_page_indices(page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
541
- """Helper function to handle page indices for PDF conversion."""
542
-
543
- def _to_zero_based_int(idx: int) -> int:
544
- """Convert a 1-based index to a 0-based index if necessary."""
545
- if is_input_zero_based:
546
- return idx
547
- else:
548
- if idx < 1 or idx > max_doc_pages:
549
- raise ValueError(f"Index {idx} is out of bounds for document with {max_doc_pages} pages (1-based).")
550
- return idx - 1
551
-
552
- if page_indices is None:
553
- return list(range(max_doc_pages)) # Convert all pages
554
- elif isinstance(page_indices, int):
555
- # Handle single integer input for page index
556
- return [_to_zero_based_int(page_indices)]
557
- elif isinstance(page_indices, str):
558
- # Handle string input for page indices
559
- return _interpret_index_string(index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based)
560
- else:
561
- # Handle iterable input for page indices
562
- indices: set[int] = set()
563
- for idx in page_indices:
564
- if isinstance(idx, int):
565
- indices.add(_to_zero_based_int(idx))
566
- else:
567
- start, end = idx
568
- if isinstance(start, EllipsisType):
569
- start = 0
570
- else:
571
- start = _to_zero_based_int(start)
572
-
573
- if isinstance(end, EllipsisType):
574
- end = max_doc_pages - 1
575
- else:
576
- end = _to_zero_based_int(end)
577
-
578
- if start > end:
579
- raise ValueError(f"Invalid range: {start} - {end}. Start index must be less than or equal to end index.")
580
- indices.update(range(start, end + 1))
581
-
582
- return sorted(indices) # Return sorted list of indices
583
-
584
-
585
- def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
586
- """Interpret a string of comma-separated indices and ranges."""
587
-
588
- def _to_zero_based_int(idx_str: str) -> int:
589
- i = int(idx_str)
590
- if is_input_zero_based:
591
- if i < 0 or i >= max_doc_pages:
592
- raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages.")
593
- return i
594
- else:
595
- if i < 1 or i > max_doc_pages:
596
- raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages (1-based).")
597
- return i - 1 # Convert to zero-based index
598
-
599
- indices: set[int] = set()
600
- for part in index_str.split(","):
601
- part: str = part.strip()
602
- count_dash: int = part.count("-")
603
- if count_dash == 0:
604
- indices.add(_to_zero_based_int(part))
605
- elif count_dash == 1:
606
- idx_dash: int = part.index("-")
607
- start = part[:idx_dash].strip()
608
- end = part[idx_dash + 1 :].strip()
609
- if not start:
610
- start = _to_zero_based_int("0") # Default to 0 if no start index is provided
611
- else:
612
- start = _to_zero_based_int(start)
613
-
614
- if not end:
615
- end = _to_zero_based_int(str(max_doc_pages - 1)) # Default to last page if no end index is provided
616
- else:
617
- end = _to_zero_based_int(end)
618
-
619
- if start > end:
620
- raise ValueError(f"Invalid range: {start} - {end}. Start index must be less than or equal to end index.")
621
- indices.update(range(start, end + 1))
622
- else:
623
- raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")
624
-
625
- return sorted(indices) # Return sorted list of indices, ensuring no duplicates
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ import logging
5
+ import re
6
+ from contextlib import contextmanager
7
+ from dataclasses import dataclass
8
+ from types import EllipsisType
9
+ from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional
10
+
11
+ from ..language_model import Chatterer, HumanMessage
12
+ from ..utils.base64_image import Base64Image
13
+ from ..utils.bytesio import PathOrReadable, read_bytes_stream
14
+
15
+ if TYPE_CHECKING:
16
+ from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
17
+
18
+ # Setup basic logging
19
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
+ logger = logging.getLogger(__name__)
21
+ MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
22
+ PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] | int | str
23
+
24
+
25
+ @dataclass
26
+ class PdfToMarkdown:
27
+ """
28
+ Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
29
+
30
+ This class supports both sequential and parallel processing:
31
+ - Sequential processing preserves strict page continuity using previous page context
32
+ - Parallel processing enables faster conversion for large documents by using
33
+ previous page image and text for context instead of generated markdown
34
+ """
35
+
36
+ chatterer: Chatterer
37
+ """An instance of the Chatterer class configured with a vision-capable model."""
38
+ image_zoom: float = 2.0
39
+ """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
40
+ image_format: Literal["jpg", "jpeg", "png"] = "png"
41
+ """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
42
+ image_jpg_quality: int = 95
43
+ """Quality for JPEG images (if used)."""
44
+ context_tail_lines: int = 10
45
+ """Number of lines from the end of the previous page's Markdown to use as context (sequential mode only)."""
46
+
47
+ def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
48
+ """Extracts the last N lines from the given markdown text."""
49
+ if not markdown_text or self.context_tail_lines <= 0:
50
+ return None
51
+ lines = markdown_text.strip().splitlines()
52
+ if not lines:
53
+ return None
54
+ tail_lines = lines[-self.context_tail_lines :]
55
+ return "\n".join(tail_lines)
56
+
57
def _format_prompt_content_sequential(
    self,
    page_text: str,
    page_image_b64: Base64Image,
    previous_markdown_context_tail: Optional[str] = None,
    page_number: int = 0,
    total_pages: int = 1,
) -> HumanMessage:
    """
    Formats the content for sequential processing using previous page's markdown context.

    Args:
        page_text: Raw text extracted from the current page. When empty, a placeholder sentence
            is put into the prompt instead.
        page_image_b64: Rendered image of the current page; attached as the second content part.
        previous_markdown_context_tail: Tail of the Markdown generated for the previous page.
            When falsy, the prompt notes that this is the first page or the start of a new section.
        page_number: Page index; displayed in the prompt as ``page_number + 1``.
        total_pages: Total page count displayed in the prompt.

    Returns:
        A HumanMessage whose content is ``[instruction_text, image_content_dict]``.
    """
    # Base instruction: role, description of the inputs, conversion rules, and the raw page text.
    instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.

**Input provided:**
1. **Raw Text**: Extracted text from the PDF page (may contain OCR errors)
2. **Page Image**: Visual rendering of the page showing actual layout
3. **Previous Context**: End portion of the previous page's generated Markdown (if available)

**Conversion Rules:**
• **Text Structure**: Use the image to understand the actual layout and fix any OCR errors in the raw text
• **Headings**: Use appropriate heading levels (# ## ### etc.) based on visual hierarchy
• **Lists**: Convert to proper Markdown lists (- or 1. 2. 3.) maintaining structure
• **Tables**: Convert to Markdown table format using | pipes |
• **Images/Diagrams**: Describe significant visual elements as: `<details><summary>Figure: Brief title</summary>Detailed description based on what you see in the image</details>`
• **Code/Formulas**: Use ``` code blocks ``` or LaTeX $$ math $$ as appropriate
• **Continuity**: If previous context shows incomplete content (mid-sentence, list, table), seamlessly continue from that point
• **NO REPETITION**: Never repeat content from the previous context - only generate new content for this page

**Raw Text:**
```
{page_text if page_text else "No text extracted from this page."}
```

**Page Image:** (attached)
"""

    # Append either the carried-over Markdown tail or a note that there is no previous context.
    if previous_markdown_context_tail:
        instruction += f"""
**Previous Page Context (DO NOT REPEAT):**
```markdown
... (previous page ended with) ...
{previous_markdown_context_tail}
```

Continue seamlessly from the above context if the current page content flows from it.
"""
    else:
        instruction += "\n**Note:** This is the first page or start of a new section."

    # Closing directive is always appended last, after the optional context section.
    instruction += "\n\n**Output only the Markdown content for the current page. Ensure proper formatting and NO repetition of previous content.**"

    # The message carries two content parts: the text instruction and the page image.
    return HumanMessage(content=[instruction, page_image_b64.data_uri_content_dict])
109
+
110
def _format_prompt_content_parallel(
    self,
    page_text: str,
    page_image_b64: Base64Image,
    previous_page_text: Optional[str] = None,
    previous_page_image_b64: Optional[Base64Image] = None,
    page_number: int = 0,
    total_pages: int = 1,
) -> HumanMessage:
    """
    Formats the content for parallel processing using previous page's raw data.

    Args:
        page_text: Raw text extracted from the current page. When empty, a placeholder sentence
            is put into the prompt instead.
        page_image_b64: Rendered image of the current page; always the first attached image.
        previous_page_text: Raw text of the previous page, or None when there is none.
        previous_page_image_b64: Rendered image of the previous page, or None when there is none.
            The previous-page section is only added when BOTH previous values are not None.
        page_number: Page index; displayed in the prompt as ``page_number + 1``.
        total_pages: Total page count displayed in the prompt.

    Returns:
        A HumanMessage whose content is the instruction text followed by the current page image
        and, when previous-page data was supplied, the previous page image.
    """
    # Base instruction: role, task, description of the inputs, and the current page's raw text.
    instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.

**Task**: Convert the current page to Markdown while maintaining proper continuity with the previous page.

**Current Page Data:**
- **Raw Text**: Extracted text (may have OCR errors - use image to verify)
- **Page Image**: Visual rendering showing actual layout

**Previous Page Data** (for context only):
- **Previous Raw Text**: Text from the previous page
- **Previous Page Image**: Visual of the previous page

**Conversion Instructions:**
1. **Primary Focus**: Convert the CURRENT page content accurately
2. **Continuity Check**:
- Examine if the current page continues content from the previous page (sentences, paragraphs, lists, tables)
- If yes, start your Markdown naturally continuing that content
- If no, start fresh with proper heading/structure
3. **Format Rules**:
- Use image to fix OCR errors and understand layout
- Convert headings to # ## ### based on visual hierarchy
- Convert lists to proper Markdown (- or 1. 2. 3.)
- Convert tables to | pipe | format
- Describe significant images/charts as: `<details><summary>Figure: Title</summary>Description</details>`
- Use ``` for code blocks and $$ for math formulas

**Current Page Raw Text:**
```
{page_text if page_text else "No text extracted from this page."}
```

**Current Page Image:** (see first attached image)
"""

    # Slot 0 holds the instruction text as it is at this point; it is refreshed further down
    # once the optional sections have been appended.
    content: list[str | dict[str, object]] = [instruction, page_image_b64.data_uri_content_dict]

    if previous_page_text is not None and previous_page_image_b64 is not None:
        # An empty-string previous text still enters this branch and gets the placeholder sentence.
        instruction += f"""

**Previous Page Raw Text (for context):**
```
{previous_page_text if previous_page_text else "No text from previous page."}
```

**Previous Page Image:** (see second attached image)
"""
        # The previous page image becomes the second attached image, matching the prompt wording.
        content.append(previous_page_image_b64.data_uri_content_dict)
    else:
        instruction += "\n**Note:** This is the first page - no previous context available."

    instruction += (
        "\n\n**Generate ONLY the Markdown for the current page. Ensure proper continuity and formatting.**"
    )
    # `+=` on a str rebinds the local name, so the list still holds the old text; replace it.
    content[0] = instruction

    return HumanMessage(content=content)
178
+
179
def convert(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    mode: Literal["sequential", "parallel"] = "sequential",
) -> str:
    """
    Synchronously convert a PDF document to Markdown.

    This is a dispatcher: ``mode == "sequential"`` delegates to ``_convert_sequential``;
    every other value delegates to ``_convert_parallel_sync``.

    Args:
        pdf_input: Path to PDF file or pymupdf.Document object
        page_indices: Specific page indices to convert (0-based). If None, converts all pages
        progress_callback: Optional callback function called with (current_page, total_pages)
        mode: "sequential" for strict continuity or "parallel" for independent page processing

    Returns:
        Concatenated Markdown string for all processed pages
    """
    # Only the selected implementation is looked up on `self`.
    run_conversion = self._convert_sequential if mode == "sequential" else self._convert_parallel_sync
    return run_conversion(pdf_input, page_indices, progress_callback)
202
+
203
async def aconvert(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    max_concurrent: int = 5,
) -> str:
    """
    Converts a PDF document to Markdown asynchronously with parallel processing.

    Text and images of all selected pages are extracted up front. Each page is then sent to the
    LLM as its own request, with the raw text and image of the previously selected page as
    context. A semaphore limits how many requests are in flight at once.

    Args:
        pdf_input: Path to PDF file or pymupdf.Document object
        page_indices: Specific page indices to convert (0-based). If None, converts all pages
        progress_callback: Optional callback called with (completed_pages, total_pages) each time a
            page finishes successfully. Pages finish in arbitrary order, so the first argument is a
            running count of finished pages, not the position of the page that just finished.
            Exceptions raised by the callback are logged and ignored.
        max_concurrent: Maximum number of concurrent LLM requests

    Returns:
        Concatenated Markdown string for all processed pages, in selection order. A page whose
        processing raised is represented by an HTML comment describing the error.
    """
    with open_pdf(pdf_input) as doc:
        target_page_indices = list(
            _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
        )
        total_pages_to_process = len(target_page_indices)

        if total_pages_to_process == 0:
            logger.warning("No pages selected for processing.")
            return ""

        logger.info(f"Starting parallel Markdown conversion for {total_pages_to_process} pages...")

        # Pre-process all pages
        page_text_dict = extract_text_from_pdf(doc, target_page_indices)
        page_image_dict = render_pdf_as_image(
            doc,
            page_indices=target_page_indices,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )

        # Process pages in parallel with semaphore for concurrency control
        semaphore = asyncio.Semaphore(max_concurrent)

        # Running count of finished pages. All tasks run on one event loop and there is no
        # `await` between the increment and the callback, so a plain int is sufficient.
        completed_pages = 0

        async def process_page(i: int, page_idx: int) -> tuple[int, str]:
            nonlocal completed_pages
            async with semaphore:
                logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")

                try:
                    # Get previous page data for context
                    prev_page_idx = target_page_indices[i - 1] if i > 0 else None
                    previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
                    previous_page_image_b64 = None
                    if prev_page_idx is not None:
                        previous_page_image_b64 = Base64Image.from_bytes(
                            page_image_dict[prev_page_idx], ext=self.image_format
                        )

                    message = self._format_prompt_content_parallel(
                        page_text=page_text_dict.get(page_idx, ""),
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_page_text=previous_page_text,
                        previous_page_image_b64=previous_page_image_b64,
                        page_number=page_idx,
                        total_pages=len(doc),
                    )

                    response = await self.chatterer.agenerate([message])

                    # Extract markdown: prefer fenced blocks matched by MARKDOWN_PATTERN,
                    # otherwise fall back to the whole response with a bare ``` wrapper removed.
                    markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
                    if markdowns:
                        current_page_markdown = "\n".join(markdowns)
                    else:
                        current_page_markdown = response.strip()
                        if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
                            current_page_markdown = current_page_markdown[3:-3].strip()

                    logger.debug(f"Completed processing page {i + 1}/{total_pages_to_process}")

                    # Report progress as a monotonically increasing completed count. Reporting
                    # `i + 1` would jump around because tasks finish out of order.
                    completed_pages += 1
                    if progress_callback:
                        try:
                            progress_callback(completed_pages, total_pages_to_process)
                        except Exception as cb_err:
                            logger.warning(f"Progress callback failed: {cb_err}")

                    return (i, current_page_markdown)

                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                    return (i, f"<!-- Error processing page {page_idx + 1}: {str(e)} -->")

        # Execute all page processing tasks

        tasks = [process_page(i, page_idx) for i, page_idx in enumerate(target_page_indices)]
        results = await asyncio.gather(*tasks, return_exceptions=True)

        # Sort results by original page order and extract markdown
        markdown_results = [""] * total_pages_to_process
        for result in results:
            # process_page already converts every `Exception` into an error comment, so what can
            # still arrive here is a BaseException such as asyncio.CancelledError.
            if isinstance(result, BaseException):
                logger.error(f"Task failed with exception: {result}")
                continue
            if isinstance(result, tuple) and len(result) == 2:
                page_order, markdown = result
                markdown_results[page_order] = markdown
            else:
                logger.error(f"Unexpected result format: {result}")

        return "\n\n".join(markdown_results).strip()
314
+
315
def _convert_sequential(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> str:
    """Convert pages one after another, feeding each page the tail of the previous page's Markdown.

    A page whose processing raises is logged and left out of the output; the progress callback is
    not invoked for it and the carried-over context stays that of the last successful page.

    Returns:
        The Markdown of all successfully processed pages joined by blank lines, or an empty
        string when no pages are selected.
    """
    with open_pdf(pdf_input) as doc:
        selected = list(
            _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
        )
        page_count = len(selected)
        if not page_count:
            logger.warning("No pages selected for processing.")
            return ""

        # Gather the raw inputs (text + rendered image) for every selected page up front.
        logger.info("Extracting text and rendering images for selected pages...")
        texts = extract_text_from_pdf(doc, selected)
        images = render_pdf_as_image(
            doc,
            page_indices=selected,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )
        logger.info(f"Starting sequential Markdown conversion for {page_count} pages...")

        page_markdowns: List[str] = []
        last_markdown: Optional[str] = None

        for ordinal, page_idx in enumerate(selected, start=1):
            logger.info(f"Processing page {ordinal}/{page_count} (Index: {page_idx})...")
            try:
                carried_context = self._get_context_tail(last_markdown)
                current_image = Base64Image.from_bytes(images[page_idx], ext=self.image_format)
                prompt = self._format_prompt_content_sequential(
                    page_text=texts.get(page_idx, ""),
                    page_image_b64=current_image,
                    previous_markdown_context_tail=carried_context,
                    page_number=page_idx,
                    total_pages=len(doc),
                )

                response = self.chatterer.generate([prompt])

                # Prefer fenced blocks found by MARKDOWN_PATTERN; otherwise use the whole
                # response, removing a bare ``` wrapper when present.
                fenced_blocks = [found.group(1).strip() for found in MARKDOWN_PATTERN.finditer(response)]
                if fenced_blocks:
                    page_markdown = "\n".join(fenced_blocks)
                else:
                    page_markdown = response.strip()
                    if page_markdown.startswith("```") and page_markdown.endswith("```"):
                        page_markdown = page_markdown[3:-3].strip()
            except Exception as e:
                logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
            else:
                page_markdowns.append(page_markdown)
                last_markdown = page_markdown

                # Progress is reported for successful pages only.
                if progress_callback:
                    try:
                        progress_callback(ordinal, page_count)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

        return "\n\n".join(page_markdowns).strip()
385
+
386
def _convert_parallel_sync(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> str:
    """Convert pages one at a time using the parallel-style prompt.

    Unlike the sequential mode, each prompt is built from the raw text and image of the
    previously selected page rather than from previously generated Markdown, so no page depends
    on another page's LLM output. A page whose processing raises is logged and left out of the
    output, and the progress callback is not invoked for it.

    Returns:
        The Markdown of all successfully processed pages joined by blank lines, or an empty
        string when no pages are selected.
    """
    with open_pdf(pdf_input) as doc:
        selected = list(
            _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
        )
        page_count = len(selected)
        if not page_count:
            logger.warning("No pages selected for processing.")
            return ""

        logger.info(f"Starting parallel-style Markdown conversion for {page_count} pages...")

        # Gather the raw inputs (text + rendered image) for every selected page up front.
        texts = extract_text_from_pdf(doc, selected)
        images = render_pdf_as_image(
            doc,
            page_indices=selected,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )

        page_markdowns: List[str] = []

        for position, page_idx in enumerate(selected):
            ordinal = position + 1
            logger.info(f"Processing page {ordinal}/{page_count} (Index: {page_idx})...")

            try:
                # The first selected page has no predecessor to draw context from.
                if position == 0:
                    prior_text = None
                    prior_image = None
                else:
                    prior_idx = selected[position - 1]
                    prior_text = texts.get(prior_idx)
                    prior_image = Base64Image.from_bytes(images[prior_idx], ext=self.image_format)

                prompt = self._format_prompt_content_parallel(
                    page_text=texts.get(page_idx, ""),
                    page_image_b64=Base64Image.from_bytes(images[page_idx], ext=self.image_format),
                    previous_page_text=prior_text,
                    previous_page_image_b64=prior_image,
                    page_number=page_idx,
                    total_pages=len(doc),
                )

                response = self.chatterer.generate([prompt])

                # Prefer fenced blocks found by MARKDOWN_PATTERN; otherwise use the whole
                # response, removing a bare ``` wrapper when present.
                fenced_blocks = [found.group(1).strip() for found in MARKDOWN_PATTERN.finditer(response)]
                if fenced_blocks:
                    page_markdown = "\n".join(fenced_blocks)
                else:
                    page_markdown = response.strip()
                    if page_markdown.startswith("```") and page_markdown.endswith("```"):
                        page_markdown = page_markdown[3:-3].strip()
            except Exception as e:
                logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
            else:
                page_markdowns.append(page_markdown)

                # Progress is reported for successful pages only.
                if progress_callback:
                    try:
                        progress_callback(ordinal, page_count)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

        return "\n\n".join(page_markdowns).strip()
463
+
464
+
465
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied to both axes of the render matrix. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): 0-based pages to render. If None, all pages
            are rendered. If an int is provided, only that page is rendered.

    Returns:
        dict[int, bytes]: Encoded image bytes keyed by 0-based page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    # The same factor on both axes scales the page uniformly.
    scale = Matrix(zoom, zoom)
    selected = _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
    return {
        page_idx: bytes(
            get_pixmap(page=doc[page_idx], matrix=scale).tobytes(  # pyright: ignore[reportUnknownArgumentType]
                output=output, jpg_quality=jpg_quality
            )
        )
        for page_idx in selected
    }
500
+
501
+
502
def extract_text_from_pdf(doc: "Document", page_indices: Optional[PageIndexType] = None) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    The text of each selected page is taken from its text page and stripped of
    leading/trailing whitespace.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): 0-based pages to extract. If None, all pages
            are extracted. If an int is provided, only that page is extracted.

    Returns:
        dict[int, str]: Stripped page text keyed by 0-based page index.
    """
    page_texts: dict[int, str] = {}
    for page_idx in _get_page_indices(
        page_indices=page_indices,
        max_doc_pages=len(doc),
        is_input_zero_based=True,
    ):
        text_page = doc[page_idx].get_textpage()
        page_texts[page_idx] = text_page.extractText().strip()  # pyright: ignore[reportUnknownMemberType]
    return page_texts
523
+
524
+
525
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Open a PDF document from a file path or use an existing Document object.

    Used as a context manager. A ``pymupdf.Document`` passed in by the caller is yielded as-is
    and is NOT closed on exit, because the caller owns it. Any other input is read through
    ``read_bytes_stream``, opened as a new document, and closed on exit even when the body of
    the ``with`` block raises.

    Args:
        pdf_input (PathOrReadable | Document): The PDF file path or a pymupdf.Document object.

    Yields:
        Document: The opened (or passed-through) document.

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` yields no stream for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: hand it back untouched and leave its lifetime to the caller.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        # The bytes are read in full here, so the stream can be released before yielding.
        doc = pymupdf.Document(stream=stream.read())

    try:
        yield doc
    finally:
        # Without try/finally an exception raised inside the `with` body would skip the close
        # and leak the internally opened document.
        doc.close()
550
+
551
+
552
+ def _get_page_indices(
553
+ page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
554
+ ) -> list[int]:
555
+ """Helper function to handle page indices for PDF conversion."""
556
+
557
+ def _to_zero_based_int(idx: int) -> int:
558
+ """Convert a 1-based index to a 0-based index if necessary."""
559
+ if is_input_zero_based:
560
+ return idx
561
+ else:
562
+ if idx < 1 or idx > max_doc_pages:
563
+ raise ValueError(f"Index {idx} is out of bounds for document with {max_doc_pages} pages (1-based).")
564
+ return idx - 1
565
+
566
+ if page_indices is None:
567
+ return list(range(max_doc_pages)) # Convert all pages
568
+ elif isinstance(page_indices, int):
569
+ # Handle single integer input for page index
570
+ return [_to_zero_based_int(page_indices)]
571
+ elif isinstance(page_indices, str):
572
+ # Handle string input for page indices
573
+ return _interpret_index_string(
574
+ index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
575
+ )
576
+ else:
577
+ # Handle iterable input for page indices
578
+ indices: set[int] = set()
579
+ for idx in page_indices:
580
+ if isinstance(idx, int):
581
+ indices.add(_to_zero_based_int(idx))
582
+ else:
583
+ start, end = idx
584
+ if isinstance(start, EllipsisType):
585
+ start = 0
586
+ else:
587
+ start = _to_zero_based_int(start)
588
+
589
+ if isinstance(end, EllipsisType):
590
+ end = max_doc_pages - 1
591
+ else:
592
+ end = _to_zero_based_int(end)
593
+
594
+ if start > end:
595
+ raise ValueError(
596
+ f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
597
+ )
598
+ indices.update(range(start, end + 1))
599
+
600
+ return sorted(indices) # Return sorted list of indices
601
+
602
+
603
+ def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
604
+ """Interpret a string of comma-separated indices and ranges."""
605
+
606
+ def _to_zero_based_int(idx_str: str) -> int:
607
+ i = int(idx_str)
608
+ if is_input_zero_based:
609
+ if i < 0 or i >= max_doc_pages:
610
+ raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages.")
611
+ return i
612
+ else:
613
+ if i < 1 or i > max_doc_pages:
614
+ raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages (1-based).")
615
+ return i - 1 # Convert to zero-based index
616
+
617
+ indices: set[int] = set()
618
+ for part in index_str.split(","):
619
+ part: str = part.strip()
620
+ count_dash: int = part.count("-")
621
+ if count_dash == 0:
622
+ indices.add(_to_zero_based_int(part))
623
+ elif count_dash == 1:
624
+ idx_dash: int = part.index("-")
625
+ start = part[:idx_dash].strip()
626
+ end = part[idx_dash + 1 :].strip()
627
+ if not start:
628
+ start = _to_zero_based_int("0") # Default to 0 if no start index is provided
629
+ else:
630
+ start = _to_zero_based_int(start)
631
+
632
+ if not end:
633
+ end = _to_zero_based_int(str(max_doc_pages - 1)) # Default to last page if no end index is provided
634
+ else:
635
+ end = _to_zero_based_int(end)
636
+
637
+ if start > end:
638
+ raise ValueError(
639
+ f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
640
+ )
641
+ indices.update(range(start, end + 1))
642
+ else:
643
+ raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")
644
+
645
+ return sorted(indices) # Return sorted list of indices, ensuring no duplicates