chatterer 0.1.25__py3-none-any.whl → 0.1.27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45):
  1. chatterer/__init__.py +87 -97
  2. chatterer/common_types/__init__.py +21 -21
  3. chatterer/common_types/io.py +19 -19
  4. chatterer/constants.py +5 -0
  5. chatterer/examples/__main__.py +75 -75
  6. chatterer/examples/any2md.py +83 -85
  7. chatterer/examples/pdf2md.py +231 -338
  8. chatterer/examples/pdf2txt.py +52 -54
  9. chatterer/examples/ppt.py +487 -486
  10. chatterer/examples/pw.py +141 -143
  11. chatterer/examples/snippet.py +54 -56
  12. chatterer/examples/transcribe.py +192 -192
  13. chatterer/examples/upstage.py +87 -89
  14. chatterer/examples/web2md.py +80 -80
  15. chatterer/interactive.py +422 -354
  16. chatterer/language_model.py +530 -536
  17. chatterer/messages.py +21 -21
  18. chatterer/tools/__init__.py +46 -46
  19. chatterer/tools/caption_markdown_images.py +388 -384
  20. chatterer/tools/citation_chunking/__init__.py +3 -3
  21. chatterer/tools/citation_chunking/chunks.py +51 -53
  22. chatterer/tools/citation_chunking/citation_chunker.py +117 -118
  23. chatterer/tools/citation_chunking/citations.py +284 -285
  24. chatterer/tools/citation_chunking/prompt.py +157 -157
  25. chatterer/tools/citation_chunking/reference.py +26 -26
  26. chatterer/tools/citation_chunking/utils.py +138 -138
  27. chatterer/tools/convert_pdf_to_markdown.py +636 -645
  28. chatterer/tools/convert_to_text.py +446 -446
  29. chatterer/tools/upstage_document_parser.py +704 -705
  30. chatterer/tools/webpage_to_markdown.py +739 -739
  31. chatterer/tools/youtube.py +146 -147
  32. chatterer/utils/__init__.py +15 -15
  33. chatterer/utils/base64_image.py +349 -293
  34. chatterer/utils/bytesio.py +59 -59
  35. chatterer/utils/code_agent.py +237 -237
  36. chatterer/utils/imghdr.py +145 -148
  37. {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/METADATA +377 -390
  38. chatterer-0.1.27.dist-info/RECORD +43 -0
  39. chatterer/strategies/__init__.py +0 -13
  40. chatterer/strategies/atom_of_thoughts.py +0 -975
  41. chatterer/strategies/base.py +0 -14
  42. chatterer-0.1.25.dist-info/RECORD +0 -45
  43. {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/WHEEL +0 -0
  44. {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/entry_points.txt +0 -0
  45. {chatterer-0.1.25.dist-info → chatterer-0.1.27.dist-info}/top_level.txt +0 -0
@@ -1,645 +1,636 @@
1
- from __future__ import annotations
2
-
3
- import asyncio
4
- import logging
5
- import re
6
- from contextlib import contextmanager
7
- from dataclasses import dataclass
8
- from types import EllipsisType
9
- from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional
10
-
11
- from ..language_model import Chatterer, HumanMessage
12
- from ..utils.base64_image import Base64Image
13
- from ..utils.bytesio import PathOrReadable, read_bytes_stream
14
-
15
- if TYPE_CHECKING:
16
- from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
17
-
18
- # Setup basic logging
19
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
20
- logger = logging.getLogger(__name__)
21
- MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
22
- PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] | int | str
23
-
24
-
25
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    This class supports both sequential and parallel processing:
    - Sequential processing preserves strict page continuity using previous page context
    - Parallel processing enables faster conversion for large documents by using
      previous page image and text for context instead of generated markdown
    """

    chatterer: "Chatterer"
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context (sequential mode only)."""

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of ``markdown_text``.

        Returns ``None`` when there is no text, when the text is only
        whitespace, or when ``context_tail_lines`` is not positive.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        return "\n".join(lines[-self.context_tail_lines :])

    @staticmethod
    def _extract_markdown(response: str) -> str:
        """Pull the Markdown payload out of a raw model response.

        If the response contains triple-backtick fences, the stripped bodies of
        all fences are joined with newlines. Otherwise the stripped response is
        returned, with a bare leading/trailing fence removed if present.
        """
        fenced = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
        if fenced:
            return "\n".join(fenced)
        text = response.strip()
        if text.startswith("```") and text.endswith("```"):
            text = text[3:-3].strip()
        return text

    @staticmethod
    def _report_progress(
        progress_callback: Optional[Callable[[int, int], None]], current: int, total: int
    ) -> None:
        """Invoke ``progress_callback(current, total)``; a failing callback is logged, never raised."""
        if not progress_callback:
            return
        try:
            progress_callback(current, total)
        except Exception as cb_err:
            logger.warning(f"Progress callback failed: {cb_err}")

    def _render_pages(self, doc: "Document", target_page_indices: list[int]) -> tuple[dict[int, str], dict[int, bytes]]:
        """Extract text and render an image for every selected page of ``doc``."""
        page_text_dict = extract_text_from_pdf(doc, target_page_indices)
        page_image_dict = render_pdf_as_image(
            doc,
            page_indices=target_page_indices,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )
        return page_text_dict, page_image_dict

    def _build_parallel_message(
        self,
        position: int,
        target_page_indices: list[int],
        page_text_dict: dict[int, str],
        page_image_dict: dict[int, bytes],
        total_doc_pages: int,
    ) -> "HumanMessage":
        """Build the parallel-mode prompt for the page at ``position`` in the selection.

        The previous page is the previous *selected* page (not necessarily the
        physically preceding one); the first selected page has no context.
        """
        page_idx = target_page_indices[position]
        previous_page_text: Optional[str] = None
        previous_page_image_b64 = None
        if position > 0:
            prev_page_idx = target_page_indices[position - 1]
            previous_page_text = page_text_dict.get(prev_page_idx)
            previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)

        return self._format_prompt_content_parallel(
            page_text=page_text_dict.get(page_idx, ""),
            page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
            previous_page_text=previous_page_text,
            previous_page_image_b64=previous_page_image_b64,
            page_number=page_idx,
            total_pages=total_doc_pages,
        )

    def _format_prompt_content_sequential(
        self,
        page_text: str,
        page_image_b64: "Base64Image",
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> "HumanMessage":
        """
        Formats the content for sequential processing using previous page's markdown context.

        Args:
            page_text: Raw text extracted from the page (may be empty).
            page_image_b64: Rendered image of the page.
            previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
            page_number: Zero-based page index; shown to the model as ``page_number + 1``.
            total_pages: Total page count shown to the model.

        Returns:
            A HumanMessage whose content is ``[instruction, image content dict]``.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.

**Input provided:**
1. **Raw Text**: Extracted text from the PDF page (may contain OCR errors)
2. **Page Image**: Visual rendering of the page showing actual layout
3. **Previous Context**: End portion of the previous page's generated Markdown (if available)

**Conversion Rules:**
• **Text Structure**: Use the image to understand the actual layout and fix any OCR errors in the raw text
• **Headings**: Use appropriate heading levels (# ## ### etc.) based on visual hierarchy
• **Lists**: Convert to proper Markdown lists (- or 1. 2. 3.) maintaining structure
• **Tables**: Convert to Markdown table format using | pipes |
• **Images/Diagrams**: Describe significant visual elements as: `<details><summary>Figure: Brief title</summary>Detailed description based on what you see in the image</details>`
• **Code/Formulas**: Use ``` code blocks ``` or LaTeX $$ math $$ as appropriate
• **Continuity**: If previous context shows incomplete content (mid-sentence, list, table), seamlessly continue from that point
• **NO REPETITION**: Never repeat content from the previous context - only generate new content for this page

**Raw Text:**
```
{page_text if page_text else "No text extracted from this page."}
```

**Page Image:** (attached)
"""

        if previous_markdown_context_tail:
            instruction += f"""
**Previous Page Context (DO NOT REPEAT):**
```markdown
... (previous page ended with) ...
{previous_markdown_context_tail}
```

Continue seamlessly from the above context if the current page content flows from it.
"""
        else:
            instruction += "\n**Note:** This is the first page or start of a new section."

        instruction += "\n\n**Output only the Markdown content for the current page. Ensure proper formatting and NO repetition of previous content.**"

        return HumanMessage(content=[instruction, page_image_b64.data_uri_content_dict])

    def _format_prompt_content_parallel(
        self,
        page_text: str,
        page_image_b64: "Base64Image",
        previous_page_text: Optional[str] = None,
        previous_page_image_b64: Optional["Base64Image"] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> "HumanMessage":
        """
        Formats the content for parallel processing using previous page's raw data.

        Previous-page context is included only when BOTH the previous text and
        the previous image are provided. The returned content list is ordered
        ``[instruction, current image, previous image (if any)]``.
        """
        instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.

**Task**: Convert the current page to Markdown while maintaining proper continuity with the previous page.

**Current Page Data:**
- **Raw Text**: Extracted text (may have OCR errors - use image to verify)
- **Page Image**: Visual rendering showing actual layout

**Previous Page Data** (for context only):
- **Previous Raw Text**: Text from the previous page
- **Previous Page Image**: Visual of the previous page

**Conversion Instructions:**
1. **Primary Focus**: Convert the CURRENT page content accurately
2. **Continuity Check**:
- Examine if the current page continues content from the previous page (sentences, paragraphs, lists, tables)
- If yes, start your Markdown naturally continuing that content
- If no, start fresh with proper heading/structure
3. **Format Rules**:
- Use image to fix OCR errors and understand layout
- Convert headings to # ## ### based on visual hierarchy
- Convert lists to proper Markdown (- or 1. 2. 3.)
- Convert tables to | pipe | format
- Describe significant images/charts as: `<details><summary>Figure: Title</summary>Description</details>`
- Use ``` for code blocks and $$ for math formulas

**Current Page Raw Text:**
```
{page_text if page_text else "No text extracted from this page."}
```

**Current Page Image:** (see first attached image)
"""

        images: list[dict[str, object]] = [page_image_b64.data_uri_content_dict]

        if previous_page_text is not None and previous_page_image_b64 is not None:
            instruction += f"""

**Previous Page Raw Text (for context):**
```
{previous_page_text if previous_page_text else "No text from previous page."}
```

**Previous Page Image:** (see second attached image)
"""
            images.append(previous_page_image_b64.data_uri_content_dict)
        else:
            instruction += "\n**Note:** This is the first page - no previous context available."

        instruction += (
            "\n\n**Generate ONLY the Markdown for the current page. Ensure proper continuity and formatting.**"
        )

        content: list[str | dict[str, object]] = [instruction, *images]
        return HumanMessage(content=content)

    def convert(
        self,
        pdf_input: "Document | PathOrReadable",
        page_indices: Optional["PageIndexType"] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        mode: Literal["sequential", "parallel"] = "sequential",
    ) -> str:
        """
        Converts a PDF document to Markdown synchronously.

        Args:
            pdf_input: Path to PDF file or pymupdf.Document object
            page_indices: Specific page indices to convert (0-based). If None, converts all pages
            progress_callback: Optional callback function called with (current_page, total_pages)
            mode: "sequential" for strict continuity or "parallel" for independent page processing
                (any value other than "sequential" selects the parallel-style path)

        Returns:
            Concatenated Markdown string for all processed pages
        """
        if mode == "sequential":
            return self._convert_sequential(pdf_input, page_indices, progress_callback)
        return self._convert_parallel_sync(pdf_input, page_indices, progress_callback)

    async def aconvert(
        self,
        pdf_input: "Document | PathOrReadable",
        page_indices: Optional["PageIndexType"] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
        max_concurrent: int = 5,
    ) -> str:
        """
        Converts a PDF document to Markdown asynchronously with parallel processing.

        Args:
            pdf_input: Path to PDF file or pymupdf.Document object
            page_indices: Specific page indices to convert (0-based). If None, converts all pages
            progress_callback: Optional callback function called with (current_page, total_pages);
                invoked once per attempted page, including pages that failed
            max_concurrent: Maximum number of concurrent LLM requests (must be >= 1)

        Returns:
            Concatenated Markdown string for all processed pages. A page that fails is
            represented by an HTML comment describing the error.

        Raises:
            ValueError: If ``max_concurrent`` is less than 1.
        """
        # Semaphore(0) would make every page task wait forever; fail fast instead.
        if max_concurrent < 1:
            raise ValueError(f"max_concurrent must be >= 1, got {max_concurrent}")

        with open_pdf(pdf_input) as doc:
            target_page_indices = list(
                _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
            )
            total_pages_to_process = len(target_page_indices)

            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            logger.info(f"Starting parallel Markdown conversion for {total_pages_to_process} pages...")

            # Pre-process all pages
            page_text_dict, page_image_dict = self._render_pages(doc, target_page_indices)
            total_doc_pages = len(doc)

            # Process pages in parallel with semaphore for concurrency control
            semaphore = asyncio.Semaphore(max_concurrent)

            async def process_page(i: int, page_idx: int) -> tuple[int, str]:
                async with semaphore:
                    logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
                    try:
                        message = self._build_parallel_message(
                            i, target_page_indices, page_text_dict, page_image_dict, total_doc_pages
                        )
                        response = await self.chatterer.agenerate([message])
                        current_page_markdown = self._extract_markdown(response)
                        logger.debug(f"Completed processing page {i + 1}/{total_pages_to_process}")
                    except Exception as e:
                        logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                        current_page_markdown = f"<!-- Error processing page {page_idx + 1}: {e} -->"

                    # Report progress for failed pages too, so callers always see every page.
                    self._report_progress(progress_callback, i + 1, total_pages_to_process)
                    return (i, current_page_markdown)

            # Execute all page processing tasks
            tasks = [process_page(i, page_idx) for i, page_idx in enumerate(target_page_indices)]
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Place each result at its original position in the selection
            markdown_results = [""] * total_pages_to_process
            for result in results:
                if isinstance(result, Exception):
                    logger.error(f"Task failed with exception: {result}")
                    continue
                if isinstance(result, tuple) and len(result) == 2:
                    page_order, markdown = result
                    markdown_results[page_order] = markdown
                else:
                    logger.error(f"Unexpected result format: {result}")

            return "\n\n".join(markdown_results).strip()

    def _convert_sequential(
        self,
        pdf_input: "Document | PathOrReadable",
        page_indices: Optional["PageIndexType"] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """Sequential conversion maintaining strict page continuity.

        Each page's prompt carries the tail of the most recent successfully
        converted page. Failed pages are logged and omitted from the output.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(
                _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
            )
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            previous_page_markdown: Optional[str] = None

            # Pre-process all pages
            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict, page_image_dict = self._render_pages(doc, target_page_indices)
            total_doc_pages = len(doc)
            logger.info(f"Starting sequential Markdown conversion for {total_pages_to_process} pages...")

            for i, page_idx in enumerate(target_page_indices):
                logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    message = self._format_prompt_content_sequential(
                        page_text=page_text_dict.get(page_idx, ""),
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=self._get_context_tail(previous_page_markdown),
                        page_number=page_idx,
                        total_pages=total_doc_pages,
                    )
                    response = self.chatterer.generate([message])
                    current_page_markdown = self._extract_markdown(response)
                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                else:
                    full_markdown_output.append(current_page_markdown)
                    previous_page_markdown = current_page_markdown

                # Runs for failed pages as well, so progress always reaches the total.
                self._report_progress(progress_callback, i + 1, total_pages_to_process)

            return "\n\n".join(full_markdown_output).strip()

    def _convert_parallel_sync(
        self,
        pdf_input: "Document | PathOrReadable",
        page_indices: Optional["PageIndexType"] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """Synchronous parallel-style conversion (processes independently but sequentially).

        Uses the same prompts as ``aconvert`` but issues the requests one at a
        time. Failed pages are logged and omitted from the output.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(
                _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
            )
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            logger.info(f"Starting parallel-style Markdown conversion for {total_pages_to_process} pages...")

            # Pre-process all pages
            page_text_dict, page_image_dict = self._render_pages(doc, target_page_indices)
            total_doc_pages = len(doc)

            full_markdown_output: List[str] = []

            for i, page_idx in enumerate(target_page_indices):
                logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    message = self._build_parallel_message(
                        i, target_page_indices, page_text_dict, page_image_dict, total_doc_pages
                    )
                    response = self.chatterer.generate([message])
                    full_markdown_output.append(self._extract_markdown(response))
                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)

                # Runs for failed pages as well, so progress always reaches the total.
                self._report_progress(progress_callback, i + 1, total_pages_to_process)

            return "\n\n".join(full_markdown_output).strip()
463
-
464
-
465
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Scale factor applied on both axes. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): Quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): Zero-based pages to render.
            None renders every page; a single int renders just that page.

    Returns:
        dict[int, bytes]: Zero-based page index -> encoded image bytes.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    # The same factor on both axes scales the page uniformly.
    scale = Matrix(zoom, zoom)
    selected_pages = _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)

    rendered: dict[int, bytes] = {}
    for page_no in selected_pages:
        pixmap = get_pixmap(page=doc[page_no], matrix=scale)
        encoded = pixmap.tobytes(output=output, jpg_quality=jpg_quality)  # pyright: ignore[reportUnknownArgumentType]
        rendered[page_no] = bytes(encoded)
    return rendered
500
-
501
-
502
def extract_text_from_pdf(doc: "Document", page_indices: Optional["PageIndexType"] = None) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices: Zero-based page selection. None selects every page.

    Returns:
        dict[int, str]: Zero-based page index -> whitespace-stripped page text.
    """
    selected_pages = _get_page_indices(
        page_indices=page_indices,
        max_doc_pages=len(doc),
        is_input_zero_based=True,
    )

    texts: dict[int, str] = {}
    for page_no in selected_pages:
        raw_text = doc[page_no].get_textpage().extractText()  # pyright: ignore[reportUnknownMemberType]
        texts[page_no] = raw_text.strip()
    return texts
523
-
524
-
525
@contextmanager
def open_pdf(pdf_input: "PathOrReadable | Document"):
    """Yield a pymupdf Document for a file path/readable or an existing Document.

    A Document passed in by the caller is yielded as-is and is NOT closed on
    exit. A Document opened here from a path or stream is always closed on
    exit, including when the ``with`` body raises.

    Args:
        pdf_input (PathOrReadable | Document): The PDF source or an already-open pymupdf.Document.

    Yields:
        Document: The document to work with inside the ``with`` block.

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` yields no stream for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: hand it through and leave its lifetime to the caller.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    try:
        yield doc
    finally:
        # Without the finally, an exception in the with-body would skip the close.
        doc.close()
550
-
551
-
552
- def _get_page_indices(
553
- page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
554
- ) -> list[int]:
555
- """Helper function to handle page indices for PDF conversion."""
556
-
557
- def _to_zero_based_int(idx: int) -> int:
558
- """Convert a 1-based index to a 0-based index if necessary."""
559
- if is_input_zero_based:
560
- return idx
561
- else:
562
- if idx < 1 or idx > max_doc_pages:
563
- raise ValueError(f"Index {idx} is out of bounds for document with {max_doc_pages} pages (1-based).")
564
- return idx - 1
565
-
566
- if page_indices is None:
567
- return list(range(max_doc_pages)) # Convert all pages
568
- elif isinstance(page_indices, int):
569
- # Handle single integer input for page index
570
- return [_to_zero_based_int(page_indices)]
571
- elif isinstance(page_indices, str):
572
- # Handle string input for page indices
573
- return _interpret_index_string(
574
- index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
575
- )
576
- else:
577
- # Handle iterable input for page indices
578
- indices: set[int] = set()
579
- for idx in page_indices:
580
- if isinstance(idx, int):
581
- indices.add(_to_zero_based_int(idx))
582
- else:
583
- start, end = idx
584
- if isinstance(start, EllipsisType):
585
- start = 0
586
- else:
587
- start = _to_zero_based_int(start)
588
-
589
- if isinstance(end, EllipsisType):
590
- end = max_doc_pages - 1
591
- else:
592
- end = _to_zero_based_int(end)
593
-
594
- if start > end:
595
- raise ValueError(
596
- f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
597
- )
598
- indices.update(range(start, end + 1))
599
-
600
- return sorted(indices) # Return sorted list of indices
601
-
602
-
603
def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
    """Interpret a string of comma-separated indices and ranges.

    Each comma-separated part is either a single number (``"3"``) or an
    inclusive range (``"1-3"``). A range may omit its start (``"-3"``, from
    the first page) or its end (``"2-"``, through the last page).

    Args:
        index_str: The selection string.
        max_doc_pages: Number of pages in the document.
        is_input_zero_based: Whether the numbers in the string are zero-based
            (True) or one-based (False).

    Returns:
        Sorted, de-duplicated zero-based page indices.

    Raises:
        ValueError: If a part is not a valid number or range, an index is out
            of bounds, or a range has start > end.
    """

    def _to_zero_based_int(idx_str: str) -> int:
        """Parse ``idx_str``, bounds-check it, and return it zero-based."""
        i = int(idx_str)
        if is_input_zero_based:
            if i < 0 or i >= max_doc_pages:
                raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages.")
            return i
        if i < 1 or i > max_doc_pages:
            raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages (1-based).")
        return i - 1  # Convert to zero-based index

    indices: set[int] = set()
    for raw_part in index_str.split(","):
        part = raw_part.strip()
        dash_count = part.count("-")

        if dash_count == 0:
            indices.add(_to_zero_based_int(part))
            continue
        if dash_count != 1:
            raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")

        start_text, _, end_text = part.partition("-")
        start_text = start_text.strip()
        end_text = end_text.strip()

        # Open bounds are set directly as zero-based positions. Routing them through the
        # converter broke one-based input: "0" was rejected as out of bounds, and
        # "max_doc_pages - 1" was shifted down again, dropping the last page.
        start = _to_zero_based_int(start_text) if start_text else 0
        end = _to_zero_based_int(end_text) if end_text else max_doc_pages - 1

        if start > end:
            raise ValueError(
                f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
            )
        indices.update(range(start, end + 1))

    return sorted(indices)  # Sorted list of indices, no duplicates
1
+ import asyncio
2
+ import re
3
+ from contextlib import contextmanager
4
+ from dataclasses import dataclass
5
+ from types import EllipsisType
6
+ from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional
7
+
8
+ from loguru import logger
9
+
10
+ from ..language_model import Chatterer, HumanMessage
11
+ from ..utils.base64_image import Base64Image
12
+ from ..utils.bytesio import PathOrReadable, read_bytes_stream
13
+
14
+ if TYPE_CHECKING:
15
+ from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
16
+
17
+
18
+ MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
19
+ PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] | int | str
20
+
21
+
22
+ @dataclass
23
+ class PdfToMarkdown:
24
+ """
25
+ Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
26
+
27
+ This class supports both sequential and parallel processing:
28
+ - Sequential processing preserves strict page continuity using previous page context
29
+ - Parallel processing enables faster conversion for large documents by using
30
+ previous page image and text for context instead of generated markdown
31
+ """
32
+
33
+ chatterer: Chatterer
34
+ """An instance of the Chatterer class configured with a vision-capable model."""
35
+ image_zoom: float = 2.0
36
+ """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
37
+ image_format: Literal["jpg", "jpeg", "png"] = "png"
38
+ """The format for the rendered image ('png', 'jpeg', 'jpg'.)."""
39
+ image_jpg_quality: int = 95
40
+ """Quality for JPEG images (if used)."""
41
+ context_tail_lines: int = 10
42
+ """Number of lines from the end of the previous page's Markdown to use as context (sequential mode only)."""
43
+
44
def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
    """Return the trailing ``context_tail_lines`` lines of ``markdown_text``, or None.

    None is returned when the text is missing/empty, consists only of
    whitespace, or when ``context_tail_lines`` is not positive.
    """
    keep = self.context_tail_lines
    if keep <= 0 or not markdown_text:
        return None

    all_lines = markdown_text.strip().splitlines()
    if len(all_lines) == 0:
        return None

    return "\n".join(all_lines[-keep:])
53
+
54
def _format_prompt_content_sequential(
    self,
    page_text: str,
    page_image_b64: "Base64Image",
    previous_markdown_context_tail: Optional[str] = None,
    page_number: int = 0,
    total_pages: int = 1,
) -> "HumanMessage":
    """
    Formats the content for sequential processing using previous page's markdown context.

    Args:
        page_text: Raw text extracted from the page (may be empty).
        page_image_b64: Rendered image of the page.
        previous_markdown_context_tail: Tail of the previous page's Markdown, if any.
        page_number: Zero-based page index; shown to the model as ``page_number + 1``.
        total_pages: Total page count shown to the model.

    Returns:
        A HumanMessage whose content is ``[instruction, image content dict]``.
    """
    # Every conversion rule carries the same "•" bullet so the model sees one uniform list.
    instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.

**Input provided:**
1. **Raw Text**: Extracted text from the PDF page (may contain OCR errors)
2. **Page Image**: Visual rendering of the page showing actual layout
3. **Previous Context**: End portion of the previous page's generated Markdown (if available)

**Conversion Rules:**
• **Text Structure**: Use the image to understand the actual layout and fix any OCR errors in the raw text
• **Headings**: Use appropriate heading levels (# ## ### etc.) based on visual hierarchy
• **Lists**: Convert to proper Markdown lists (- or 1. 2. 3.) maintaining structure
• **Tables**: Convert to Markdown table format using | pipes |
• **Images/Diagrams**: Describe significant visual elements as: `<details><summary>Figure: Brief title</summary>Detailed description based on what you see in the image</details>`
• **Code/Formulas**: Use ``` code blocks ``` or LaTeX $$ math $$ as appropriate
• **Continuity**: If previous context shows incomplete content (mid-sentence, list, table), seamlessly continue from that point
• **NO REPETITION**: Never repeat content from the previous context - only generate new content for this page

**Raw Text:**
```
{page_text if page_text else "No text extracted from this page."}
```

**Page Image:** (attached)
"""

    if previous_markdown_context_tail:
        instruction += f"""
**Previous Page Context (DO NOT REPEAT):**
```markdown
... (previous page ended with) ...
{previous_markdown_context_tail}
```

Continue seamlessly from the above context if the current page content flows from it.
"""
    else:
        instruction += "\n**Note:** This is the first page or start of a new section."

    instruction += "\n\n**Output only the Markdown content for the current page. Ensure proper formatting and NO repetition of previous content.**"

    return HumanMessage(content=[instruction, page_image_b64.data_uri_content_dict])
106
+
107
def _format_prompt_content_parallel(
    self,
    page_text: str,
    page_image_b64: Base64Image,
    previous_page_text: Optional[str] = None,
    previous_page_image_b64: Optional[Base64Image] = None,
    page_number: int = 0,
    total_pages: int = 1,
) -> HumanMessage:
    """
    Builds the prompt message for one page in parallel mode.

    Context comes from the previous page's raw text and image rather than from
    previously generated Markdown. The previous-page section and its image are only
    included when both ``previous_page_text`` and ``previous_page_image_b64`` are given.

    Args:
        page_text: Raw text extracted from the current page.
        page_image_b64: Rendered image of the current page (first attachment).
        previous_page_text: Raw text of the previous page, if any.
        previous_page_image_b64: Rendered image of the previous page (second attachment), if any.
        page_number: 0-based page index; shown to the model as ``page_number + 1``.
        total_pages: Page count shown to the model.

    Returns:
        A HumanMessage whose content is the instruction followed by one or two image content dicts.
    """
    current_text = page_text or "No text extracted from this page."

    header = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.

**Task**: Convert the current page to Markdown while maintaining proper continuity with the previous page.

**Current Page Data:**
- **Raw Text**: Extracted text (may have OCR errors - use image to verify)
- **Page Image**: Visual rendering showing actual layout

**Previous Page Data** (for context only):
- **Previous Raw Text**: Text from the previous page
- **Previous Page Image**: Visual of the previous page

**Conversion Instructions:**
1. **Primary Focus**: Convert the CURRENT page content accurately
2. **Continuity Check**:
- Examine if the current page continues content from the previous page (sentences, paragraphs, lists, tables)
- If yes, start your Markdown naturally continuing that content
- If no, start fresh with proper heading/structure
3. **Format Rules**:
- Use image to fix OCR errors and understand layout
- Convert headings to # ## ### based on visual hierarchy
- Convert lists to proper Markdown (- or 1. 2. 3.)
- Convert tables to | pipe | format
- Describe significant images/charts as: `<details><summary>Figure: Title</summary>Description</details>`
- Use ``` for code blocks and $$ for math formulas

**Current Page Raw Text:**
```
{current_text}
```

**Current Page Image:** (see first attached image)
"""

    # The current page image is always the first attachment.
    attachments: list[str | dict[str, object]] = [page_image_b64.data_uri_content_dict]

    if previous_page_text is not None and previous_page_image_b64 is not None:
        previous_text = previous_page_text or "No text from previous page."
        context_section = f"""

**Previous Page Raw Text (for context):**
```
{previous_text}
```

**Previous Page Image:** (see second attached image)
"""
        attachments.append(previous_page_image_b64.data_uri_content_dict)
    else:
        context_section = "\n**Note:** This is the first page - no previous context available."

    closing = "\n\n**Generate ONLY the Markdown for the current page. Ensure proper continuity and formatting.**"

    content: list[str | dict[str, object]] = [header + context_section + closing, *attachments]
    return HumanMessage(content=content)
175
+
176
def convert(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    mode: Literal["sequential", "parallel"] = "sequential",
) -> str:
    """
    Converts a PDF document to Markdown synchronously.

    Dispatches to ``_convert_sequential`` when ``mode`` is "sequential" and to
    ``_convert_parallel_sync`` for any other value.

    Args:
        pdf_input: Path to PDF file or pymupdf.Document object
        page_indices: Specific page indices to convert (0-based). If None, converts all pages
        progress_callback: Optional callback function called with (current_page, total_pages)
        mode: "sequential" for strict continuity or "parallel" for independent page processing

    Returns:
        Concatenated Markdown string for all processed pages
    """
    run_conversion = self._convert_sequential if mode == "sequential" else self._convert_parallel_sync
    return run_conversion(pdf_input, page_indices, progress_callback)
199
+
200
async def aconvert(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
    max_concurrent: int = 5,
) -> str:
    """
    Converts a PDF document to Markdown asynchronously with parallel processing.

    Each selected page becomes one model request that also carries the raw text and
    image of the page selected just before it. Requests are gated by a semaphore of
    size ``max_concurrent`` and the per-page results are reassembled in selection order.

    Args:
        pdf_input: Path to PDF file or pymupdf.Document object
        page_indices: Specific page indices to convert (0-based). If None, converts all pages
        progress_callback: Optional callback called with (position of the page within the
            selection starting at 1, number of selected pages). It is invoked once per page,
            including pages whose conversion failed; exceptions it raises are logged and ignored.
        max_concurrent: Maximum number of concurrent LLM requests (must be at least 1)

    Returns:
        Concatenated Markdown string for all processed pages. A page whose conversion
        raises is represented by an HTML comment describing the error.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
    """
    # A semaphore created with 0 never admits a task, so the call would hang forever.
    if max_concurrent < 1:
        raise ValueError(f"max_concurrent must be at least 1, got {max_concurrent}.")

    with open_pdf(pdf_input) as doc:
        target_page_indices: list[int] = list(
            _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
        )
        total_pages_to_process: int = len(target_page_indices)

        if not total_pages_to_process:
            logger.warning("No pages selected for processing.")
            return ""

        # Pre-process all pages
        page_text_dict: dict[int, str] = extract_text_from_pdf(doc, target_page_indices)
        page_image_dict: dict[int, bytes] = render_pdf_as_image(
            doc,
            page_indices=target_page_indices,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )

        semaphore = asyncio.Semaphore(max_concurrent)

        async def process_page(i: int, page_idx: int) -> tuple[int, str]:
            """Convert one page and return (position in the selection, markdown)."""
            async with semaphore:
                try:
                    # Get previous page data for context
                    prev_page_idx: int | None = target_page_indices[i - 1] if i > 0 else None
                    message: HumanMessage = self._format_prompt_content_parallel(
                        page_text=page_text_dict.get(page_idx, ""),
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_page_text=(
                            page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
                        ),
                        previous_page_image_b64=(
                            Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
                            if prev_page_idx is not None
                            else None
                        ),
                        page_number=page_idx,
                        total_pages=len(doc),
                    )
                    response: str = await self.chatterer.agenerate([message])

                    # Prefer fenced blocks matched by MARKDOWN_PATTERN; otherwise use the
                    # raw response, unwrapping a single surrounding ``` fence if present.
                    markdowns: list[str] = [
                        str(match.group(1).strip()) for match in MARKDOWN_PATTERN.finditer(response)
                    ]
                    if markdowns:
                        current_page_markdown = "\n".join(markdowns)
                    else:
                        current_page_markdown = response.strip()
                        if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
                            current_page_markdown = current_page_markdown[3:-3].strip()
                except Exception as e:
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                    current_page_markdown = f"<!-- Error processing page {page_idx + 1}: {str(e)} -->"

                # Report progress for failed pages too, so callers always see one call per page.
                if progress_callback:
                    try:
                        progress_callback(i + 1, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

                return (i, current_page_markdown)

        # Execute all page processing tasks
        results: list[tuple[int, str] | BaseException] = await asyncio.gather(
            *(process_page(i, page_idx) for i, page_idx in enumerate(target_page_indices)), return_exceptions=True
        )

        # Place each result at its position in the selection
        markdown_results = [""] * total_pages_to_process
        for result in results:
            # process_page handles Exception itself, so anything returned here is a
            # BaseException such as asyncio.CancelledError; that page stays empty.
            if isinstance(result, BaseException):
                logger.error(f"Task failed with exception: {result}")
                continue
            page_order, markdown = result
            markdown_results[page_order] = markdown

        return "\n\n".join(markdown_results).strip()
305
+
306
def _convert_sequential(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> str:
    """Sequential conversion maintaining strict page continuity.

    Pages are converted one at a time; each prompt carries the tail of the Markdown
    generated for the most recent successfully converted page. A page whose conversion
    raises is logged and left out of the output.

    Args:
        pdf_input: Path to PDF file or pymupdf.Document object
        page_indices: Specific page indices to convert (0-based). If None, converts all pages
        progress_callback: Optional callback called with (position of the page within the
            selection starting at 1, number of selected pages). It is invoked once per page,
            including pages whose conversion failed; exceptions it raises are logged and ignored.

    Returns:
        Markdown of the converted pages joined by blank lines.
    """
    with open_pdf(pdf_input) as doc:
        target_page_indices = list(
            _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
        )
        total_pages_to_process = len(target_page_indices)
        if total_pages_to_process == 0:
            logger.warning("No pages selected for processing.")
            return ""

        full_markdown_output: List[str] = []
        previous_page_markdown: Optional[str] = None

        # Pre-process all pages
        logger.info("Extracting text and rendering images for selected pages...")
        page_text_dict = extract_text_from_pdf(doc, target_page_indices)
        page_image_dict = render_pdf_as_image(
            doc,
            page_indices=target_page_indices,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )
        logger.info(f"Starting sequential Markdown conversion for {total_pages_to_process} pages...")

        for i, page_idx in enumerate(target_page_indices):
            logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
            try:
                context_tail = self._get_context_tail(previous_page_markdown)

                message = self._format_prompt_content_sequential(
                    page_text=page_text_dict.get(page_idx, ""),
                    page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                    previous_markdown_context_tail=context_tail,
                    page_number=page_idx,
                    total_pages=len(doc),
                )

                response = self.chatterer.generate([message])

                # Prefer fenced blocks matched by MARKDOWN_PATTERN; otherwise use the
                # raw response, unwrapping a single surrounding ``` fence if present.
                markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
                if markdowns:
                    current_page_markdown = "\n".join(markdowns)
                else:
                    current_page_markdown = response.strip()
                    if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
                        current_page_markdown = current_page_markdown[3:-3].strip()

                full_markdown_output.append(current_page_markdown)
                previous_page_markdown = current_page_markdown

            except Exception as e:
                # Best effort: skip the failed page but still fall through to the
                # progress callback so reported progress can reach the total.
                logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)

            # Progress callback
            if progress_callback:
                try:
                    progress_callback(i + 1, total_pages_to_process)
                except Exception as cb_err:
                    logger.warning(f"Progress callback failed: {cb_err}")

        return "\n\n".join(full_markdown_output).strip()
376
+
377
def _convert_parallel_sync(
    self,
    pdf_input: "Document | PathOrReadable",
    page_indices: Optional[PageIndexType] = None,
    progress_callback: Optional[Callable[[int, int], None]] = None,
) -> str:
    """Synchronous parallel-style conversion (processes independently but sequentially).

    Each page prompt uses the raw text and image of the page selected just before it as
    context instead of previously generated Markdown. A page whose conversion raises is
    logged and left out of the output.

    Args:
        pdf_input: Path to PDF file or pymupdf.Document object
        page_indices: Specific page indices to convert (0-based). If None, converts all pages
        progress_callback: Optional callback called with (position of the page within the
            selection starting at 1, number of selected pages). It is invoked once per page,
            including pages whose conversion failed; exceptions it raises are logged and ignored.

    Returns:
        Markdown of the converted pages joined by blank lines.
    """
    with open_pdf(pdf_input) as doc:
        target_page_indices = list(
            _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
        )
        total_pages_to_process = len(target_page_indices)
        if total_pages_to_process == 0:
            logger.warning("No pages selected for processing.")
            return ""

        logger.info(f"Starting parallel-style Markdown conversion for {total_pages_to_process} pages...")

        # Pre-process all pages
        page_text_dict = extract_text_from_pdf(doc, target_page_indices)
        page_image_dict = render_pdf_as_image(
            doc,
            page_indices=target_page_indices,
            zoom=self.image_zoom,
            output=self.image_format,
            jpg_quality=self.image_jpg_quality,
        )

        full_markdown_output: List[str] = []

        for i, page_idx in enumerate(target_page_indices):
            logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")

            try:
                # Get previous page data for context
                prev_page_idx = target_page_indices[i - 1] if i > 0 else None
                previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
                previous_page_image_b64 = None
                if prev_page_idx is not None:
                    previous_page_image_b64 = Base64Image.from_bytes(
                        page_image_dict[prev_page_idx], ext=self.image_format
                    )

                message = self._format_prompt_content_parallel(
                    page_text=page_text_dict.get(page_idx, ""),
                    page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                    previous_page_text=previous_page_text,
                    previous_page_image_b64=previous_page_image_b64,
                    page_number=page_idx,
                    total_pages=len(doc),
                )

                response = self.chatterer.generate([message])

                # Prefer fenced blocks matched by MARKDOWN_PATTERN; otherwise use the
                # raw response, unwrapping a single surrounding ``` fence if present.
                markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
                if markdowns:
                    current_page_markdown = "\n".join(markdowns)
                else:
                    current_page_markdown = response.strip()
                    if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
                        current_page_markdown = current_page_markdown[3:-3].strip()

                full_markdown_output.append(current_page_markdown)

            except Exception as e:
                # Best effort: skip the failed page but still fall through to the
                # progress callback so reported progress can reach the total.
                logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)

            # Progress callback
            if progress_callback:
                try:
                    progress_callback(i + 1, total_pages_to_process)
                except Exception as cb_err:
                    logger.warning(f"Progress callback failed: {cb_err}")

        return "\n\n".join(full_markdown_output).strip()
454
+
455
+
456
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document to render.
        zoom (float): Value used for both arguments of the pymupdf ``Matrix`` that
            controls output resolution. Default is 2.0.
        output (str): Image format passed to the pixmap's ``tobytes``. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap's ``tobytes``. Default is 100.
        page_indices (Iterable[int] | int | None): 0-based pages to render. If None, all
            pages are rendered. If an int is provided, only that page is rendered.

    Returns:
        dict[int, bytes]: Image bytes keyed by 0-based page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Control output resolution
    selected_pages = _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)

    def _render(page_idx: int) -> bytes:
        """Rasterize one page and encode it in the requested format."""
        pixmap = get_pixmap(page=doc[page_idx], matrix=scale)
        return bytes(pixmap.tobytes(output=output, jpg_quality=jpg_quality))  # pyright: ignore[reportUnknownArgumentType]

    return {page_idx: _render(page_idx) for page_idx in selected_pages}
491
+
492
+
493
def extract_text_from_pdf(doc: "Document", page_indices: Optional[PageIndexType] = None) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): 0-based pages to extract. If None, all
            pages are extracted. If an int is provided, only that page is extracted.

    Returns:
        dict[int, str]: Stripped page text keyed by 0-based page index.
    """
    selected_pages = _get_page_indices(
        page_indices=page_indices,
        max_doc_pages=len(doc),
        is_input_zero_based=True,
    )

    texts: dict[int, str] = {}
    for page_idx in selected_pages:
        text_page = doc[page_idx].get_textpage()  # pyright: ignore[reportUnknownMemberType]
        texts[page_idx] = text_page.extractText().strip()  # pyright: ignore[reportUnknownMemberType]
    return texts
514
+
515
+
516
@contextmanager
def open_pdf(pdf_input: "PathOrReadable | Document"):
    """Yield a pymupdf Document for ``pdf_input``, closing it only if it was opened here.

    Args:
        pdf_input (PathOrReadable | Document): The PDF file path/readable, or an
            already-open pymupdf.Document object.

    Yields:
        Document: ``pdf_input`` itself when it is already a Document (left open on exit),
        otherwise a new Document built from the input's bytes (closed on exit, even when
        the ``with`` body raises).

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` provides no stream for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # Caller-owned document: pass it through and leave closing to the caller.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    # try/finally guarantees the document opened here is released even when the
    # body of the ``with open_pdf(...)`` block raises.
    try:
        yield doc
    finally:
        doc.close()
541
+
542
+
543
+ def _get_page_indices(
544
+ page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
545
+ ) -> list[int]:
546
+ """Helper function to handle page indices for PDF conversion."""
547
+
548
+ def _to_zero_based_int(idx: int) -> int:
549
+ """Convert a 1-based index to a 0-based index if necessary."""
550
+ if is_input_zero_based:
551
+ return idx
552
+ else:
553
+ if idx < 1 or idx > max_doc_pages:
554
+ raise ValueError(f"Index {idx} is out of bounds for document with {max_doc_pages} pages (1-based).")
555
+ return idx - 1
556
+
557
+ if page_indices is None:
558
+ return list(range(max_doc_pages)) # Convert all pages
559
+ elif isinstance(page_indices, int):
560
+ # Handle single integer input for page index
561
+ return [_to_zero_based_int(page_indices)]
562
+ elif isinstance(page_indices, str):
563
+ # Handle string input for page indices
564
+ return _interpret_index_string(
565
+ index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
566
+ )
567
+ else:
568
+ # Handle iterable input for page indices
569
+ indices: set[int] = set()
570
+ for idx in page_indices:
571
+ if isinstance(idx, int):
572
+ indices.add(_to_zero_based_int(idx))
573
+ else:
574
+ start, end = idx
575
+ if isinstance(start, EllipsisType):
576
+ start = 0
577
+ else:
578
+ start = _to_zero_based_int(start)
579
+
580
+ if isinstance(end, EllipsisType):
581
+ end = max_doc_pages - 1
582
+ else:
583
+ end = _to_zero_based_int(end)
584
+
585
+ if start > end:
586
+ raise ValueError(
587
+ f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
588
+ )
589
+ indices.update(range(start, end + 1))
590
+
591
+ return sorted(indices) # Return sorted list of indices
592
+
593
+
594
def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
    """Interpret a string of comma-separated indices and ranges.

    Each comma-separated part is either a single index (``"3"``) or an inclusive range
    (``"1-3"``). A range may omit its start (``"-3"``, from the first page) or its end
    (``"3-"``, through the last page).

    Args:
        index_str: The selection string, e.g. ``"0,2-4"``.
        max_doc_pages: Number of pages in the document, used for bounds checks.
        is_input_zero_based: Whether the numbers in ``index_str`` are 0-based (True) or 1-based (False).

    Returns:
        Sorted, de-duplicated list of 0-based page indices.

    Raises:
        ValueError: If a number is not an integer or is out of bounds, a range has its
            start after its end, or a part contains more than one ``-``.
    """

    def _to_zero_based_int(idx_str: str) -> int:
        """Parse one explicit index, validate it, and return it 0-based."""
        i = int(idx_str)
        if is_input_zero_based:
            if i < 0 or i >= max_doc_pages:
                raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages.")
            return i
        if i < 1 or i > max_doc_pages:
            raise ValueError(f"Index {i} is out of bounds for document with {max_doc_pages} pages (1-based).")
        return i - 1  # Convert to zero-based index

    indices: set[int] = set()
    for raw_part in index_str.split(","):
        part = raw_part.strip()
        count_dash = part.count("-")
        if count_dash == 0:
            indices.add(_to_zero_based_int(part))
        elif count_dash == 1:
            start_text, _, end_text = part.partition("-")
            start_text = start_text.strip()
            end_text = end_text.strip()

            # Omitted bounds are filled in directly as 0-based values. Routing them
            # through _to_zero_based_int would treat them as user input, which in
            # 1-based mode rejects the default start and shifts the default end by one.
            start = _to_zero_based_int(start_text) if start_text else 0
            end = _to_zero_based_int(end_text) if end_text else max_doc_pages - 1

            if start > end:
                raise ValueError(
                    f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
                )
            indices.update(range(start, end + 1))
        else:
            raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")

    return sorted(indices)  # Return sorted list of indices, ensuring no duplicates