chatterer 0.1.16__py3-none-any.whl → 0.1.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/__init__.py +93 -93
- chatterer/common_types/__init__.py +21 -21
- chatterer/common_types/io.py +19 -19
- chatterer/interactive.py +354 -692
- chatterer/language_model.py +533 -533
- chatterer/messages.py +21 -21
- chatterer/strategies/__init__.py +13 -13
- chatterer/strategies/atom_of_thoughts.py +975 -975
- chatterer/strategies/base.py +14 -14
- chatterer/tools/__init__.py +46 -46
- chatterer/tools/caption_markdown_images.py +384 -384
- chatterer/tools/citation_chunking/__init__.py +3 -3
- chatterer/tools/citation_chunking/chunks.py +53 -53
- chatterer/tools/citation_chunking/citation_chunker.py +118 -118
- chatterer/tools/citation_chunking/citations.py +285 -285
- chatterer/tools/citation_chunking/prompt.py +157 -157
- chatterer/tools/citation_chunking/reference.py +26 -26
- chatterer/tools/citation_chunking/utils.py +138 -138
- chatterer/tools/convert_pdf_to_markdown.py +302 -302
- chatterer/tools/convert_to_text.py +447 -447
- chatterer/tools/upstage_document_parser.py +705 -705
- chatterer/tools/webpage_to_markdown.py +739 -739
- chatterer/tools/youtube.py +146 -146
- chatterer/utils/__init__.py +15 -15
- chatterer/utils/base64_image.py +285 -285
- chatterer/utils/bytesio.py +59 -59
- chatterer/utils/code_agent.py +237 -237
- chatterer/utils/imghdr.py +148 -148
- {chatterer-0.1.16.dist-info → chatterer-0.1.17.dist-info}/METADATA +392 -392
- chatterer-0.1.17.dist-info/RECORD +33 -0
- {chatterer-0.1.16.dist-info → chatterer-0.1.17.dist-info}/WHEEL +1 -1
- chatterer-0.1.16.dist-info/RECORD +0 -33
- {chatterer-0.1.16.dist-info → chatterer-0.1.17.dist-info}/top_level.txt +0 -0
@@ -1,302 +1,302 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
|
3
|
-
import logging
|
4
|
-
import re
|
5
|
-
from contextlib import contextmanager
|
6
|
-
from dataclasses import dataclass
|
7
|
-
from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
|
8
|
-
|
9
|
-
from ..language_model import Chatterer, HumanMessage
|
10
|
-
from ..utils.base64_image import Base64Image
|
11
|
-
from ..utils.bytesio import PathOrReadable, read_bytes_stream
|
12
|
-
|
13
|
-
if TYPE_CHECKING:
|
14
|
-
from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
|
15
|
-
|
16
|
-
# Setup basic logging
# NOTE(review): basicConfig() runs at import time and configures the process-wide
# root logger (it does nothing if the root logger already has handlers) --
# confirm that this is intended for a library module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Captures the body of each triple-backtick fenced block. The optional
# non-capturing group swallows a leading "markdown" language tag, DOTALL lets
# "." span newlines, and the lazy ".*?" stops at the first closing fence.
MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
|
20
|
-
|
21
|
-
|
22
|
-
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    Processes PDFs page by page, providing the LLM with both the extracted raw
    text and a rendered image of the page to handle complex layouts. It maintains
    context between pages by feeding the *tail end* of the previously generated
    Markdown back into the prompt for the next page to ensure smooth transitions.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', or 'jpg')."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""
    # max_context_tokens: Optional[int] = None  # This can be added later if needed

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of ``markdown_text``.

        Returns None when there is no text, when the text is whitespace only,
        or when ``context_tail_lines`` is not positive.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # Slicing with a negative start yields fewer lines if the text is shorter.
        return "\n".join(lines[-self.context_tail_lines :])

    def _build_instruction(
        self,
        page_text: str,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> str:
        """Build the text part of the prompt for one page.

        Args:
            page_text: Raw text extracted from the page; a placeholder sentence is used when empty.
            previous_markdown_context_tail: Tail of the previously generated Markdown, if any.
            page_number: 0-based page index (shown to the model as ``page_number + 1``).
            total_pages: Page count shown to the model.

        Returns:
            The complete instruction string.
        """
        raw_text = page_text or "No text extracted from this page."
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
    * Examine the [End of Previous Page Markdown] if provided.
    * If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
    * For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
    * **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
    * If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{raw_text}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # Must be an f-string: without the prefix the model receives the
            # literal text "{page_number + 1}" instead of the page number.
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
        return instruction

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,  # For context, 0-indexed
        total_pages: int = 1,
    ) -> HumanMessage:
        """
        Formats the content list for the HumanMessage input to the LLM.
        Uses only the tail end of the previous page's markdown for context.

        The message content is a two-item list: the instruction text followed by
        the page image's ``data_uri_content``.
        """
        instruction = self._build_instruction(
            page_text=page_text,
            previous_markdown_context_tail=previous_markdown_context_tail,
            page_number=page_number,
            total_pages=total_pages,
        )
        # Structure for multimodal input
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    @staticmethod
    def _extract_markdown(response: str, page_idx: int) -> str:
        """Pull the Markdown payload out of a raw LLM response.

        Fenced blocks matched by ``MARKDOWN_PATTERN`` are stripped and joined
        with newlines. If none match, the stripped response is used as-is
        (minus a bare leading/trailing fence), with a warning when stray
        fences remain.
        """
        fenced = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
        if fenced:
            return "\n".join(fenced)

        # Fallback: assume the whole response is markdown if no fenced blocks were found
        text = response.strip()
        if text.startswith("```") and text.endswith("```"):
            return text[3:-3].strip()
        if "```" in text:
            logger.warning(
                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
            )
        return text

    def convert(
        self,
        pdf_input: Union[str, "Document"],
        page_indices: Optional[Union[Iterable[int], int]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.

        A page whose conversion raises is logged and skipped; the remaining
        pages are still processed.

        Args:
            pdf_input: Path to the PDF file or a pymupdf.Document object.
            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
                          Can be a single int or an iterable of ints.
            progress_callback: An optional function called with
                          (pages_handled_so_far, total_pages_to_process) after each page,
                          whether or not that page converted successfully. The first
                          argument is a 1-based counter, not a page index. Exceptions
                          raised by the callback are logged and ignored.
        Returns:
            A single string containing the Markdown of the successfully processed
            pages, joined by blank lines. Empty if no pages were selected.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(_get_page_indices(page_indices, len(doc)))
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # Full Markdown of the most recent successfully converted page.
            previous_page_markdown: Optional[str] = None

            # Pre-process all pages up front
            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            total_doc_pages = len(doc)
            # A for-loop always advances to the next page, so a failing page
            # cannot be retried forever.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=self._get_context_tail(previous_page_markdown),
                        page_number=page_idx,
                        total_pages=total_doc_pages,
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")
                    response = self.chatterer([message])
                    current_page_markdown = self._extract_markdown(response, page_idx)
                    logger.debug(f"Received response from LLM for page index {page_idx}.")
                except Exception as e:
                    # Best effort: skip this page and keep the previous context.
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                else:
                    full_markdown_output.append(current_page_markdown)
                    previous_page_markdown = current_page_markdown

                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            # Join with double newline, potentially adjust based on how well continuations work
            return "\n\n".join(full_markdown_output).strip()
|
207
|
-
|
208
|
-
|
209
|
-
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document whose pages are rendered.
        zoom (float): Scale factor applied to both axes of the page. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): Page selection, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, bytes]: Encoded image bytes keyed by page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Same factor horizontally and vertically
    rendered: dict[int, bytes] = {}
    for idx in _get_page_indices(page_indices, len(doc)):
        pixmap = get_pixmap(page=doc[idx], matrix=scale)
        encoded = pixmap.tobytes(output=output, jpg_quality=jpg_quality)  # pyright: ignore[reportUnknownArgumentType]
        rendered[idx] = bytes(encoded)
    return rendered
|
244
|
-
|
245
|
-
|
246
|
-
def extract_text_from_pdf(
    doc: "Document",
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Each selected page is read through its text page, and the extracted text
    is stripped of leading and trailing whitespace.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): Page selection, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, str]: Stripped page text keyed by page index.
    """
    texts: dict[int, str] = {}
    for idx in _get_page_indices(page_indices, len(doc)):
        text_page = doc[idx].get_textpage()  # pyright: ignore[reportUnknownMemberType]
        texts[idx] = text_page.extractText().strip()  # pyright: ignore[reportUnknownMemberType]
    return texts
|
266
|
-
|
267
|
-
|
268
|
-
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Yield a PDF document built from a path or stream, or pass through an existing Document.

    A ``pymupdf.Document`` supplied by the caller is yielded as-is and is left
    open on exit. Any other input is read through ``read_bytes_stream`` and a
    new document is created from the bytes; that document is closed on exit,
    including when the body of the ``with`` block raises.

    Args:
        pdf_input (PathOrReadable | Document): The PDF file path or readable source, or a pymupdf.Document object.

    Yields:
        Document: The document to work with.

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` provides no stream for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # The caller owns this document, so it is never closed here.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    # try/finally so the internally opened document is released even if the
    # caller's with-body raises (the exception is re-raised at the yield).
    try:
        yield doc
    finally:
        doc.close()
|
293
|
-
|
294
|
-
|
295
|
-
def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
    """Resolve a page selection into concrete 0-based page indices.

    Args:
        page_indices: None for every page, a single index, or an iterable of indices.
        max_doc_pages: Number of pages in the document.

    Returns:
        ``range(max_doc_pages)`` when ``page_indices`` is None; otherwise a list of
        the requested indices that satisfy ``0 <= i < max_doc_pages``, in the order
        given. Out-of-range indices (including negative ones) are dropped, for a
        single int exactly as for an iterable.
    """
    if page_indices is None:
        return range(max_doc_pages)
    if isinstance(page_indices, int):
        # Route a lone index through the same bounds filter as an iterable.
        page_indices = [page_indices]
    return [i for i in page_indices if 0 <= i < max_doc_pages]
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import logging
|
4
|
+
import re
|
5
|
+
from contextlib import contextmanager
|
6
|
+
from dataclasses import dataclass
|
7
|
+
from typing import TYPE_CHECKING, Callable, Iterable, List, Literal, Optional, Union
|
8
|
+
|
9
|
+
from ..language_model import Chatterer, HumanMessage
|
10
|
+
from ..utils.base64_image import Base64Image
|
11
|
+
from ..utils.bytesio import PathOrReadable, read_bytes_stream
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from pymupdf import Document # pyright: ignore[reportMissingTypeStubs]
|
15
|
+
|
16
|
+
# Setup basic logging
# NOTE(review): basicConfig() runs at import time and configures the process-wide
# root logger (it does nothing if the root logger already has handlers) --
# confirm that this is intended for a library module.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# Captures the body of each triple-backtick fenced block. The optional
# non-capturing group swallows a leading "markdown" language tag, DOTALL lets
# "." span newlines, and the lazy ".*?" stops at the first closing fence.
MARKDOWN_PATTERN: re.Pattern[str] = re.compile(r"```(?:markdown\s*\n)?(.*?)```", re.DOTALL)
|
20
|
+
|
21
|
+
|
22
|
+
@dataclass
class PdfToMarkdown:
    """
    Converts PDF documents to Markdown using a multimodal LLM (Chatterer).

    Processes PDFs page by page, providing the LLM with both the extracted raw
    text and a rendered image of the page to handle complex layouts. It maintains
    context between pages by feeding the *tail end* of the previously generated
    Markdown back into the prompt for the next page to ensure smooth transitions.
    """

    chatterer: Chatterer
    """An instance of the Chatterer class configured with a vision-capable model."""
    image_zoom: float = 2.0
    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
    image_format: Literal["jpg", "jpeg", "png"] = "png"
    """The format for the rendered image ('png', 'jpeg', or 'jpg')."""
    image_jpg_quality: int = 95
    """Quality for JPEG images (if used)."""
    context_tail_lines: int = 10
    """Number of lines from the end of the previous page's Markdown to use as context."""
    # max_context_tokens: Optional[int] = None  # This can be added later if needed

    def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
        """Return the last ``context_tail_lines`` lines of ``markdown_text``.

        Returns None when there is no text, when the text is whitespace only,
        or when ``context_tail_lines`` is not positive.
        """
        if not markdown_text or self.context_tail_lines <= 0:
            return None
        lines = markdown_text.strip().splitlines()
        if not lines:
            return None
        # Slicing with a negative start yields fewer lines if the text is shorter.
        return "\n".join(lines[-self.context_tail_lines :])

    def _build_instruction(
        self,
        page_text: str,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,
        total_pages: int = 1,
    ) -> str:
        """Build the text part of the prompt for one page.

        Args:
            page_text: Raw text extracted from the page; a placeholder sentence is used when empty.
            previous_markdown_context_tail: Tail of the previously generated Markdown, if any.
            page_number: 0-based page index (shown to the model as ``page_number + 1``).
            total_pages: Page count shown to the model.

        Returns:
            The complete instruction string.
        """
        raw_text = page_text or "No text extracted from this page."
        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
1. The raw text extracted from the page ([Raw Text]).
2. A rendered image of the page ([Rendered Image]) showing its visual layout.
3. (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.

**Conversion Requirements:**
* **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
* **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
* **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
* **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
* **Continuity (Crucial):**
    * Examine the [End of Previous Page Markdown] if provided.
    * If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
    * For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
    * **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
    * If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.

**Input Data:**
[Raw Text]
```
{raw_text}
```
[Rendered Image]
(See attached image)
"""
        if previous_markdown_context_tail:
            instruction += f"""[End of Previous Page Markdown]
```markdown
... (content from previous page ends with) ...
{previous_markdown_context_tail}
```
**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
        else:
            # Must be an f-string: without the prefix the model receives the
            # literal text "{page_number + 1}" instead of the page number.
            instruction += f"**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."

        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
        return instruction

    def _format_prompt_content(
        self,
        page_text: str,
        page_image_b64: Base64Image,
        previous_markdown_context_tail: Optional[str] = None,
        page_number: int = 0,  # For context, 0-indexed
        total_pages: int = 1,
    ) -> HumanMessage:
        """
        Formats the content list for the HumanMessage input to the LLM.
        Uses only the tail end of the previous page's markdown for context.

        The message content is a two-item list: the instruction text followed by
        the page image's ``data_uri_content``.
        """
        instruction = self._build_instruction(
            page_text=page_text,
            previous_markdown_context_tail=previous_markdown_context_tail,
            page_number=page_number,
            total_pages=total_pages,
        )
        # Structure for multimodal input
        return HumanMessage(content=[instruction, page_image_b64.data_uri_content])

    @staticmethod
    def _extract_markdown(response: str, page_idx: int) -> str:
        """Pull the Markdown payload out of a raw LLM response.

        Fenced blocks matched by ``MARKDOWN_PATTERN`` are stripped and joined
        with newlines. If none match, the stripped response is used as-is
        (minus a bare leading/trailing fence), with a warning when stray
        fences remain.
        """
        fenced = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
        if fenced:
            return "\n".join(fenced)

        # Fallback: assume the whole response is markdown if no fenced blocks were found
        text = response.strip()
        if text.startswith("```") and text.endswith("```"):
            return text[3:-3].strip()
        if "```" in text:
            logger.warning(
                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
            )
        return text

    def convert(
        self,
        pdf_input: Union[str, "Document"],
        page_indices: Optional[Union[Iterable[int], int]] = None,
        progress_callback: Optional[Callable[[int, int], None]] = None,
    ) -> str:
        """
        Converts a PDF document (or specific pages) to Markdown synchronously.

        A page whose conversion raises is logged and skipped; the remaining
        pages are still processed.

        Args:
            pdf_input: Path to the PDF file or a pymupdf.Document object.
            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
                          Can be a single int or an iterable of ints.
            progress_callback: An optional function called with
                          (pages_handled_so_far, total_pages_to_process) after each page,
                          whether or not that page converted successfully. The first
                          argument is a 1-based counter, not a page index. Exceptions
                          raised by the callback are logged and ignored.
        Returns:
            A single string containing the Markdown of the successfully processed
            pages, joined by blank lines. Empty if no pages were selected.
        """
        with open_pdf(pdf_input) as doc:
            target_page_indices = list(_get_page_indices(page_indices, len(doc)))
            total_pages_to_process = len(target_page_indices)
            if total_pages_to_process == 0:
                logger.warning("No pages selected for processing.")
                return ""

            full_markdown_output: List[str] = []
            # Full Markdown of the most recent successfully converted page.
            previous_page_markdown: Optional[str] = None

            # Pre-process all pages up front
            logger.info("Extracting text and rendering images for selected pages...")
            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
            page_image_dict = render_pdf_as_image(
                doc,
                page_indices=target_page_indices,
                zoom=self.image_zoom,
                output=self.image_format,
                jpg_quality=self.image_jpg_quality,
            )
            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")

            total_doc_pages = len(doc)
            # A for-loop always advances to the next page, so a failing page
            # cannot be retried forever.
            for i, page_idx in enumerate(target_page_indices, start=1):
                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
                try:
                    message = self._format_prompt_content(
                        page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
                        previous_markdown_context_tail=self._get_context_tail(previous_page_markdown),
                        page_number=page_idx,
                        total_pages=total_doc_pages,
                    )
                    logger.debug(f"Sending request to LLM for page index {page_idx}...")
                    response = self.chatterer([message])
                    current_page_markdown = self._extract_markdown(response, page_idx)
                    logger.debug(f"Received response from LLM for page index {page_idx}.")
                except Exception as e:
                    # Best effort: skip this page and keep the previous context.
                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
                else:
                    full_markdown_output.append(current_page_markdown)
                    previous_page_markdown = current_page_markdown

                if progress_callback:
                    try:
                        progress_callback(i, total_pages_to_process)
                    except Exception as cb_err:
                        logger.warning(f"Progress callback failed: {cb_err}")

            # Join with double newline, potentially adjust based on how well continuations work
            return "\n\n".join(full_markdown_output).strip()
|
207
|
+
|
208
|
+
|
209
|
+
def render_pdf_as_image(
    doc: "Document",
    zoom: float = 2.0,
    output: Literal["png", "pnm", "pgm", "ppm", "pbm", "pam", "tga", "tpic", "psd", "ps", "jpg", "jpeg"] = "png",
    jpg_quality: int = 100,
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, bytes]:
    """
    Render selected PDF pages to encoded image bytes.

    Args:
        doc (Document): The PDF document whose pages are rendered.
        zoom (float): Scale factor applied to both axes of the page. Default is 2.0.
        output (str): Image format passed to the pixmap encoder. Default is 'png'.
        jpg_quality (int): JPEG quality passed to the pixmap encoder. Default is 100.
        page_indices (Iterable[int] | int | None): Page selection, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, bytes]: Encoded image bytes keyed by page index.
    """
    from pymupdf import Matrix  # pyright: ignore[reportMissingTypeStubs]
    from pymupdf.utils import get_pixmap  # pyright: ignore[reportMissingTypeStubs, reportUnknownVariableType]

    scale = Matrix(zoom, zoom)  # Same factor horizontally and vertically
    rendered: dict[int, bytes] = {}
    for idx in _get_page_indices(page_indices, len(doc)):
        pixmap = get_pixmap(page=doc[idx], matrix=scale)
        encoded = pixmap.tobytes(output=output, jpg_quality=jpg_quality)  # pyright: ignore[reportUnknownArgumentType]
        rendered[idx] = bytes(encoded)
    return rendered
|
244
|
+
|
245
|
+
|
246
|
+
def extract_text_from_pdf(
    doc: "Document",
    page_indices: Iterable[int] | int | None = None,
) -> dict[int, str]:
    """Extract the plain text of selected PDF pages.

    Each selected page is read through its text page, and the extracted text
    is stripped of leading and trailing whitespace.

    Args:
        doc (Document): The PDF document to read.
        page_indices (Iterable[int] | int | None): Page selection, resolved by
            ``_get_page_indices``. None selects every page; an int selects one page.

    Returns:
        dict[int, str]: Stripped page text keyed by page index.
    """
    texts: dict[int, str] = {}
    for idx in _get_page_indices(page_indices, len(doc)):
        text_page = doc[idx].get_textpage()  # pyright: ignore[reportUnknownMemberType]
        texts[idx] = text_page.extractText().strip()  # pyright: ignore[reportUnknownMemberType]
    return texts
|
266
|
+
|
267
|
+
|
268
|
+
@contextmanager
def open_pdf(pdf_input: PathOrReadable | Document):
    """Yield a PDF document built from a path or stream, or pass through an existing Document.

    A ``pymupdf.Document`` supplied by the caller is yielded as-is and is left
    open on exit. Any other input is read through ``read_bytes_stream`` and a
    new document is created from the bytes; that document is closed on exit,
    including when the body of the ``with`` block raises.

    Args:
        pdf_input (PathOrReadable | Document): The PDF file path or readable source, or a pymupdf.Document object.

    Yields:
        Document: The document to work with.

    Raises:
        FileNotFoundError: If ``read_bytes_stream`` provides no stream for ``pdf_input``.
    """
    import pymupdf  # pyright: ignore[reportMissingTypeStubs]

    if isinstance(pdf_input, pymupdf.Document):
        # The caller owns this document, so it is never closed here.
        yield pdf_input
        return

    with read_bytes_stream(pdf_input) as stream:
        if stream is None:
            raise FileNotFoundError(pdf_input)
        doc = pymupdf.Document(stream=stream.read())

    # try/finally so the internally opened document is released even if the
    # caller's with-body raises (the exception is re-raised at the yield).
    try:
        yield doc
    finally:
        doc.close()
|
293
|
+
|
294
|
+
|
295
|
+
def _get_page_indices(page_indices: Iterable[int] | int | None, max_doc_pages: int) -> Iterable[int]:
    """Resolve a page selection into concrete 0-based page indices.

    Args:
        page_indices: None for every page, a single index, or an iterable of indices.
        max_doc_pages: Number of pages in the document.

    Returns:
        ``range(max_doc_pages)`` when ``page_indices`` is None; otherwise a list of
        the requested indices that satisfy ``0 <= i < max_doc_pages``, in the order
        given. Out-of-range indices (including negative ones) are dropped, for a
        single int exactly as for an iterable.
    """
    if page_indices is None:
        return range(max_doc_pages)
    if isinstance(page_indices, int):
        # Route a lone index through the same bounds filter as an iterable.
        page_indices = [page_indices]
    return [i for i in page_indices if 0 <= i < max_doc_pages]
|