chatterer 0.1.22__py3-none-any.whl → 0.1.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chatterer/examples/__main__.py +75 -0
- chatterer/examples/{anything_to_markdown.py → any2md.py} +9 -9
- chatterer/examples/pdf2md.py +338 -0
- chatterer/examples/{pdf_to_text.py → pdf2txt.py} +5 -5
- chatterer/examples/{make_ppt.py → ppt.py} +5 -7
- chatterer/examples/pw.py +137 -0
- chatterer/examples/{get_code_snippets.py → snippet.py} +7 -7
- chatterer/examples/{transcription_api.py → transcribe.py} +6 -6
- chatterer/examples/{upstage_parser.py → upstage.py} +17 -17
- chatterer/examples/{webpage_to_markdown.py → web2md.py} +8 -12
- chatterer/strategies/atom_of_thoughts.py +161 -161
- chatterer/tools/convert_pdf_to_markdown.py +326 -94
- {chatterer-0.1.22.dist-info → chatterer-0.1.24.dist-info}/METADATA +6 -9
- {chatterer-0.1.22.dist-info → chatterer-0.1.24.dist-info}/RECORD +17 -16
- chatterer-0.1.24.dist-info/entry_points.txt +2 -0
- chatterer/examples/login_with_playwright.py +0 -156
- chatterer/examples/pdf_to_markdown.py +0 -77
- chatterer-0.1.22.dist-info/entry_points.txt +0 -10
- {chatterer-0.1.22.dist-info → chatterer-0.1.24.dist-info}/WHEEL +0 -0
- {chatterer-0.1.22.dist-info → chatterer-0.1.24.dist-info}/top_level.txt +0 -0
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import asyncio
|
3
4
|
import logging
|
4
5
|
import re
|
5
6
|
from contextlib import contextmanager
|
@@ -25,10 +26,11 @@ PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] |
|
|
25
26
|
class PdfToMarkdown:
|
26
27
|
"""
|
27
28
|
Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
29
|
+
|
30
|
+
This class supports both sequential and parallel processing:
|
31
|
+
- Sequential processing preserves strict page continuity using previous page context
|
32
|
+
- Parallel processing enables faster conversion for large documents by using
|
33
|
+
previous page image and text for context instead of generated markdown
|
32
34
|
"""
|
33
35
|
|
34
36
|
chatterer: Chatterer
|
@@ -40,8 +42,7 @@ class PdfToMarkdown:
|
|
40
42
|
image_jpg_quality: int = 95
|
41
43
|
"""Quality for JPEG images (if used)."""
|
42
44
|
context_tail_lines: int = 10
|
43
|
-
"""Number of lines from the end of the previous page's Markdown to use as context."""
|
44
|
-
# max_context_tokens: Optional[int] = None # This can be added later if needed
|
45
|
+
"""Number of lines from the end of the previous page's Markdown to use as context (sequential mode only)."""
|
45
46
|
|
46
47
|
def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
|
47
48
|
"""Extracts the last N lines from the given markdown text."""
|
@@ -50,94 +51,279 @@ class PdfToMarkdown:
|
|
50
51
|
lines = markdown_text.strip().splitlines()
|
51
52
|
if not lines:
|
52
53
|
return None
|
53
|
-
# Get the last N lines, or fewer if the text is shorter
|
54
54
|
tail_lines = lines[-self.context_tail_lines :]
|
55
55
|
return "\n".join(tail_lines)
|
56
56
|
|
57
|
-
def
|
57
|
+
def _format_prompt_content_sequential(
|
58
58
|
self,
|
59
59
|
page_text: str,
|
60
60
|
page_image_b64: Base64Image,
|
61
|
-
previous_markdown_context_tail: Optional[str] = None,
|
62
|
-
page_number: int = 0,
|
61
|
+
previous_markdown_context_tail: Optional[str] = None,
|
62
|
+
page_number: int = 0,
|
63
63
|
total_pages: int = 1,
|
64
64
|
) -> HumanMessage:
|
65
65
|
"""
|
66
|
-
Formats the content
|
67
|
-
Uses only the tail end of the previous page's markdown for context.
|
66
|
+
Formats the content for sequential processing using previous page's markdown context.
|
68
67
|
"""
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
**Input Data:**
|
88
|
-
[Raw Text]
|
68
|
+
instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.
|
69
|
+
|
70
|
+
**Input provided:**
|
71
|
+
1. **Raw Text**: Extracted text from the PDF page (may contain OCR errors)
|
72
|
+
2. **Page Image**: Visual rendering of the page showing actual layout
|
73
|
+
3. **Previous Context**: End portion of the previous page's generated Markdown (if available)
|
74
|
+
|
75
|
+
**Conversion Rules:**
|
76
|
+
• **Text Structure**: Use the image to understand the actual layout and fix any OCR errors in the raw text
|
77
|
+
• **Headings**: Use appropriate heading levels (# ## ### etc.) based on visual hierarchy
|
78
|
+
• **Lists**: Convert to proper Markdown lists (- or 1. 2. 3.) maintaining structure
|
79
|
+
• **Tables**: Convert to Markdown table format using | pipes |
|
80
|
+
• **Images/Diagrams**: Describe significant visual elements as: `<details><summary>Figure: Brief title</summary>Detailed description based on what you see in the image</details>`
|
81
|
+
• **Code/Formulas**: Use ``` code blocks ``` or LaTeX $$ math $$ as appropriate
|
82
|
+
• **Continuity**: If previous context shows incomplete content (mid-sentence, list, table), seamlessly continue from that point
|
83
|
+
• **NO REPETITION**: Never repeat content from the previous context - only generate new content for this page
|
84
|
+
|
85
|
+
**Raw Text:**
|
89
86
|
```
|
90
87
|
{page_text if page_text else "No text extracted from this page."}
|
91
88
|
```
|
92
|
-
|
93
|
-
(
|
89
|
+
|
90
|
+
**Page Image:** (attached)
|
94
91
|
"""
|
92
|
+
|
95
93
|
if previous_markdown_context_tail:
|
96
|
-
instruction += f"""
|
94
|
+
instruction += f"""
|
95
|
+
**Previous Page Context (DO NOT REPEAT):**
|
97
96
|
```markdown
|
98
|
-
... (
|
97
|
+
... (previous page ended with) ...
|
99
98
|
{previous_markdown_context_tail}
|
100
99
|
```
|
101
|
-
|
100
|
+
|
101
|
+
Continue seamlessly from the above context if the current page content flows from it.
|
102
|
+
"""
|
102
103
|
else:
|
103
|
-
instruction += "**
|
104
|
+
instruction += "\n**Note:** This is the first page or start of a new section."
|
104
105
|
|
105
|
-
instruction += "\n\n**Output only the Markdown content for the current page
|
106
|
+
instruction += "\n\n**Output only the Markdown content for the current page. Ensure proper formatting and NO repetition of previous content.**"
|
106
107
|
|
107
|
-
# Structure for multimodal input
|
108
108
|
return HumanMessage(content=[instruction, page_image_b64.data_uri_content])
|
109
109
|
|
110
|
+
def _format_prompt_content_parallel(
|
111
|
+
self,
|
112
|
+
page_text: str,
|
113
|
+
page_image_b64: Base64Image,
|
114
|
+
previous_page_text: Optional[str] = None,
|
115
|
+
previous_page_image_b64: Optional[Base64Image] = None,
|
116
|
+
page_number: int = 0,
|
117
|
+
total_pages: int = 1,
|
118
|
+
) -> HumanMessage:
|
119
|
+
"""
|
120
|
+
Formats the content for parallel processing using previous page's raw data.
|
121
|
+
"""
|
122
|
+
instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.
|
123
|
+
|
124
|
+
**Task**: Convert the current page to Markdown while maintaining proper continuity with the previous page.
|
125
|
+
|
126
|
+
**Current Page Data:**
|
127
|
+
- **Raw Text**: Extracted text (may have OCR errors - use image to verify)
|
128
|
+
- **Page Image**: Visual rendering showing actual layout
|
129
|
+
|
130
|
+
**Previous Page Data** (for context only):
|
131
|
+
- **Previous Raw Text**: Text from the previous page
|
132
|
+
- **Previous Page Image**: Visual of the previous page
|
133
|
+
|
134
|
+
**Conversion Instructions:**
|
135
|
+
1. **Primary Focus**: Convert the CURRENT page content accurately
|
136
|
+
2. **Continuity Check**:
|
137
|
+
- Examine if the current page continues content from the previous page (sentences, paragraphs, lists, tables)
|
138
|
+
- If yes, start your Markdown naturally continuing that content
|
139
|
+
- If no, start fresh with proper heading/structure
|
140
|
+
3. **Format Rules**:
|
141
|
+
- Use image to fix OCR errors and understand layout
|
142
|
+
- Convert headings to # ## ### based on visual hierarchy
|
143
|
+
- Convert lists to proper Markdown (- or 1. 2. 3.)
|
144
|
+
- Convert tables to | pipe | format
|
145
|
+
- Describe significant images/charts as: `<details><summary>Figure: Title</summary>Description</details>`
|
146
|
+
- Use ``` for code blocks and $$ for math formulas
|
147
|
+
|
148
|
+
**Current Page Raw Text:**
|
149
|
+
```
|
150
|
+
{page_text if page_text else "No text extracted from this page."}
|
151
|
+
```
|
152
|
+
|
153
|
+
**Current Page Image:** (see first attached image)
|
154
|
+
"""
|
155
|
+
|
156
|
+
content = [instruction, page_image_b64.data_uri_content]
|
157
|
+
|
158
|
+
if previous_page_text is not None and previous_page_image_b64 is not None:
|
159
|
+
instruction += f"""
|
160
|
+
|
161
|
+
**Previous Page Raw Text (for context):**
|
162
|
+
```
|
163
|
+
{previous_page_text if previous_page_text else "No text from previous page."}
|
164
|
+
```
|
165
|
+
|
166
|
+
**Previous Page Image:** (see second attached image)
|
167
|
+
"""
|
168
|
+
content.append(previous_page_image_b64.data_uri_content)
|
169
|
+
else:
|
170
|
+
instruction += "\n**Note:** This is the first page - no previous context available."
|
171
|
+
|
172
|
+
instruction += "\n\n**Generate ONLY the Markdown for the current page. Ensure proper continuity and formatting.**"
|
173
|
+
content[0] = instruction
|
174
|
+
|
175
|
+
return HumanMessage(content=content)
|
176
|
+
|
110
177
|
def convert(
|
111
178
|
self,
|
112
179
|
pdf_input: "Document | PathOrReadable",
|
113
180
|
page_indices: Optional[PageIndexType] = None,
|
114
181
|
progress_callback: Optional[Callable[[int, int], None]] = None,
|
182
|
+
mode: Literal["sequential", "parallel"] = "sequential",
|
183
|
+
) -> str:
|
184
|
+
"""
|
185
|
+
Converts a PDF document to Markdown synchronously.
|
186
|
+
|
187
|
+
Args:
|
188
|
+
pdf_input: Path to PDF file or pymupdf.Document object
|
189
|
+
page_indices: Specific page indices to convert (0-based). If None, converts all pages
|
190
|
+
progress_callback: Optional callback function called with (current_page, total_pages)
|
191
|
+
mode: "sequential" for strict continuity or "parallel" for independent page processing
|
192
|
+
|
193
|
+
Returns:
|
194
|
+
Concatenated Markdown string for all processed pages
|
195
|
+
"""
|
196
|
+
if mode == "sequential":
|
197
|
+
return self._convert_sequential(pdf_input, page_indices, progress_callback)
|
198
|
+
else:
|
199
|
+
return self._convert_parallel_sync(pdf_input, page_indices, progress_callback)
|
200
|
+
|
201
|
+
async def aconvert(
|
202
|
+
self,
|
203
|
+
pdf_input: "Document | PathOrReadable",
|
204
|
+
page_indices: Optional[PageIndexType] = None,
|
205
|
+
progress_callback: Optional[Callable[[int, int], None]] = None,
|
206
|
+
max_concurrent: int = 5,
|
115
207
|
) -> str:
|
116
208
|
"""
|
117
|
-
Converts a PDF document
|
209
|
+
Converts a PDF document to Markdown asynchronously with parallel processing.
|
210
|
+
|
118
211
|
Args:
|
119
|
-
pdf_input: Path to
|
120
|
-
page_indices: Specific
|
121
|
-
|
122
|
-
|
123
|
-
|
212
|
+
pdf_input: Path to PDF file or pymupdf.Document object
|
213
|
+
page_indices: Specific page indices to convert (0-based). If None, converts all pages
|
214
|
+
progress_callback: Optional callback function called with (current_page, total_pages)
|
215
|
+
max_concurrent: Maximum number of concurrent LLM requests
|
216
|
+
|
124
217
|
Returns:
|
125
|
-
|
218
|
+
Concatenated Markdown string for all processed pages
|
126
219
|
"""
|
127
220
|
with open_pdf(pdf_input) as doc:
|
128
|
-
target_page_indices = list(
|
129
|
-
|
221
|
+
target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
|
222
|
+
total_pages_to_process = len(target_page_indices)
|
223
|
+
|
224
|
+
if total_pages_to_process == 0:
|
225
|
+
logger.warning("No pages selected for processing.")
|
226
|
+
return ""
|
227
|
+
|
228
|
+
logger.info(f"Starting parallel Markdown conversion for {total_pages_to_process} pages...")
|
229
|
+
|
230
|
+
# Pre-process all pages
|
231
|
+
page_text_dict = extract_text_from_pdf(doc, target_page_indices)
|
232
|
+
page_image_dict = render_pdf_as_image(
|
233
|
+
doc,
|
234
|
+
page_indices=target_page_indices,
|
235
|
+
zoom=self.image_zoom,
|
236
|
+
output=self.image_format,
|
237
|
+
jpg_quality=self.image_jpg_quality,
|
130
238
|
)
|
239
|
+
|
240
|
+
# Process pages in parallel with semaphore for concurrency control
|
241
|
+
semaphore = asyncio.Semaphore(max_concurrent)
|
242
|
+
|
243
|
+
async def process_page(i: int, page_idx: int) -> tuple[int, str]:
|
244
|
+
async with semaphore:
|
245
|
+
logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
|
246
|
+
|
247
|
+
try:
|
248
|
+
# Get previous page data for context
|
249
|
+
prev_page_idx = target_page_indices[i - 1] if i > 0 else None
|
250
|
+
previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
|
251
|
+
previous_page_image_b64 = None
|
252
|
+
if prev_page_idx is not None:
|
253
|
+
previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
|
254
|
+
|
255
|
+
message = self._format_prompt_content_parallel(
|
256
|
+
page_text=page_text_dict.get(page_idx, ""),
|
257
|
+
page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
|
258
|
+
previous_page_text=previous_page_text,
|
259
|
+
previous_page_image_b64=previous_page_image_b64,
|
260
|
+
page_number=page_idx,
|
261
|
+
total_pages=len(doc),
|
262
|
+
)
|
263
|
+
|
264
|
+
response = await self.chatterer.agenerate([message])
|
265
|
+
|
266
|
+
# Extract markdown
|
267
|
+
markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
|
268
|
+
if markdowns:
|
269
|
+
current_page_markdown = "\n".join(markdowns)
|
270
|
+
else:
|
271
|
+
current_page_markdown = response.strip()
|
272
|
+
if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
|
273
|
+
current_page_markdown = current_page_markdown[3:-3].strip()
|
274
|
+
|
275
|
+
logger.debug(f"Completed processing page {i + 1}/{total_pages_to_process}")
|
276
|
+
|
277
|
+
# Call progress callback if provided
|
278
|
+
if progress_callback:
|
279
|
+
try:
|
280
|
+
progress_callback(i + 1, total_pages_to_process)
|
281
|
+
except Exception as cb_err:
|
282
|
+
logger.warning(f"Progress callback failed: {cb_err}")
|
283
|
+
|
284
|
+
return (i, current_page_markdown)
|
285
|
+
|
286
|
+
except Exception as e:
|
287
|
+
logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
|
288
|
+
return (i, f"<!-- Error processing page {page_idx + 1}: {str(e)} -->")
|
289
|
+
|
290
|
+
# Execute all page processing tasks
|
291
|
+
|
292
|
+
tasks = [process_page(i, page_idx) for i, page_idx in enumerate(target_page_indices)]
|
293
|
+
results = await asyncio.gather(*tasks, return_exceptions=True)
|
294
|
+
|
295
|
+
# Sort results by original page order and extract markdown
|
296
|
+
markdown_results = [""] * total_pages_to_process
|
297
|
+
for result in results:
|
298
|
+
if isinstance(result, Exception):
|
299
|
+
logger.error(f"Task failed with exception: {result}")
|
300
|
+
continue
|
301
|
+
if isinstance(result, tuple) and len(result) == 2:
|
302
|
+
page_order, markdown = result
|
303
|
+
markdown_results[page_order] = markdown
|
304
|
+
else:
|
305
|
+
logger.error(f"Unexpected result format: {result}")
|
306
|
+
|
307
|
+
return "\n\n".join(markdown_results).strip()
|
308
|
+
|
309
|
+
def _convert_sequential(
|
310
|
+
self,
|
311
|
+
pdf_input: "Document | PathOrReadable",
|
312
|
+
page_indices: Optional[PageIndexType] = None,
|
313
|
+
progress_callback: Optional[Callable[[int, int], None]] = None,
|
314
|
+
) -> str:
|
315
|
+
"""Sequential conversion maintaining strict page continuity."""
|
316
|
+
with open_pdf(pdf_input) as doc:
|
317
|
+
target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
|
131
318
|
total_pages_to_process = len(target_page_indices)
|
132
319
|
if total_pages_to_process == 0:
|
133
320
|
logger.warning("No pages selected for processing.")
|
134
321
|
return ""
|
135
322
|
|
136
323
|
full_markdown_output: List[str] = []
|
137
|
-
|
138
|
-
previous_page_markdown: Optional[str] = None # Store the full markdown of the previous page
|
324
|
+
previous_page_markdown: Optional[str] = None
|
139
325
|
|
140
|
-
# Pre-process all pages
|
326
|
+
# Pre-process all pages
|
141
327
|
logger.info("Extracting text and rendering images for selected pages...")
|
142
328
|
page_text_dict = extract_text_from_pdf(doc, target_page_indices)
|
143
329
|
page_image_dict = render_pdf_as_image(
|
@@ -147,46 +333,33 @@ class PdfToMarkdown:
|
|
147
333
|
output=self.image_format,
|
148
334
|
jpg_quality=self.image_jpg_quality,
|
149
335
|
)
|
150
|
-
logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")
|
336
|
+
logger.info(f"Starting sequential Markdown conversion for {total_pages_to_process} pages...")
|
151
337
|
|
152
|
-
page_idx
|
153
|
-
|
154
|
-
while True:
|
155
|
-
logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
|
338
|
+
for i, page_idx in enumerate(target_page_indices):
|
339
|
+
logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
|
156
340
|
try:
|
157
|
-
# --- Get Context Tail ---
|
158
341
|
context_tail = self._get_context_tail(previous_page_markdown)
|
159
342
|
|
160
|
-
message = self.
|
161
|
-
page_text=page_text_dict.get(page_idx, ""),
|
343
|
+
message = self._format_prompt_content_sequential(
|
344
|
+
page_text=page_text_dict.get(page_idx, ""),
|
162
345
|
page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
|
163
|
-
previous_markdown_context_tail=context_tail,
|
346
|
+
previous_markdown_context_tail=context_tail,
|
164
347
|
page_number=page_idx,
|
165
348
|
total_pages=len(doc),
|
166
349
|
)
|
167
|
-
logger.debug(f"Sending request to LLM for page index {page_idx}...")
|
168
350
|
|
169
|
-
response = self.chatterer([message])
|
170
|
-
|
171
|
-
|
351
|
+
response = self.chatterer.generate([message])
|
352
|
+
|
353
|
+
# Extract markdown
|
354
|
+
markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
|
172
355
|
if markdowns:
|
173
356
|
current_page_markdown = "\n".join(markdowns)
|
174
357
|
else:
|
175
|
-
# Fallback: assume the whole response is markdown if no ```markdown blocks found
|
176
358
|
current_page_markdown = response.strip()
|
177
359
|
if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
|
178
|
-
# Basic cleanup if it just missed the 'markdown' language tag
|
179
360
|
current_page_markdown = current_page_markdown[3:-3].strip()
|
180
|
-
elif "```" in current_page_markdown:
|
181
|
-
logger.warning(
|
182
|
-
f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
|
183
|
-
)
|
184
361
|
|
185
|
-
logger.debug(f"Received response from LLM for page index {page_idx}.")
|
186
|
-
|
187
|
-
# --- Store result and update context ---
|
188
362
|
full_markdown_output.append(current_page_markdown)
|
189
|
-
# Update the *full* previous markdown for the *next* iteration's tail calculation
|
190
363
|
previous_page_markdown = current_page_markdown
|
191
364
|
|
192
365
|
except Exception as e:
|
@@ -196,18 +369,85 @@ class PdfToMarkdown:
|
|
196
369
|
# Progress callback
|
197
370
|
if progress_callback:
|
198
371
|
try:
|
199
|
-
progress_callback(i, total_pages_to_process)
|
372
|
+
progress_callback(i + 1, total_pages_to_process)
|
200
373
|
except Exception as cb_err:
|
201
374
|
logger.warning(f"Progress callback failed: {cb_err}")
|
202
375
|
|
203
|
-
|
204
|
-
|
376
|
+
return "\n\n".join(full_markdown_output).strip()
|
377
|
+
|
378
|
+
def _convert_parallel_sync(
|
379
|
+
self,
|
380
|
+
pdf_input: "Document | PathOrReadable",
|
381
|
+
page_indices: Optional[PageIndexType] = None,
|
382
|
+
progress_callback: Optional[Callable[[int, int], None]] = None,
|
383
|
+
) -> str:
|
384
|
+
"""Synchronous parallel-style conversion (processes independently but sequentially)."""
|
385
|
+
with open_pdf(pdf_input) as doc:
|
386
|
+
target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
|
387
|
+
total_pages_to_process = len(target_page_indices)
|
388
|
+
if total_pages_to_process == 0:
|
389
|
+
logger.warning("No pages selected for processing.")
|
390
|
+
return ""
|
391
|
+
|
392
|
+
logger.info(f"Starting parallel-style Markdown conversion for {total_pages_to_process} pages...")
|
393
|
+
|
394
|
+
# Pre-process all pages
|
395
|
+
page_text_dict = extract_text_from_pdf(doc, target_page_indices)
|
396
|
+
page_image_dict = render_pdf_as_image(
|
397
|
+
doc,
|
398
|
+
page_indices=target_page_indices,
|
399
|
+
zoom=self.image_zoom,
|
400
|
+
output=self.image_format,
|
401
|
+
jpg_quality=self.image_jpg_quality,
|
402
|
+
)
|
403
|
+
|
404
|
+
full_markdown_output: List[str] = []
|
405
|
+
|
406
|
+
for i, page_idx in enumerate(target_page_indices):
|
407
|
+
logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
|
205
408
|
|
206
|
-
|
207
|
-
|
409
|
+
try:
|
410
|
+
# Get previous page data for context
|
411
|
+
prev_page_idx = target_page_indices[i - 1] if i > 0 else None
|
412
|
+
previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
|
413
|
+
previous_page_image_b64 = None
|
414
|
+
if prev_page_idx is not None:
|
415
|
+
previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
|
416
|
+
|
417
|
+
message = self._format_prompt_content_parallel(
|
418
|
+
page_text=page_text_dict.get(page_idx, ""),
|
419
|
+
page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
|
420
|
+
previous_page_text=previous_page_text,
|
421
|
+
previous_page_image_b64=previous_page_image_b64,
|
422
|
+
page_number=page_idx,
|
423
|
+
total_pages=len(doc),
|
424
|
+
)
|
208
425
|
|
209
|
-
|
210
|
-
|
426
|
+
response = self.chatterer.generate([message])
|
427
|
+
|
428
|
+
# Extract markdown
|
429
|
+
markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
|
430
|
+
if markdowns:
|
431
|
+
current_page_markdown = "\n".join(markdowns)
|
432
|
+
else:
|
433
|
+
current_page_markdown = response.strip()
|
434
|
+
if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
|
435
|
+
current_page_markdown = current_page_markdown[3:-3].strip()
|
436
|
+
|
437
|
+
full_markdown_output.append(current_page_markdown)
|
438
|
+
|
439
|
+
except Exception as e:
|
440
|
+
logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
|
441
|
+
continue
|
442
|
+
|
443
|
+
# Progress callback
|
444
|
+
if progress_callback:
|
445
|
+
try:
|
446
|
+
progress_callback(i + 1, total_pages_to_process)
|
447
|
+
except Exception as cb_err:
|
448
|
+
logger.warning(f"Progress callback failed: {cb_err}")
|
449
|
+
|
450
|
+
return "\n\n".join(full_markdown_output).strip()
|
211
451
|
|
212
452
|
|
213
453
|
def render_pdf_as_image(
|
@@ -297,9 +537,7 @@ def open_pdf(pdf_input: PathOrReadable | Document):
|
|
297
537
|
doc.close()
|
298
538
|
|
299
539
|
|
300
|
-
def _get_page_indices(
|
301
|
-
page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
|
302
|
-
) -> list[int]:
|
540
|
+
def _get_page_indices(page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
|
303
541
|
"""Helper function to handle page indices for PDF conversion."""
|
304
542
|
|
305
543
|
def _to_zero_based_int(idx: int) -> int:
|
@@ -318,9 +556,7 @@ def _get_page_indices(
|
|
318
556
|
return [_to_zero_based_int(page_indices)]
|
319
557
|
elif isinstance(page_indices, str):
|
320
558
|
# Handle string input for page indices
|
321
|
-
return _interpret_index_string(
|
322
|
-
index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
|
323
|
-
)
|
559
|
+
return _interpret_index_string(index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based)
|
324
560
|
else:
|
325
561
|
# Handle iterable input for page indices
|
326
562
|
indices: set[int] = set()
|
@@ -340,9 +576,7 @@ def _get_page_indices(
|
|
340
576
|
end = _to_zero_based_int(end)
|
341
577
|
|
342
578
|
if start > end:
|
343
|
-
raise ValueError(
|
344
|
-
f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
|
345
|
-
)
|
579
|
+
raise ValueError(f"Invalid range: {start} - {end}. Start index must be less than or equal to end index.")
|
346
580
|
indices.update(range(start, end + 1))
|
347
581
|
|
348
582
|
return sorted(indices) # Return sorted list of indices
|
@@ -383,9 +617,7 @@ def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_ba
|
|
383
617
|
end = _to_zero_based_int(end)
|
384
618
|
|
385
619
|
if start > end:
|
386
|
-
raise ValueError(
|
387
|
-
f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
|
388
|
-
)
|
620
|
+
raise ValueError(f"Invalid range: {start} - {end}. Start index must be less than or equal to end index.")
|
389
621
|
indices.update(range(start, end + 1))
|
390
622
|
else:
|
391
623
|
raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: chatterer
|
3
|
-
Version: 0.1.22
|
3
|
+
Version: 0.1.24
|
4
4
|
Summary: The highest-level interface for various LLM APIs.
|
5
5
|
Requires-Python: >=3.12
|
6
6
|
Description-Content-Type: text/markdown
|
@@ -11,10 +11,9 @@ Requires-Dist: pillow>=11.1.0
|
|
11
11
|
Requires-Dist: regex>=2024.11.6
|
12
12
|
Requires-Dist: rich>=13.9.4
|
13
13
|
Requires-Dist: colorama>=0.4.6
|
14
|
-
Requires-Dist: spargear>=0.2.
|
14
|
+
Requires-Dist: spargear>=0.2.7
|
15
15
|
Provides-Extra: dev
|
16
|
-
Requires-Dist:
|
17
|
-
Requires-Dist: ipykernel>=6.29.5; extra == "dev"
|
16
|
+
Requires-Dist: pyright>=1.1.401; extra == "dev"
|
18
17
|
Provides-Extra: conversion
|
19
18
|
Requires-Dist: youtube-transcript-api>=1.0.3; extra == "conversion"
|
20
19
|
Requires-Dist: chatterer[browser]; extra == "conversion"
|
@@ -34,12 +33,10 @@ Requires-Dist: mistune>=3.1.3; extra == "markdown"
|
|
34
33
|
Provides-Extra: video
|
35
34
|
Requires-Dist: pydub>=0.25.1; extra == "video"
|
36
35
|
Provides-Extra: langchain
|
37
|
-
Requires-Dist:
|
36
|
+
Requires-Dist: langchain-anthropic>=0.3.10; extra == "langchain"
|
37
|
+
Requires-Dist: langchain-google-genai>=2.1.1; extra == "langchain"
|
38
|
+
Requires-Dist: langchain-ollama>=0.3.0; extra == "langchain"
|
38
39
|
Requires-Dist: langchain-experimental>=0.3.4; extra == "langchain"
|
39
|
-
Provides-Extra: langchain-providers
|
40
|
-
Requires-Dist: langchain-anthropic>=0.3.10; extra == "langchain-providers"
|
41
|
-
Requires-Dist: langchain-google-genai>=2.1.1; extra == "langchain-providers"
|
42
|
-
Requires-Dist: langchain-ollama>=0.3.0; extra == "langchain-providers"
|
43
40
|
Provides-Extra: all
|
44
41
|
Requires-Dist: chatterer[dev]; extra == "all"
|
45
42
|
Requires-Dist: chatterer[langchain]; extra == "all"
|
@@ -6,21 +6,22 @@ chatterer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
6
|
chatterer/common_types/__init__.py,sha256=jfS6m5UANSvGjzQ_nzYDpryn5uZqNb06-4xCsQ2C_lw,376
|
7
7
|
chatterer/common_types/io.py,sha256=fetiyi1suZ3NF2mj5k5KDLJLGKS1n4J-5UmH7JN36g8,817
|
8
8
|
chatterer/examples/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
chatterer/examples/
|
10
|
-
chatterer/examples/
|
11
|
-
chatterer/examples/
|
12
|
-
chatterer/examples/
|
13
|
-
chatterer/examples/
|
14
|
-
chatterer/examples/
|
15
|
-
chatterer/examples/
|
16
|
-
chatterer/examples/
|
17
|
-
chatterer/examples/
|
9
|
+
chatterer/examples/__main__.py,sha256=W-Zo7z9RyA0PrY-tPDqf9BSkOqPpaIXROcHCXCwNXc4,1566
|
10
|
+
chatterer/examples/any2md.py,sha256=4AtdlwP1jxSsvh31yWmB5HP2Wmof4Fh0W_F3im2yJ_M,2739
|
11
|
+
chatterer/examples/pdf2md.py,sha256=viru-9vrUdiYMXRpQVpbYiZy6bjkkF-HTXSvy37ICUY,13625
|
12
|
+
chatterer/examples/pdf2txt.py,sha256=ULfA2cr-lrfLVqpMlSa08qo5AXVXiyL8N2-KiD0Orhc,1602
|
13
|
+
chatterer/examples/ppt.py,sha256=7AhS2hZtmMHOJQt1j5DQDDgrMwM-GX1HjPrmKDV2Bgs,23253
|
14
|
+
chatterer/examples/pw.py,sha256=FGmCQg5XFyVAczBF8mQcobJcvITKEOtbrXm4pyKvbAw,5138
|
15
|
+
chatterer/examples/snippet.py,sha256=JvR_xBV8skePCtIczz73EdjmiHzj_A-5HzS53j0bLI4,1973
|
16
|
+
chatterer/examples/transcribe.py,sha256=fBFuo442VEM7NbF9xN3ub3nAnYnQojrseN_kI049fsM,3894
|
17
|
+
chatterer/examples/upstage.py,sha256=lK2OOY6U4GGnDBbPHKaqwlh_0Vu-0RMb0M01M8dngRs,3219
|
18
|
+
chatterer/examples/web2md.py,sha256=zfemaE3KwfU8LHvWzJHX-knASpikBUUNzv6jTmfac1E,2740
|
18
19
|
chatterer/strategies/__init__.py,sha256=SdOggbmHpw4f7Njwy-T8q64e91OLOUp1k0a0ozZd4qI,221
|
19
|
-
chatterer/strategies/atom_of_thoughts.py,sha256=
|
20
|
+
chatterer/strategies/atom_of_thoughts.py,sha256=30XvnVKjty8Geo2z_n2-RWL_eEvo_AnK8sg8uVPQHOQ,41178
|
20
21
|
chatterer/strategies/base.py,sha256=b2gMPqodp97OP1dkHfj0UqixjdjVhmTw_V5qJ7i2S6g,427
|
21
22
|
chatterer/tools/__init__.py,sha256=m3PRK9H5vOhk-2gG9W2eg8CYBlEn-K9-eaulOu91bgo,1474
|
22
23
|
chatterer/tools/caption_markdown_images.py,sha256=r4QajHYuL4mdyYQXP1vQcNmqKN8lxBf5y0VKELXILOI,15392
|
23
|
-
chatterer/tools/convert_pdf_to_markdown.py,sha256=
|
24
|
+
chatterer/tools/convert_pdf_to_markdown.py,sha256=_a-nVNs_9j4QsDPKI5p6AZeasgOW3x_2rb49-yfBSPs,28501
|
24
25
|
chatterer/tools/convert_to_text.py,sha256=WHQ0Xj4Ri_jYbFjzTx3mjmvJ9U8bAv4wGaKEVC88Nlk,15457
|
25
26
|
chatterer/tools/upstage_document_parser.py,sha256=CXslVYAHDK8EV8jtUAUWzf8rxU4qilSnW8_dhAxHOE8,33142
|
26
27
|
chatterer/tools/webpage_to_markdown.py,sha256=ADH4sqM6iquJR7HU6umMQ5qO7EvcbNutuchXDpAcxAo,31961
|
@@ -37,8 +38,8 @@ chatterer/utils/base64_image.py,sha256=m_qAT3ERBiq8D-H4H9Z7rLfL31_BiPmV_m4uQ5XRL
|
|
37
38
|
chatterer/utils/bytesio.py,sha256=3MC2atOOFKo5YxuReo_y_t8Wem9p2Y1ahC5M2lGclwI,2618
|
38
39
|
chatterer/utils/code_agent.py,sha256=7ka_WRI4TQmZ5H46mjY3hI6RO_pxw6pg3LAxjgW4AbM,10495
|
39
40
|
chatterer/utils/imghdr.py,sha256=6JhJMXD4MZ0dQolT2VM87YrRYm3hPf3RTEWnP4lYRVc,3842
|
40
|
-
chatterer-0.1.
|
41
|
-
chatterer-0.1.
|
42
|
-
chatterer-0.1.
|
43
|
-
chatterer-0.1.
|
44
|
-
chatterer-0.1.
|
41
|
+
chatterer-0.1.24.dist-info/METADATA,sha256=mpTNGDkwWEK-9XdP52DGaVKQphtJ_p6Wmibq-eiq07g,11633
|
42
|
+
chatterer-0.1.24.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
43
|
+
chatterer-0.1.24.dist-info/entry_points.txt,sha256=IzGKhTnZ7G5V23SRmulmSsyt9HcaFH4lU4r3wR1zMsc,63
|
44
|
+
chatterer-0.1.24.dist-info/top_level.txt,sha256=7nSQKP0bHxPRc7HyzdbKsJdkvPgYD0214o6slRizv9s,10
|
45
|
+
chatterer-0.1.24.dist-info/RECORD,,
|