PyPI - chatterer - Versions diffs - 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl - Mend

chatterer 0.1.21py3-none-any.whl → 0.1.23py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

chatterer/examples/pdf_to_markdown.py CHANGED Viewed

@@ -1,76 +1,337 @@
+#!/usr/bin/env python3
+"""
+PDF to Markdown Converter CLI
+A command-line tool for converting PDF documents to Markdown using multimodal LLMs.
+Supports both sequential and parallel processing modes with async capabilities.
+"""
+import asyncio
 import logging
 import sys
+import time
 from pathlib import Path
-from typing import Optional
+from typing import List, Literal, Optional, TypedDict
 from spargear import ArgumentSpec, BaseArguments
-from chatterer import Chatterer, PdfToMarkdown
+from chatterer import Chatterer
+from chatterer.tools.convert_pdf_to_markdown import PdfToMarkdown
+class ConversionResult(TypedDict, total=False):
+    """Per-PDF outcome record produced by the conversion runners.
+    Declared with ``total=False`` so every key is optional: records for successful
+    conversions carry ``processing_time`` and ``characters`` but no ``error``, while
+    records for failed conversions carry ``error`` and empty ``output``/``result``.
+    The runners count a record as successful when the ``error`` key is absent.
+    """
+    # POSIX-style path of the source PDF.
+    input: str
+    # POSIX-style path of the written Markdown file; empty string on failure.
+    output: str
+    # Generated Markdown text; empty string on failure.
+    result: str
+    # Seconds spent converting and writing this PDF (set on success only).
+    processing_time: float
+    # Length of the generated Markdown in characters (set on success only).
+    characters: int
+    # Stringified exception (set on failure only).
+    error: str
+# Setup enhanced logging
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%H:%M:%S")
 logger = logging.getLogger(__name__)
 class PdfToMarkdownArgs(BaseArguments):
+    """Command-line arguments for PDF to Markdown conversion."""
     input: str
     """Input PDF file or directory containing PDF files to convert to markdown."""
     output: Optional[str] = None
     """Output path. For a file, path to the output markdown file. For a directory, output directory for .md files."""
-    """Chatterer instance for communication."""
     page: Optional[str] = None
-    """Zero-based page indices to convert (e.g., '0,2,4-8')."""
+    """Zero-based page indices to convert (e.g., '0,2,4-8'). If None, converts all pages."""
     recursive: bool = False
     """If input is a directory, search for PDFs recursively."""
+    mode: Literal["sequential", "parallel"] = "parallel"
+    """Processing mode: 'sequential' for strict continuity, 'parallel' for faster processing."""
+    sync: bool = False
+    """Enable synchronous processing for sequential mode. If set to True, will run in sync mode."""
+    max_concurrent: int = 10
+    """Maximum number of concurrent LLM requests when using async mode."""
+    image_zoom: float = 2.0
+    """Zoom factor for rendering PDF pages as images (higher zoom = higher resolution)."""
+    image_format: Literal["png", "jpg", "jpeg"] = "png"
+    """Image format for PDF page rendering."""
+    image_quality: int = 95
+    """JPEG quality when using jpg/jpeg format (1-100)."""
+    context_tail_lines: int = 10
+    """Number of lines from previous page's markdown to use as context (sequential mode only)."""
+    verbose: bool = False
+    """Enable verbose logging output."""
     chatterer: ArgumentSpec[Chatterer] = ArgumentSpec(
         ["--chatterer"],
         default_factory=lambda: Chatterer.from_provider("google:gemini-2.5-flash-preview-05-20"),
-        help="Chatterer instance for communication.",
+        help="Chatterer instance configuration (e.g., 'google:gemini-2.5-flash-preview-05-20').",
         type=Chatterer.from_provider,
     )
-    def run(self) -> list[dict[str, str]]:
-        input = Path(self.input).resolve()
-        pdf_files: list[Path] = []
+    def __post_init__(self) -> None:
+        """Normalise the parsed arguments before any conversion starts.
+        Raises the root logger to DEBUG when ``verbose`` is set, forces ``mode`` to
+        ``"parallel"`` when sequential mode is requested without ``sync``, clamps
+        ``max_concurrent`` to at least 1, and warns (without changing the value) when
+        ``max_concurrent`` exceeds 10.
+        """
+        if self.verbose:
+            logging.getLogger().setLevel(logging.DEBUG)
+        # Sequential mode is only served by the synchronous runner.
+        sequential_without_sync = self.mode == "sequential" and not self.sync
+        if sequential_without_sync:
+            logger.warning("Async mode is only available with parallel mode. Switching to parallel mode.")
+            self.mode = "parallel"
+        if self.max_concurrent > 10:
+            logger.warning("max_concurrent > 10 may cause rate limiting. Consider reducing.")
+            return
+        if self.max_concurrent < 1:
+            logger.warning("max_concurrent must be >= 1. Setting to 1.")
+            self.max_concurrent = 1
+    def run(self) -> List[ConversionResult]:
+        """Run the conversion and return one result record per PDF.
+        Uses the blocking runner when ``sync`` is set; otherwise drives the coroutine
+        runner to completion with ``asyncio.run``.
+        """
+        if self.sync:
+            return self._run_sync()
+        return asyncio.run(self._run_async())
+    def _run_sync(self) -> List[ConversionResult]:
+        """Convert every selected PDF one after another on the calling thread.
+        Each PDF is converted with ``PdfToMarkdown.convert`` in ``self.mode``, written to
+        its output path as UTF-8, and recorded. A failure on one PDF is logged and
+        recorded with an ``error`` key; the remaining files are still processed.
+        Returns:
+            One ``ConversionResult`` per input PDF, in processing order.
+        """
+        pdf_files, output_base, is_dir = self._prepare_files()
+        converter = PdfToMarkdown(
+            chatterer=self.chatterer.unwrap(),
+            image_zoom=self.image_zoom,
+            image_format=self.image_format,
+            image_jpg_quality=self.image_quality,
+            context_tail_lines=self.context_tail_lines,
+        )
+        # The callback captures nothing per-file, so it is defined once rather than per iteration.
+        def progress_callback(current: int, total: int) -> None:
+            progress = (current / total) * 100
+            logger.info(f"  └─ Progress: {current}/{total} pages ({progress:.1f}%)")
+        results: List[ConversionResult] = []
+        # perf_counter is monotonic, so durations cannot go negative if the wall clock is adjusted.
+        total_start_time = time.perf_counter()
+        logger.info(f"🚀 Starting {self.mode} conversion of {len(pdf_files)} PDF(s)...")
+        for i, pdf in enumerate(pdf_files, 1):
+            output_path = (output_base / f"{pdf.stem}.md") if is_dir else output_base
+            logger.info(f"📄 Processing {i}/{len(pdf_files)}: {pdf.name}")
+            start_time = time.perf_counter()
+            try:
+                markdown = converter.convert(
+                    pdf_input=str(pdf),
+                    page_indices=self.page,
+                    mode=self.mode,
+                    progress_callback=progress_callback,
+                )
+                # Save result
+                output_path.parent.mkdir(parents=True, exist_ok=True)
+                output_path.write_text(markdown, encoding="utf-8")
+                elapsed = time.perf_counter() - start_time
+                chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0
+                logger.info(f"  ✅ Completed in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
+                logger.info(f"  📝 Generated {len(markdown):,} characters → {output_path}")
+                results.append({
+                    "input": pdf.as_posix(),
+                    "output": output_path.as_posix(),
+                    "result": markdown,
+                    "processing_time": elapsed,
+                    "characters": len(markdown),
+                })
+            except Exception as e:
+                # Best-effort batch: record the failure and move on. The traceback is
+                # only attached in verbose mode to keep normal output compact.
+                logger.error(f"  ❌ Failed to process {pdf.name}: {e}", exc_info=self.verbose)
+                results.append({
+                    "input": pdf.as_posix(),
+                    "output": "",
+                    "result": "",
+                    "error": str(e),
+                })
+        total_elapsed = time.perf_counter() - total_start_time
+        total_chars = sum(len(r.get("result", "")) for r in results)
+        successful_conversions = sum(1 for r in results if "error" not in r)
+        # Guard both ratios so the summary cannot raise ZeroDivisionError after the
+        # work is already done (empty file list, or a run too short for the clock).
+        success_rate = successful_conversions / len(pdf_files) * 100 if pdf_files else 0.0
+        average_speed = total_chars / total_elapsed if total_elapsed > 0 else 0
+        logger.info("🎉 Conversion complete!")
+        logger.info(f"  📊 Total time: {total_elapsed:.1f}s")
+        logger.info(f"  📈 Success rate: {successful_conversions}/{len(pdf_files)} ({success_rate:.1f}%)")
+        logger.info(f"  📝 Total output: {total_chars:,} characters")
+        logger.info(f"  ⚡ Average speed: {average_speed:.0f} chars/s")
+        return results
+    async def _run_async(self) -> List[ConversionResult]:
+        """Convert all selected PDFs concurrently and return one record per PDF.
+        ``max_concurrent`` is treated as a single budget of in-flight LLM requests. It is
+        split between the number of PDFs processed at once and the page-level concurrency
+        passed to ``PdfToMarkdown.aconvert``, so their product never exceeds the budget.
+        A failure on one PDF is logged and recorded with an ``error`` key without
+        cancelling the others.
+        Returns:
+            One ``ConversionResult`` per input PDF, in the same order as the inputs.
+        """
+        pdf_files, output_base, is_dir = self._prepare_files()
+        converter = PdfToMarkdown(
+            chatterer=self.chatterer.unwrap(),
+            image_zoom=self.image_zoom,
+            image_format=self.image_format,
+            image_jpg_quality=self.image_quality,
+            context_tail_lines=self.context_tail_lines,
+        )
+        # perf_counter is monotonic, so durations cannot go negative if the wall clock is adjusted.
+        total_start_time = time.perf_counter()
+        logger.info(f"🚀 Starting ASYNC parallel conversion of {len(pdf_files)} PDF(s)...")
+        logger.info(f"⚡ Max concurrent: {self.max_concurrent} LLM requests")
+        # Split the request budget: `pdf_slots` PDFs run at once, each allowed
+        # `per_pdf_concurrency` page requests, so pdf_slots * per_pdf_concurrency <= budget.
+        # Passing the full budget at both levels would permit budget**2 requests.
+        budget = max(1, self.max_concurrent)
+        pdf_slots = max(1, min(budget, len(pdf_files)))
+        per_pdf_concurrency = max(1, budget // pdf_slots)
+        semaphore = asyncio.Semaphore(pdf_slots)
+        async def process_pdf(pdf: Path, index: int) -> ConversionResult:
+            async with semaphore:
+                output_path = (output_base / f"{pdf.stem}.md") if is_dir else output_base
+                logger.info(f"📄 Processing {index}/{len(pdf_files)}: {pdf.name}")
+                start_time = time.perf_counter()
+                # Progress callback for individual PDF (names the file because output interleaves)
+                def progress_callback(current: int, total: int) -> None:
+                    progress = (current / total) * 100
+                    logger.info(f"  └─ {pdf.name}: {current}/{total} pages ({progress:.1f}%)")
+                try:
+                    markdown = await converter.aconvert(
+                        pdf_input=str(pdf),
+                        page_indices=self.page,
+                        progress_callback=progress_callback,
+                        max_concurrent=per_pdf_concurrency,
+                    )
+                    # Save result
+                    output_path.parent.mkdir(parents=True, exist_ok=True)
+                    output_path.write_text(markdown, encoding="utf-8")
+                    elapsed = time.perf_counter() - start_time
+                    chars_per_sec = len(markdown) / elapsed if elapsed > 0 else 0
+                    logger.info(f"  ✅ {pdf.name} completed in {elapsed:.1f}s ({chars_per_sec:.0f} chars/s)")
+                    logger.info(f"  📝 Generated {len(markdown):,} characters → {output_path}")
+                    return {
+                        "input": pdf.as_posix(),
+                        "output": output_path.as_posix(),
+                        "result": markdown,
+                        "processing_time": elapsed,
+                        "characters": len(markdown),
+                    }
+                except Exception as e:
+                    # Best-effort batch: record the failure and let the other PDFs finish.
+                    logger.error(f"  ❌ Failed to process {pdf.name}: {e}", exc_info=self.verbose)
+                    return {
+                        "input": pdf.as_posix(),
+                        "output": "",
+                        "result": "",
+                        "error": str(e),
+                    }
+        # Execute all PDF processing tasks; gather returns results in input order.
+        raw_results = await asyncio.gather(
+            *(process_pdf(pdf, i) for i, pdf in enumerate(pdf_files, 1)),
+            return_exceptions=True,
+        )
+        final_results: List[ConversionResult] = []
+        for pdf, result in zip(pdf_files, raw_results):
+            # process_pdf already handles Exception, so anything surfacing here is a
+            # BaseException such as CancelledError; it must not be treated as a record.
+            if isinstance(result, BaseException):
+                logger.error(f"Task failed with exception: {result}")
+                final_results.append({
+                    "input": pdf.as_posix(),
+                    "output": "",
+                    "result": "",
+                    "error": str(result) or type(result).__name__,
+                })
+            else:
+                final_results.append(result)
+        total_elapsed = time.perf_counter() - total_start_time
+        total_chars = sum(len(r.get("result", "")) for r in final_results)
+        successful_conversions = sum(1 for r in final_results if "error" not in r)
+        # Guard both ratios so the summary cannot raise ZeroDivisionError after the work is done.
+        success_rate = successful_conversions / len(pdf_files) * 100 if pdf_files else 0.0
+        average_speed = total_chars / total_elapsed if total_elapsed > 0 else 0
+        logger.info("🎉 ASYNC conversion complete!")
+        logger.info(f"  📊 Total time: {total_elapsed:.1f}s")
+        logger.info(f"  📈 Success rate: {successful_conversions}/{len(pdf_files)} ({success_rate:.1f}%)")
+        logger.info(f"  📝 Total output: {total_chars:,} characters")
+        logger.info(f"  ⚡ Average speed: {average_speed:.0f} chars/s")
+        # No speed-up figure is logged: no sequential baseline is measured to compare against.
+        return final_results
+    def _prepare_files(self) -> tuple[List[Path], Path, bool]:
+        """Prepare input and output file paths."""
+        input_path = Path(self.input).resolve()
+        pdf_files: List[Path] = []
         is_dir = False
-        if input.is_file():
-            if input.suffix.lower() != ".pdf":
+        # Determine input files
+        if input_path.is_file():
+            if input_path.suffix.lower() != ".pdf":
+                logger.error(f"❌ Input file must be a PDF: {input_path}")
                 sys.exit(1)
-            pdf_files.append(input)
-        elif input.is_dir():
+            pdf_files.append(input_path)
+        elif input_path.is_dir():
             is_dir = True
-            pattern = "*.pdf"
-            pdf_files = sorted([
-                f for f in (input.rglob(pattern) if self.recursive else input.glob(pattern)) if f.is_file()
-            ])
+            pattern = "**/*.pdf" if self.recursive else "*.pdf"
+            pdf_files = sorted([f for f in input_path.glob(pattern) if f.is_file()])
             if not pdf_files:
+                logger.warning(f"⚠️  No PDF files found in {input_path}")
                 sys.exit(0)
         else:
+            logger.error(f"❌ Input path does not exist: {input_path}")
             sys.exit(1)
+        # Determine output path
         if self.output:
-            out_base = Path(self.output).resolve()
+            output_base = Path(self.output).resolve()
         elif is_dir:
-            out_base = input
+            output_base = input_path
         else:
-            out_base = input.with_suffix(".md")
+            output_base = input_path.with_suffix(".md")
+        # Create output directories
         if is_dir:
-            out_base.mkdir(parents=True, exist_ok=True)
+            output_base.mkdir(parents=True, exist_ok=True)
         else:
-            out_base.parent.mkdir(parents=True, exist_ok=True)
-        converter = PdfToMarkdown(chatterer=self.chatterer.unwrap())
-        results: list[dict[str, str]] = []
-        for pdf in pdf_files:
-            output: Path = (out_base / (pdf.stem + ".md")) if is_dir else out_base
-            md: str = converter.convert(pdf_input=str(pdf), page_indices=self.page)
-            output.parent.mkdir(parents=True, exist_ok=True)
-            output.write_text(md, encoding="utf-8")
-            results.append({"input": pdf.as_posix(), "output": output.as_posix(), "result": md})
-        logger.info(f"Converted {len(pdf_files)} PDF(s) to markdown and saved to `{out_base}`.")
-        return results
+            output_base.parent.mkdir(parents=True, exist_ok=True)
+        logger.info(f"📂 Input: {input_path}")
+        logger.info(f"📁 Output: {output_base}")
+        logger.info(f"📄 Found {len(pdf_files)} PDF file(s)")
+        return pdf_files, output_base, is_dir
 def main() -> None:
-    PdfToMarkdownArgs().run()
+    """Main entry point for the CLI application."""
+    args = None
+    try:
+        args = PdfToMarkdownArgs()
+        args.run()
+    except KeyboardInterrupt:
+        logger.info("🛑 Conversion interrupted by user")
+        sys.exit(130)
+    except Exception as e:
+        logger.error(f"❌ Unexpected error: {e}")
+        if args and hasattr(args, "verbose") and args.verbose:
+            import traceback
+            traceback.print_exc()
+        sys.exit(1)
 if __name__ == "__main__":

chatterer/language_model.py CHANGED Viewed

@@ -66,12 +66,15 @@ class Chatterer(BaseModel):
     @classmethod
     def from_provider(
-        cls, provider_and_model: str, structured_output_kwargs: Optional[dict[str, object]] = {"strict": True}
+        cls,
+        provider_and_model: str,
+        structured_output_kwargs: Optional[dict[str, object]] = {"strict": True},
+        **kwargs: object,
     ) -> Self:
         backend, model = provider_and_model.split(":", 1)
         backends = cls.get_backends()
         if func := backends.get(backend):
-            return func(model, structured_output_kwargs)
+            return func(model, structured_output_kwargs, **kwargs)
         else:
             raise ValueError(f"Unsupported provider: {backend}. Supported providers are: {', '.join(backends.keys())}.")

chatterer/tools/convert_pdf_to_markdown.py CHANGED Viewed

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import asyncio
 import logging
 import re
 from contextlib import contextmanager
@@ -25,10 +26,11 @@ PageIndexType = Iterable[int | tuple[int | EllipsisType, int | EllipsisType]] |
 class PdfToMarkdown:
     """
     Converts PDF documents to Markdown using a multimodal LLM (Chatterer).
-    Processes PDFs page by page, providing the LLM with both the extracted raw
-    text and a rendered image of the page to handle complex layouts. It maintains
-    context between pages by feeding the *tail end* of the previously generated
-    Markdown back into the prompt for the next page to ensure smooth transitions.
+    This class supports both sequential and parallel processing:
+    - Sequential processing preserves strict page continuity using previous page context
+    - Parallel processing enables faster conversion for large documents by using
+      previous page image and text for context instead of generated markdown
     """
     chatterer: Chatterer
@@ -40,8 +42,7 @@ class PdfToMarkdown:
     image_jpg_quality: int = 95
     """Quality for JPEG images (if used)."""
     context_tail_lines: int = 10
-    """Number of lines from the end of the previous page's Markdown to use as context."""
-    # max_context_tokens: Optional[int] = None # This can be added later if needed
+    """Number of lines from the end of the previous page's Markdown to use as context (sequential mode only)."""
     def _get_context_tail(self, markdown_text: Optional[str]) -> Optional[str]:
         """Extracts the last N lines from the given markdown text."""
@@ -50,94 +51,279 @@ class PdfToMarkdown:
         lines = markdown_text.strip().splitlines()
         if not lines:
             return None
-        # Get the last N lines, or fewer if the text is shorter
         tail_lines = lines[-self.context_tail_lines :]
         return "\n".join(tail_lines)
-    def _format_prompt_content(
+    def _format_prompt_content_sequential(
         self,
         page_text: str,
         page_image_b64: Base64Image,
-        previous_markdown_context_tail: Optional[str] = None,  # Renamed for clarity
-        page_number: int = 0,  # For context, 0-indexed
+        previous_markdown_context_tail: Optional[str] = None,
+        page_number: int = 0,
         total_pages: int = 1,
     ) -> HumanMessage:
         """
-        Formats the content list for the HumanMessage input to the LLM.
-        Uses only the tail end of the previous page's markdown for context.
+        Formats the content for sequential processing using previous page's markdown context.
         """
-        # Construct the main instruction prompt
-        instruction = f"""You are an expert PDF to Markdown converter. Your task is to convert the content of the provided PDF page (Page {page_number + 1} of {total_pages}) into accurate and well-formatted Markdown. You are given:
-1.  The raw text extracted from the page ([Raw Text]).
-2.  A rendered image of the page ([Rendered Image]) showing its visual layout.
-3.  (Optional) The *ending portion* of the Markdown generated from the previous page ([End of Previous Page Markdown]) for context continuity.
-**Conversion Requirements:**
-*   **Text:** Reconstruct paragraphs, headings, lists, etc., naturally based on the visual layout. Correct OCR/formatting issues from [Raw Text] using the image. Minimize unnecessary whitespace.
-*   **Tables:** Convert tables accurately into Markdown table format (`| ... |`). Use image for text if [Raw Text] is garbled.
-*   **Images/Diagrams:** Describe significant visual elements (charts, graphs) within `<details>` tags. Example: `<details><summary>Figure 1: Description</summary>Detailed textual description from the image.</details>`. Ignore simple decorative images. Do **not** use `![alt](...)`.
-*   **Layout:** Respect columns, code blocks (``` ```), footnotes, etc., using standard Markdown.
-*   **Continuity (Crucial):**
-    *   Examine the [End of Previous Page Markdown] if provided.
-    *   If the current page's content *continues* a sentence, paragraph, list, or code block from the previous page, ensure your generated Markdown for *this page* starts seamlessly from that continuation point.
-    *   For example, if the previous page ended mid-sentence, the Markdown for *this page* should begin with the rest of that sentence.
-    *   **Do NOT repeat the content already present in [End of Previous Page Markdown] in your output.**
-    *   If the current page starts a new section (e.g., with a heading), begin the Markdown output fresh, ignoring the previous context tail unless necessary for list numbering, etc.
-**Input Data:**
-[Raw Text]
+        instruction = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.
+**Input provided:**
+1. **Raw Text**: Extracted text from the PDF page (may contain OCR errors)
+2. **Page Image**: Visual rendering of the page showing actual layout
+3. **Previous Context**: End portion of the previous page's generated Markdown (if available)
+**Conversion Rules:**
+• **Text Structure**: Use the image to understand the actual layout and fix any OCR errors in the raw text
+• **Headings**: Use appropriate heading levels (# ## ### etc.) based on visual hierarchy
+• **Lists**: Convert to proper Markdown lists (- or 1. 2. 3.) maintaining structure
+• **Tables**: Convert to Markdown table format using | pipes |
+• **Images/Diagrams**: Describe significant visual elements as: `<details><summary>Figure: Brief title</summary>Detailed description based on what you see in the image</details>`
+• **Code/Formulas**: Use ``` code blocks ``` or LaTeX $$ math $$ as appropriate
+• **Continuity**: If previous context shows incomplete content (mid-sentence, list, table), seamlessly continue from that point
+• **NO REPETITION**: Never repeat content from the previous context - only generate new content for this page
+**Raw Text:**
 ```
 {page_text if page_text else "No text extracted from this page."}
 ```
-[Rendered Image]
-(See attached image)
+**Page Image:** (attached)
 """
         if previous_markdown_context_tail:
-            instruction += f"""[End of Previous Page Markdown]
+            instruction += f"""
+**Previous Page Context (DO NOT REPEAT):**
 ```markdown
-... (content from previous page ends with) ...
+... (previous page ended with) ...
 {previous_markdown_context_tail}
 ```
-**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}), ensuring it correctly continues from or follows the [End of Previous Page Markdown]. Start the output *only* with the content belonging to the current page."""
+Continue seamlessly from the above context if the current page content flows from it.
+"""
         else:
-            instruction += "**Task:** Generate the Markdown for the *current* page (Page {page_number + 1}). This is the first page being processed in this batch."
+            instruction += "\n**Note:** This is the first page or start of a new section."
-        instruction += "\n\n**Output only the Markdown content for the current page.** Ensure your output starts correctly based on the continuity rules."
+        instruction += "\n\n**Output only the Markdown content for the current page. Ensure proper formatting and NO repetition of previous content.**"
-        # Structure for multimodal input
         return HumanMessage(content=[instruction, page_image_b64.data_uri_content])
+    def _format_prompt_content_parallel(
+        self,
+        page_text: str,
+        page_image_b64: Base64Image,
+        previous_page_text: Optional[str] = None,
+        previous_page_image_b64: Optional[Base64Image] = None,
+        page_number: int = 0,
+        total_pages: int = 1,
+    ) -> HumanMessage:
+        """
+        Build the multimodal prompt for one page in parallel mode.
+        The message content is a list: the instruction text first, then the current
+        page image, then the previous page image when previous-page context is given.
+        Previous-page context is used only when BOTH ``previous_page_text`` and
+        ``previous_page_image_b64`` are not None; otherwise the prompt states that this
+        is the first page. ``page_number`` is zero-based and shown to the model as
+        ``page_number + 1``.
+        """
+        task_section = f"""You are an expert PDF to Markdown converter. Convert Page {page_number + 1} of {total_pages} into accurate, well-formatted Markdown.
+**Task**: Convert the current page to Markdown while maintaining proper continuity with the previous page.
+**Current Page Data:**
+- **Raw Text**: Extracted text (may have OCR errors - use image to verify)
+- **Page Image**: Visual rendering showing actual layout
+**Previous Page Data** (for context only):
+- **Previous Raw Text**: Text from the previous page
+- **Previous Page Image**: Visual of the previous page
+**Conversion Instructions:**
+1. **Primary Focus**: Convert the CURRENT page content accurately
+2. **Continuity Check**:
+   - Examine if the current page continues content from the previous page (sentences, paragraphs, lists, tables)
+   - If yes, start your Markdown naturally continuing that content
+   - If no, start fresh with proper heading/structure
+3. **Format Rules**:
+   - Use image to fix OCR errors and understand layout
+   - Convert headings to # ## ### based on visual hierarchy
+   - Convert lists to proper Markdown (- or 1. 2. 3.)
+   - Convert tables to | pipe | format
+   - Describe significant images/charts as: `<details><summary>Figure: Title</summary>Description</details>`
+   - Use ``` for code blocks and $$ for math formulas
+**Current Page Raw Text:**
+```
+{page_text if page_text else "No text extracted from this page."}
+```
+**Current Page Image:** (see first attached image)
+"""
+        # Images are attached in the order the prompt text refers to them.
+        attachments = [page_image_b64.data_uri_content]
+        if previous_page_text is not None and previous_page_image_b64 is not None:
+            context_section = f"""
+**Previous Page Raw Text (for context):**
+```
+{previous_page_text if previous_page_text else "No text from previous page."}
+```
+**Previous Page Image:** (see second attached image)
+"""
+            attachments.append(previous_page_image_b64.data_uri_content)
+        else:
+            context_section = "\n**Note:** This is the first page - no previous context available."
+        closing_section = "\n\n**Generate ONLY the Markdown for the current page. Ensure proper continuity and formatting.**"
+        return HumanMessage(content=[task_section + context_section + closing_section, *attachments])
     def convert(
         self,
         pdf_input: "Document | PathOrReadable",
         page_indices: Optional[PageIndexType] = None,
         progress_callback: Optional[Callable[[int, int], None]] = None,
+        mode: Literal["sequential", "parallel"] = "sequential",
+    ) -> str:
+        """
+        Convert a PDF document to Markdown synchronously.
+        Args:
+            pdf_input: Path to a PDF file or a pymupdf.Document object
+            page_indices: Zero-based page indices to convert. If None, all pages are converted
+            progress_callback: Optional callable invoked with (current_page, total_pages)
+            mode: "sequential" selects the continuity-preserving converter; any other
+                value selects the synchronous parallel converter
+        Returns:
+            Concatenated Markdown string for all processed pages
+        """
+        # Pick the implementation first, then forward the arguments unchanged.
+        run_conversion = self._convert_sequential if mode == "sequential" else self._convert_parallel_sync
+        return run_conversion(pdf_input, page_indices, progress_callback)
+    async def aconvert(
+        self,
+        pdf_input: "Document | PathOrReadable",
+        page_indices: Optional[PageIndexType] = None,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+        max_concurrent: int = 5,
     ) -> str:
         """
-        Converts a PDF document (or specific pages) to Markdown synchronously.
+        Converts a PDF document to Markdown asynchronously with parallel processing.
         Args:
-            pdf_input: Path to the PDF file or a pymupdf.Document object.
-            page_indices: Specific 0-based page indices to convert. If None, converts all pages.
-                          Can be a single int or an iterable of ints.
-            progress_callback: An optional function to call with (current_page_index, total_pages_to_process)
-                               after each page is processed.
+            pdf_input: Path to PDF file or pymupdf.Document object
+            page_indices: Specific page indices to convert (0-based). If None, converts all pages
+            progress_callback: Optional callback function called with (current_page, total_pages)
+            max_concurrent: Maximum number of concurrent LLM requests
         Returns:
-            A single string containing the concatenated Markdown output for the processed pages.
+            Concatenated Markdown string for all processed pages
         """
         with open_pdf(pdf_input) as doc:
-            target_page_indices = list(
-                _get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True)
+            target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
+            total_pages_to_process = len(target_page_indices)
+            if total_pages_to_process == 0:
+                logger.warning("No pages selected for processing.")
+                return ""
+            logger.info(f"Starting parallel Markdown conversion for {total_pages_to_process} pages...")
+            # Pre-process all pages
+            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
+            page_image_dict = render_pdf_as_image(
+                doc,
+                page_indices=target_page_indices,
+                zoom=self.image_zoom,
+                output=self.image_format,
+                jpg_quality=self.image_jpg_quality,
             )
+            # Process pages in parallel with semaphore for concurrency control
+            semaphore = asyncio.Semaphore(max_concurrent)
+            async def process_page(i: int, page_idx: int) -> tuple[int, str]:
+                """Convert one page and return ``(i, markdown)``.
+                ``i`` is the page's position within ``target_page_indices`` and is returned
+                so results can be placed back in order; ``page_idx`` is the page's index
+                in the document. Any exception is logged and turned into an HTML-comment
+                placeholder instead of being raised.
+                """
+                # The semaphore caps how many pages are in flight at once.
+                async with semaphore:
+                    logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
+                    try:
+                        # Get previous page data for context
+                        # "Previous" means the preceding entry in the selection, which is
+                        # not necessarily the physically preceding page of the document.
+                        prev_page_idx = target_page_indices[i - 1] if i > 0 else None
+                        previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
+                        previous_page_image_b64 = None
+                        if prev_page_idx is not None:
+                            previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
+                        message = self._format_prompt_content_parallel(
+                            page_text=page_text_dict.get(page_idx, ""),
+                            page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
+                            previous_page_text=previous_page_text,
+                            previous_page_image_b64=previous_page_image_b64,
+                            page_number=page_idx,
+                            total_pages=len(doc),
+                        )
+                        response = await self.chatterer.agenerate([message])
+                        # Extract markdown
+                        markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
+                        if markdowns:
+                            current_page_markdown = "\n".join(markdowns)
+                        else:
+                            # No pattern match: fall back to the raw response.
+                            current_page_markdown = response.strip()
+                            # Removes exactly three characters from each end; a language tag
+                            # following the opening fence, if present, is left in the text.
+                            if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
+                                current_page_markdown = current_page_markdown[3:-3].strip()
+                        logger.debug(f"Completed processing page {i + 1}/{total_pages_to_process}")
+                        # Call progress callback if provided
+                        # NOTE(review): the first argument is this page's ordinal (i + 1), not a
+                        # running count of finished pages, so values may arrive out of order.
+                        if progress_callback:
+                            try:
+                                progress_callback(i + 1, total_pages_to_process)
+                            except Exception as cb_err:
+                                # A faulty callback must not fail the page conversion.
+                                logger.warning(f"Progress callback failed: {cb_err}")
+                        return (i, current_page_markdown)
+                    except Exception as e:
+                        logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
+                        # Placeholder keeps the page's slot in the joined output.
+                        return (i, f"<!-- Error processing page {page_idx + 1}: {str(e)} -->")
+            # Execute all page processing tasks
+            tasks = [process_page(i, page_idx) for i, page_idx in enumerate(target_page_indices)]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+            # Sort results by original page order and extract markdown
+            markdown_results = [""] * total_pages_to_process
+            for result in results:
+                if isinstance(result, Exception):
+                    logger.error(f"Task failed with exception: {result}")
+                    continue
+                if isinstance(result, tuple) and len(result) == 2:
+                    page_order, markdown = result
+                    markdown_results[page_order] = markdown
+                else:
+                    logger.error(f"Unexpected result format: {result}")
+            return "\n\n".join(markdown_results).strip()
+    def _convert_sequential(
+        self,
+        pdf_input: "Document | PathOrReadable",
+        page_indices: Optional[PageIndexType] = None,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+    ) -> str:
+        """Sequential conversion maintaining strict page continuity."""
+        with open_pdf(pdf_input) as doc:
+            target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
             total_pages_to_process = len(target_page_indices)
             if total_pages_to_process == 0:
                 logger.warning("No pages selected for processing.")
                 return ""
             full_markdown_output: List[str] = []
-            # --- Context Tracking ---
-            previous_page_markdown: Optional[str] = None  # Store the full markdown of the previous page
+            previous_page_markdown: Optional[str] = None
-            # Pre-process all pages (optional optimization)
+            # Pre-process all pages
             logger.info("Extracting text and rendering images for selected pages...")
             page_text_dict = extract_text_from_pdf(doc, target_page_indices)
             page_image_dict = render_pdf_as_image(
@@ -147,46 +333,33 @@ class PdfToMarkdown:
                 output=self.image_format,
                 jpg_quality=self.image_jpg_quality,
             )
-            logger.info(f"Starting Markdown conversion for {total_pages_to_process} pages...")
+            logger.info(f"Starting sequential Markdown conversion for {total_pages_to_process} pages...")
-            page_idx: int = target_page_indices.pop(0)  # Get the first page index
-            i: int = 1
-            while True:
-                logger.info(f"Processing page {i}/{total_pages_to_process} (Index: {page_idx})...")
+            for i, page_idx in enumerate(target_page_indices):
+                logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
                 try:
-                    # --- Get Context Tail ---
                     context_tail = self._get_context_tail(previous_page_markdown)
-                    message = self._format_prompt_content(
-                        page_text=page_text_dict.get(page_idx, ""),  # Use .get for safety
+                    message = self._format_prompt_content_sequential(
+                        page_text=page_text_dict.get(page_idx, ""),
                         page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
-                        previous_markdown_context_tail=context_tail,  # Pass only the tail
+                        previous_markdown_context_tail=context_tail,
                         page_number=page_idx,
                         total_pages=len(doc),
                     )
-                    logger.debug(f"Sending request to LLM for page index {page_idx}...")
-                    response = self.chatterer([message])
-                    # Extract markdown, handling potential lack of backticks
-                    markdowns: list[str] = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
+                    response = self.chatterer.generate([message])
+                    # Extract markdown
+                    markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
                     if markdowns:
                         current_page_markdown = "\n".join(markdowns)
                     else:
-                        # Fallback: assume the whole response is markdown if no ```markdown blocks found
                         current_page_markdown = response.strip()
                         if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
-                            # Basic cleanup if it just missed the 'markdown' language tag
                             current_page_markdown = current_page_markdown[3:-3].strip()
-                        elif "```" in current_page_markdown:
-                            logger.warning(
-                                f"Page {page_idx + 1}: Response contains '```' but not in expected format. Using raw response."
-                            )
-                    logger.debug(f"Received response from LLM for page index {page_idx}.")
-                    # --- Store result and update context ---
                     full_markdown_output.append(current_page_markdown)
-                    # Update the *full* previous markdown for the *next* iteration's tail calculation
                     previous_page_markdown = current_page_markdown
                 except Exception as e:
@@ -196,18 +369,85 @@ class PdfToMarkdown:
                 # Progress callback
                 if progress_callback:
                     try:
-                        progress_callback(i, total_pages_to_process)
+                        progress_callback(i + 1, total_pages_to_process)
                     except Exception as cb_err:
                         logger.warning(f"Progress callback failed: {cb_err}")
-                if not target_page_indices:
-                    break
+            return "\n\n".join(full_markdown_output).strip()
+    def _convert_parallel_sync(
+        self,
+        pdf_input: "Document | PathOrReadable",
+        page_indices: Optional[PageIndexType] = None,
+        progress_callback: Optional[Callable[[int, int], None]] = None,
+    ) -> str:
+        """Synchronous parallel-style conversion (processes independently but sequentially)."""
+        with open_pdf(pdf_input) as doc:
+            target_page_indices = list(_get_page_indices(page_indices=page_indices, max_doc_pages=len(doc), is_input_zero_based=True))
+            total_pages_to_process = len(target_page_indices)
+            if total_pages_to_process == 0:
+                logger.warning("No pages selected for processing.")
+                return ""
+            logger.info(f"Starting parallel-style Markdown conversion for {total_pages_to_process} pages...")
+            # Pre-process all pages
+            page_text_dict = extract_text_from_pdf(doc, target_page_indices)
+            page_image_dict = render_pdf_as_image(
+                doc,
+                page_indices=target_page_indices,
+                zoom=self.image_zoom,
+                output=self.image_format,
+                jpg_quality=self.image_jpg_quality,
+            )
+            full_markdown_output: List[str] = []
+            for i, page_idx in enumerate(target_page_indices):
+                logger.info(f"Processing page {i + 1}/{total_pages_to_process} (Index: {page_idx})...")
-                page_idx = target_page_indices.pop(0)  # Get the next page index
-                i += 1  # Increment the page counter
+                try:
+                    # Get previous page data for context
+                    prev_page_idx = target_page_indices[i - 1] if i > 0 else None
+                    previous_page_text = page_text_dict.get(prev_page_idx) if prev_page_idx is not None else None
+                    previous_page_image_b64 = None
+                    if prev_page_idx is not None:
+                        previous_page_image_b64 = Base64Image.from_bytes(page_image_dict[prev_page_idx], ext=self.image_format)
+                    message = self._format_prompt_content_parallel(
+                        page_text=page_text_dict.get(page_idx, ""),
+                        page_image_b64=Base64Image.from_bytes(page_image_dict[page_idx], ext=self.image_format),
+                        previous_page_text=previous_page_text,
+                        previous_page_image_b64=previous_page_image_b64,
+                        page_number=page_idx,
+                        total_pages=len(doc),
+                    )
-        # Join with double newline, potentially adjust based on how well continuations work
-        return "\n\n".join(full_markdown_output).strip()  # Add strip() to remove leading/trailing whitespace
+                    response = self.chatterer.generate([message])
+                    # Extract markdown
+                    markdowns = [match.group(1).strip() for match in MARKDOWN_PATTERN.finditer(response)]
+                    if markdowns:
+                        current_page_markdown = "\n".join(markdowns)
+                    else:
+                        current_page_markdown = response.strip()
+                        if current_page_markdown.startswith("```") and current_page_markdown.endswith("```"):
+                            current_page_markdown = current_page_markdown[3:-3].strip()
+                    full_markdown_output.append(current_page_markdown)
+                except Exception as e:
+                    logger.error(f"Failed to process page index {page_idx}: {e}", exc_info=True)
+                    continue
+                # Progress callback
+                if progress_callback:
+                    try:
+                        progress_callback(i + 1, total_pages_to_process)
+                    except Exception as cb_err:
+                        logger.warning(f"Progress callback failed: {cb_err}")
+            return "\n\n".join(full_markdown_output).strip()
 def render_pdf_as_image(
@@ -297,9 +537,7 @@ def open_pdf(pdf_input: PathOrReadable | Document):
         doc.close()
-def _get_page_indices(
-    page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool
-) -> list[int]:
+def _get_page_indices(page_indices: Optional[PageIndexType], max_doc_pages: int, is_input_zero_based: bool) -> list[int]:
     """Helper function to handle page indices for PDF conversion."""
     def _to_zero_based_int(idx: int) -> int:
@@ -318,9 +556,7 @@ def _get_page_indices(
         return [_to_zero_based_int(page_indices)]
     elif isinstance(page_indices, str):
         # Handle string input for page indices
-        return _interpret_index_string(
-            index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based
-        )
+        return _interpret_index_string(index_str=page_indices, max_doc_pages=max_doc_pages, is_input_zero_based=is_input_zero_based)
     else:
         # Handle iterable input for page indices
         indices: set[int] = set()
@@ -340,9 +576,7 @@ def _get_page_indices(
                     end = _to_zero_based_int(end)
                 if start > end:
-                    raise ValueError(
-                        f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
-                    )
+                    raise ValueError(f"Invalid range: {start} - {end}. Start index must be less than or equal to end index.")
                 indices.update(range(start, end + 1))
         return sorted(indices)  # Return sorted list of indices
@@ -383,9 +617,7 @@ def _interpret_index_string(index_str: str, max_doc_pages: int, is_input_zero_ba
                 end = _to_zero_based_int(end)
             if start > end:
-                raise ValueError(
-                    f"Invalid range: {start} - {end}. Start index must be less than or equal to end index."
-                )
+                raise ValueError(f"Invalid range: {start} - {end}. Start index must be less than or equal to end index.")
             indices.update(range(start, end + 1))
         else:
             raise ValueError(f"Invalid page index format: '{part}'. Expected format is '1,2,3' or '1-3'.")

{chatterer-0.1.21.dist-info → chatterer-0.1.23.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: chatterer
-Version: 0.1.21
+Version: 0.1.23
 Summary: The highest-level interface for various LLM APIs.
 Requires-Python: >=3.12
 Description-Content-Type: text/markdown

{chatterer-0.1.21.dist-info → chatterer-0.1.23.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 chatterer/__init__.py,sha256=1z3ocUMqgbqQ3eD4wq5Jq-JPt-VuWwdWT_U8r38Hodo,2267
 chatterer/interactive.py,sha256=B8KvlXAGpNEF-czJJpS_f9eJj1TenkE6896w9ixNjOk,17056
-chatterer/language_model.py,sha256=4aJrBHpDbFrGfcGOmglSy1IYFOhyiNGen20-BysqQTM,20659
+chatterer/language_model.py,sha256=QkJLmmTYcWbqosm3D70zfhDSFETD7PIafRaY5upT7Gc,20715
 chatterer/messages.py,sha256=j_bjOVE2FbBaYYpykmJrQL-IH_BWyiZ1VAUCj_wSA2U,479
 chatterer/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 chatterer/common_types/__init__.py,sha256=jfS6m5UANSvGjzQ_nzYDpryn5uZqNb06-4xCsQ2C_lw,376
@@ -10,7 +10,7 @@ chatterer/examples/anything_to_markdown.py,sha256=4O9ze7AIHcwEzvVmm5JMMKo_rVSFwh
 chatterer/examples/get_code_snippets.py,sha256=pz05JjhKaWAknVKlk1ftEEzpSG4-sqD9oa_gyIQoCAs,1911
 chatterer/examples/login_with_playwright.py,sha256=EhvJLaH5TD7bmDi12uP8YLd0fRhdjR-oyIkBHLi1Jjs,5988
 chatterer/examples/make_ppt.py,sha256=vsT_iL_jS2ami5VYrReLMQcD576FfZUH7913F7_As0A,23278
-chatterer/examples/pdf_to_markdown.py,sha256=ZeGRO5CZxGQxJpScK0iB1lTzUkfSiXtuqoeKEQL1ICA,2787
+chatterer/examples/pdf_to_markdown.py,sha256=mur63PxI0uwl90Mh49VXPuO0YSwyEfs0-MwxJWKWXec,13577
 chatterer/examples/pdf_to_text.py,sha256=DznTyhu1REv8Wp4RimQWVgEU5j0_BmlwjfJYJvx3dbI,1590
 chatterer/examples/transcription_api.py,sha256=WUs12qHH4616eLMQDHOiyVGxaXstTpgeE47djYyli6c,3897
 chatterer/examples/upstage_parser.py,sha256=TrfeSIiF0xklhFCknop22TIOVibI4CJ_UKj5-lD8c8E,3487
@@ -20,7 +20,7 @@ chatterer/strategies/atom_of_thoughts.py,sha256=pUhqt47YlzBIVNRh0UebeBwuJ0J94Ge6
 chatterer/strategies/base.py,sha256=b2gMPqodp97OP1dkHfj0UqixjdjVhmTw_V5qJ7i2S6g,427
 chatterer/tools/__init__.py,sha256=m3PRK9H5vOhk-2gG9W2eg8CYBlEn-K9-eaulOu91bgo,1474
 chatterer/tools/caption_markdown_images.py,sha256=r4QajHYuL4mdyYQXP1vQcNmqKN8lxBf5y0VKELXILOI,15392
-chatterer/tools/convert_pdf_to_markdown.py,sha256=Q5ln-_av2eor0A2LkQG7-IgyQKJ79wwrSOvv5Jncfso,18901
+chatterer/tools/convert_pdf_to_markdown.py,sha256=_a-nVNs_9j4QsDPKI5p6AZeasgOW3x_2rb49-yfBSPs,28501
 chatterer/tools/convert_to_text.py,sha256=WHQ0Xj4Ri_jYbFjzTx3mjmvJ9U8bAv4wGaKEVC88Nlk,15457
 chatterer/tools/upstage_document_parser.py,sha256=CXslVYAHDK8EV8jtUAUWzf8rxU4qilSnW8_dhAxHOE8,33142
 chatterer/tools/webpage_to_markdown.py,sha256=ADH4sqM6iquJR7HU6umMQ5qO7EvcbNutuchXDpAcxAo,31961
@@ -37,8 +37,8 @@ chatterer/utils/base64_image.py,sha256=m_qAT3ERBiq8D-H4H9Z7rLfL31_BiPmV_m4uQ5XRL
 chatterer/utils/bytesio.py,sha256=3MC2atOOFKo5YxuReo_y_t8Wem9p2Y1ahC5M2lGclwI,2618
 chatterer/utils/code_agent.py,sha256=7ka_WRI4TQmZ5H46mjY3hI6RO_pxw6pg3LAxjgW4AbM,10495
 chatterer/utils/imghdr.py,sha256=6JhJMXD4MZ0dQolT2VM87YrRYm3hPf3RTEWnP4lYRVc,3842
-chatterer-0.1.21.dist-info/METADATA,sha256=j3QGPYik-jm75MHIfAvbvUbv-EaxvlVKdEIc7_dMUjk,11826
-chatterer-0.1.21.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-chatterer-0.1.21.dist-info/entry_points.txt,sha256=KhxL2dctnZalnDSmPoB5dZBBa9hZpJETW3C5xkoRaW4,554
-chatterer-0.1.21.dist-info/top_level.txt,sha256=7nSQKP0bHxPRc7HyzdbKsJdkvPgYD0214o6slRizv9s,10
-chatterer-0.1.21.dist-info/RECORD,,
+chatterer-0.1.23.dist-info/METADATA,sha256=zCTgA4OAI2tSpNRiLwjCDPweTrW4oxzJnIXT7PA69Ck,11826
+chatterer-0.1.23.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+chatterer-0.1.23.dist-info/entry_points.txt,sha256=KhxL2dctnZalnDSmPoB5dZBBa9hZpJETW3C5xkoRaW4,554
+chatterer-0.1.23.dist-info/top_level.txt,sha256=7nSQKP0bHxPRc7HyzdbKsJdkvPgYD0214o6slRizv9s,10
+chatterer-0.1.23.dist-info/RECORD,,

{chatterer-0.1.21.dist-info → chatterer-0.1.23.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.8.0)
+Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{chatterer-0.1.21.dist-info → chatterer-0.1.23.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{chatterer-0.1.21.dist-info → chatterer-0.1.23.dist-info}/top_level.txt RENAMED Viewed

File without changes

chatterer 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl

chatterer 0.1.21__py3-none-any.whl → 0.1.23__py3-none-any.whl