PyPI - polytext - Versions diffs - 0.2.0__tar.gz → 0.2.2b1__tar.gz - Mend

polytext 0.2.0__tar.gz → 0.2.2b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (90) hide show

{polytext-0.2.0 → polytext-0.2.2b1}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: polytext
-Version: 0.2.0
+Version: 0.2.2b1
 Summary: Python utilities to simplify document files management
 Home-page: https://github.com/docsity/polytext
 Author: Matteo Senardi
@@ -8,11 +8,12 @@ Author-email: matteo.s@docsity.com
 License: MIT
 Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Requires-Python: >=3.12
+Requires-Python: >=3.11
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pypdf==5.5.0
@@ -90,7 +91,7 @@ pip install polytext
 | Requirement | Notes                                                                           | macOS (Homebrew) | Ubuntu / Debian |
 |-------------|---------------------------------------------------------------------------------|------------------|-----------------|
-| **Python**  | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
+| **Python**  | Supported on **3.11 – 3.13**<br> WeasyPrint still requires its native libraries | `brew install python@3.11` | `sudo apt install python3.11` |
 | **WeasyPrint – native stack** | installs Pango, Cairo, etc.                                                     | `brew install weasyprint` | `sudo apt install weasyprint` |
 | **LibreOffice** | used for Office → PDF conversion                                                | `brew install --cask libreoffice` | `sudo apt install libreoffice` |

{polytext-0.2.0 → polytext-0.2.2b1}/README.md RENAMED Viewed

@@ -35,7 +35,7 @@ pip install polytext
 | Requirement | Notes                                                                           | macOS (Homebrew) | Ubuntu / Debian |
 |-------------|---------------------------------------------------------------------------------|------------------|-----------------|
-| **Python**  | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
+| **Python**  | Supported on **3.11 – 3.13**<br> WeasyPrint still requires its native libraries | `brew install python@3.11` | `sudo apt install python3.11` |
 | **WeasyPrint – native stack** | installs Pango, Cairo, etc.                                                     | `brew install weasyprint` | `sudo apt install weasyprint` |
 | **LibreOffice** | used for Office → PDF conversion                                                | `brew install --cask libreoffice` | `sudo apt install libreoffice` |

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/audio_to_text.py RENAMED Viewed

@@ -5,6 +5,7 @@ import tempfile
 import time
 import mimetypes
 import uuid
+import re
 import ffmpeg
 from retry import retry
 from google import genai
@@ -42,11 +43,28 @@ INJECTION_GUARD_SYSTEM_INSTRUCTION = (
 AUDIO_MIN_OUTPUT_TOKENS = 500
 AUDIO_TAIL_REPETITION_LINES = int(os.getenv("AUDIO_TAIL_REPETITION_LINES", "200"))
 AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHOLD", "0.35"))
-AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
+AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
 AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
 AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
 AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
 AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
+NO_HUMAN_SPEECH_MARKER = "no human speech detected"
+def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
+    if not text:
+        return "", False
+    marker_line_pattern = re.compile(r"(?im)^\s*no human speech detected\s*$")
+    non_empty_lines = [line.strip() for line in text.splitlines() if line.strip()]
+    if non_empty_lines and all(line.lower() == NO_HUMAN_SPEECH_MARKER for line in non_empty_lines):
+        return "", True
+    cleaned_text = marker_line_pattern.sub("", text)
+    cleaned_text = re.sub(r"(?i)\bno human speech detected\b", "", cleaned_text)
+    cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
+    return cleaned_text, False
 def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
     """
@@ -74,7 +92,7 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
     logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
     ffmpeg.input(input_path).output(
         temp_audio_path,
-        q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
+        q=bitrate_quality,  # Variable bitrate quality (0-9, 9 being lowest)
         acodec='libmp3lame',
         ac=1,  # Convert to mono
         ar=16000,  # Lower sample rate
@@ -86,6 +104,7 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
     logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
     return temp_audio_path
 def transcribe_full_audio(audio_file, markdown_output: bool = False,
                           llm_api_key: str = None,
                           save_transcript_chunks: bool = False, bitrate_quality=9,
@@ -118,16 +137,19 @@ def transcribe_full_audio(audio_file, markdown_output: bool = False,
                                      max_llm_tokens=max_llm_tokens, max_output_tokens=max_output_tokens)
     return converter.transcribe_full_audio(audio_file, save_transcript_chunks)
 class AudioToTextConverter:
-    def __init__(self, transcription_model: str ="gemini-3.1-flash-lite-preview", transcription_model_provider: str ="google",
-                 k: int =5, min_matches: int =3, markdown_output: bool =True, llm_api_key: str =None, max_llm_tokens: int =4250,
-                 max_output_tokens: int | None =None, temp_dir: str ="temp",
-                 bitrate_quality: int =9, timeout_minutes: int =None):
+    def __init__(self, transcription_model: str = "gemini-3.1-flash-lite",
+                 transcription_model_provider: str = "google",
+                 k: int = 5, min_matches: int = 3, markdown_output: bool = True, llm_api_key: str = None,
+                 max_llm_tokens: int = 4250,
+                 max_output_tokens: int | None = None, temp_dir: str = "temp",
+                 bitrate_quality: int = 9, timeout_minutes: int = None):
         """
         Initialize the AudioToTextConverter class with a specified transcription model and provider.
         Args:
-            transcription_model (str): Model name for transcription. Defaults to "gemini-3.1-flash-lite-preview".
+            transcription_model (str): Model name for transcription. Defaults to "gemini-3.1-flash-lite".
             transcription_model_provider (str): Provider of transcription service. Defaults to "google".
             k (int): Number of words to use when searching for overlap between chunks. Defaults to 5.
             min_matches (int): Minimum matching words for chunk merging. Defaults to 3.
@@ -401,8 +423,10 @@ class AudioToTextConverter:
                     code=997,
                 )
+            response_text, marker_only = normalize_no_human_speech_marker(response_text)
             response_dict = {
-                "transcript": response_text if "no human speech detected" not in response_text.lower() else "",
+                "transcript": "" if marker_only else response_text,
                 "completion_tokens": completion_tokens,
                 "prompt_tokens": prompt_tokens,
                 "completion_model": self.transcription_model,

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/document_ocr_to_text.py RENAMED Viewed

@@ -10,7 +10,11 @@ from google import genai
 from google.genai import types
 from google.api_core import exceptions as google_exceptions
-from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
+from ..prompts.ocr import (
+    OCR_TO_MARKDOWN_PROMPT,
+    OCR_TO_PLAIN_TEXT_PROMPT,
+    build_ocr_prompt,
+)
 from ..exceptions.base import EmptyDocument, ExceededMaxPages
 from .gemini_quality_guards import (
     extract_finish_reason,
@@ -103,6 +107,7 @@ def get_document_ocr(
     timeout_minutes=None,
     ocr_model: str | None = None,
     max_output_tokens: int | None = None,
+    include_image_descriptions: bool = False,
 ):
     """
     Convenience function to extract text from an image file using OCR, optionally formatted as Markdown.
@@ -123,26 +128,30 @@ def get_document_ocr(
         ocr_model (str | None, optional): Gemini OCR model to use. Defaults to the converter default.
         max_output_tokens (int | None, optional): Maximum Gemini output tokens.
             Defaults to the converter default.
+        include_image_descriptions (bool, optional): If True, OCR prompts include
+            brief functional descriptions for meaningful non-text images.
+            Defaults to False.
     Returns:
         dict: Dictionary containing the OCR results and metadata.
     """
     converter = DocumentOCRToTextConverter(
-        ocr_model=ocr_model or "gemini-3.1-flash-lite-preview",
+        ocr_model=ocr_model or "gemini-3.1-flash-lite",
         markdown_output=markdown_output,
         llm_api_key=llm_api_key,
         target_size=target_size,
         page_range=page_range,
         timeout_minutes=timeout_minutes,
         max_output_tokens=max_output_tokens,
+        include_image_descriptions=include_image_descriptions,
     )
     return converter.get_document_ocr(document_for_ocr)
 class DocumentOCRToTextConverter:
-    def __init__(self, ocr_model="gemini-3.1-flash-lite-preview", ocr_model_provider="google",
+    def __init__(self, ocr_model="gemini-3.1-flash-lite", ocr_model_provider="google",
                 markdown_output=True, llm_api_key=None, target_size=1, temp_dir="temp",
                  page_range=None, timeout_minutes: int = None, fallback_stage: int = 0,
-                 max_output_tokens: int | None = None):
+                 max_output_tokens: int | None = None, include_image_descriptions: bool = False):
         """
         Initialize the DocumentOCRToTextConverter class with specified OCR model and formatting options.
@@ -150,7 +159,7 @@ class DocumentOCRToTextConverter:
         It supports various image formats and can output either plain text or markdown.
         Args:
-            ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite-preview".
+            ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite".
             ocr_model_provider (str): Provider of OCR service. Defaults to "google".
             markdown_output (bool): Enable markdown formatting in output. Defaults to True.
             llm_api_key (str, optional): Override API key for language model. Defaults to None.
@@ -162,6 +171,9 @@ class DocumentOCRToTextConverter:
                 Defaults to 0.
             max_output_tokens (int | None, optional): Maximum Gemini output tokens.
                 Defaults to `OCR_MAX_OUTPUT_TOKENS`.
+            include_image_descriptions (bool, optional): If True, OCR prompts include
+                brief functional descriptions for meaningful non-text images.
+                Defaults to False.
         Raises:
             OSError: If temp directory creation fails
@@ -174,6 +186,7 @@ class DocumentOCRToTextConverter:
         self.target_size = target_size
         self.page_range = page_range
         self.timeout_minutes = timeout_minutes
+        self.include_image_descriptions = include_image_descriptions
         requested_output_tokens = OCR_MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
         self.max_output_tokens = max(requested_output_tokens, OCR_MIN_OUTPUT_TOKENS)
         self.fallback_stage = fallback_stage
@@ -187,6 +200,13 @@ class DocumentOCRToTextConverter:
         os.makedirs(self.temp_dir, exist_ok=True)
         tempfile.tempdir = self.temp_dir
+    def _build_prompt_template(self) -> str:
+        base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
+        return build_ocr_prompt(
+            base_prompt,
+            include_image_descriptions=self.include_image_descriptions,
+        )
     def should_fallback_temperature_retry(self, error: EmptyDocument, temperature: float) -> bool:
         if self.fallback_stage != 0:
             return False
@@ -233,6 +253,7 @@ class DocumentOCRToTextConverter:
             timeout_minutes=self.timeout_minutes,
             fallback_stage=fallback_stage,
             max_output_tokens=self.max_output_tokens,
+            include_image_descriptions=self.include_image_descriptions,
         )
         result = fallback_converter.get_ocr(
             file_for_ocr=file_for_ocr,
@@ -289,12 +310,9 @@ class DocumentOCRToTextConverter:
         if self.markdown_output:
             logger.info("Using prompt for markdown format")
-            # Convert the text to markdown format
-            prompt_template = OCR_TO_MARKDOWN_PROMPT
         else:
             logger.info("Using prompt for plain text format")
-            # Convert the text to plain text format
-            prompt_template = OCR_TO_PLAIN_TEXT_PROMPT
+        prompt_template = self._build_prompt_template()
         try:
             if self.llm_api_key:

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/document_ocr_to_text_azure_oai.py RENAMED Viewed

@@ -18,7 +18,11 @@ from openai import (
     InternalServerError,
 )
-from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
+from ..prompts.ocr import (
+    OCR_TO_MARKDOWN_PROMPT,
+    OCR_TO_PLAIN_TEXT_PROMPT,
+    build_ocr_prompt,
+)
 from ..exceptions.base import EmptyDocument, ExceededMaxPages
 logger = logging.getLogger(__name__)
@@ -95,6 +99,7 @@ def get_document_ocr(
     page_range=None,
     timeout_minutes=None,
     ocr_model="gpt-5-mini",  # Azure deployment name
+    include_image_descriptions: bool = False,
 ):
     """
     Convenience function to OCR a document (PDF) using Azure OpenAI vision.
@@ -106,6 +111,7 @@ def get_document_ocr(
         target_size=target_size,
         page_range=page_range,
         timeout_minutes=timeout_minutes,
+        include_image_descriptions=include_image_descriptions,
     )
     return converter.get_document_ocr(document_for_ocr)
@@ -127,6 +133,7 @@ class DocumentOCRToTextConverter:
         azure_api_version=None,     # your resource-supported API version
         max_tokens=4096,            # avoid truncation
         max_workers=None,           # ThreadPoolExecutor workers (None = default)
+        include_image_descriptions: bool = False,
     ):
         if ocr_model is None:
             ocr_model = "gpt-4.1-mini"
@@ -139,6 +146,7 @@ class DocumentOCRToTextConverter:
         self.timeout_minutes = timeout_minutes
         self.max_tokens = max_tokens
         self.max_workers = max_workers
+        self.include_image_descriptions = include_image_descriptions
         # Azure config
         self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
@@ -154,6 +162,13 @@ class DocumentOCRToTextConverter:
         if not self.azure_api_version:
             raise ValueError("Missing Azure API version. Set azure_api_version or AZURE_OPENAI_API_VERSION.")
+    def _build_prompt_template(self) -> str:
+        base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
+        return build_ocr_prompt(
+            base_prompt,
+            include_image_descriptions=self.include_image_descriptions,
+        )
     def _build_client(self) -> AzureOpenAI:
         azure_api_key = self.llm_api_key or os.getenv("AZURE_OPENAI_API_KEY")
         if not azure_api_key:
@@ -189,7 +204,7 @@ class DocumentOCRToTextConverter:
         temp_file_for_ocr = None
         start_time = time.time()
-        prompt_template = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
+        prompt_template = self._build_prompt_template()
         logger.info("Using prompt for %s format", "markdown" if self.markdown_output else "plain text")
         client = self._build_client()
@@ -370,4 +385,4 @@ class DocumentOCRToTextConverter:
             start_page = 0
             end_page = total_pages
-        return start_page, end_page
+        return start_page, end_page

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/gemini_quality_guards.py RENAMED Viewed

@@ -20,6 +20,23 @@ def repetition_ratio(items: list[str], min_occurrences: int = 2) -> float:
     return repeated_items / len(items)
+def has_consecutive_repetition(items: list[str], min_run_length: int = 3) -> bool:
+    previous = None
+    run_length = 0
+    for item in items:
+        if item == previous:
+            run_length += 1
+        else:
+            previous = item
+            run_length = 1
+        if run_length >= min_run_length:
+            return True
+    return False
 def tail_has_excessive_repetition(
     text: str,
     tail_lines: int,
@@ -30,10 +47,14 @@ def tail_has_excessive_repetition(
     lines = [normalize_text_line(line) for line in text.splitlines() if normalize_text_line(line)]
     tail = lines[-tail_lines:] if len(lines) > tail_lines else lines
+    if has_consecutive_repetition(tail):
+        return True
     if len(tail) >= 4 and repetition_ratio(tail) >= threshold:
         return True
     sentences = split_sentences("\n".join(tail))
+    if has_consecutive_repetition(sentences):
+        return True
     if len(sentences) >= 4 and repetition_ratio(sentences) >= threshold:
         return True

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/ocr_to_text.py RENAMED Viewed

@@ -10,7 +10,11 @@ from google import genai
 from google.genai import types
 from google.api_core import exceptions as google_exceptions
-from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
+from ..prompts.ocr import (
+    OCR_TO_MARKDOWN_PROMPT,
+    OCR_TO_PLAIN_TEXT_PROMPT,
+    build_ocr_prompt,
+)
 from ..exceptions import EmptyDocument
 from .gemini_quality_guards import (
     extract_finish_reason,
@@ -102,6 +106,7 @@ def get_ocr(
     timeout_minutes=None,
     ocr_model: str | None = None,
     max_output_tokens: int | None = None,
+    include_image_descriptions: bool = False,
 ):
     """
     Convenience function to extract text from an image file using OCR, optionally formatted as Markdown.
@@ -121,24 +126,29 @@ def get_ocr(
         ocr_model (str | None, optional): Gemini OCR model to use. Defaults to the converter default.
         max_output_tokens (int | None, optional): Maximum Gemini output tokens.
             Defaults to the converter default.
+        include_image_descriptions (bool, optional): If True, OCR prompts include
+            brief functional descriptions for meaningful non-text images.
+            Defaults to False.
     Returns:
         dict: Dictionary containing the OCR results and metadata.
     """
     converter = OCRToTextConverter(
-        ocr_model=ocr_model or "gemini-3.1-flash-lite-preview",
+        ocr_model=ocr_model or "gemini-3.1-flash-lite",
         markdown_output=markdown_output,
         llm_api_key=llm_api_key,
         target_size=target_size,
         timeout_minutes=timeout_minutes,
         max_output_tokens=max_output_tokens,
+        include_image_descriptions=include_image_descriptions,
     )
     return converter.get_ocr(file_for_ocr)
 class OCRToTextConverter:
-    def __init__(self, ocr_model="gemini-3.1-flash-lite-preview", ocr_model_provider="google",
+    def __init__(self, ocr_model="gemini-3.1-flash-lite", ocr_model_provider="google",
                 markdown_output=True, llm_api_key=None, target_size=1, temp_dir="temp",
-                 timeout_minutes=None, fallback_stage: int = 0, max_output_tokens: int | None = None):
+                 timeout_minutes=None, fallback_stage: int = 0, max_output_tokens: int | None = None,
+                 include_image_descriptions: bool = False):
         """
         Initialize the OCRToTextConverter class with specified OCR model and formatting options.
@@ -146,7 +156,7 @@ class OCRToTextConverter:
         It supports various image formats and can output either plain text or markdown.
         Args:
-            ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite-preview".
+            ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite".
             ocr_model_provider (str): Provider of OCR service. Defaults to "google".
             markdown_output (bool): Enable markdown formatting in output. Defaults to True.
             llm_api_key (str, optional): Override API key for language model. Defaults to None.
@@ -157,6 +167,9 @@ class OCRToTextConverter:
                 Defaults to 0.
             max_output_tokens (int | None, optional): Maximum Gemini output tokens.
                 Defaults to `OCR_MAX_OUTPUT_TOKENS`.
+            include_image_descriptions (bool, optional): If True, OCR prompts include
+                brief functional descriptions for meaningful non-text images.
+                Defaults to False.
         Raises:
             OSError: If temp directory creation fails
@@ -168,6 +181,7 @@ class OCRToTextConverter:
         self.llm_api_key = llm_api_key
         self.target_size = target_size
         self.timeout_minutes = timeout_minutes
+        self.include_image_descriptions = include_image_descriptions
         requested_output_tokens = OCR_MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
         self.max_output_tokens = max(requested_output_tokens, OCR_MIN_OUTPUT_TOKENS)
         self.fallback_stage = fallback_stage
@@ -181,6 +195,13 @@ class OCRToTextConverter:
         os.makedirs(self.temp_dir, exist_ok=True)
         tempfile.tempdir = self.temp_dir
+    def _build_prompt_template(self) -> str:
+        base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
+        return build_ocr_prompt(
+            base_prompt,
+            include_image_descriptions=self.include_image_descriptions,
+        )
     def should_fallback_temperature_retry(self, error: EmptyDocument, temperature: float) -> bool:
         if self.fallback_stage != 0:
             return False
@@ -226,6 +247,7 @@ class OCRToTextConverter:
             timeout_minutes=self.timeout_minutes,
             fallback_stage=fallback_stage,
             max_output_tokens=self.max_output_tokens,
+            include_image_descriptions=self.include_image_descriptions,
         )
         result = fallback_converter.get_ocr(
             file_for_ocr=file_for_ocr,
@@ -282,12 +304,9 @@ class OCRToTextConverter:
         if self.markdown_output:
             logger.info("Using prompt for markdown format")
-            # Convert the text to markdown format
-            prompt_template = OCR_TO_MARKDOWN_PROMPT
         else:
             logger.info("Using prompt for plain text format")
-            # Convert the text to plain text format
-            prompt_template = OCR_TO_PLAIN_TEXT_PROMPT
+        prompt_template = self._build_prompt_template()
         try:
             if self.llm_api_key:

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/ocr_to_text_azure_oai.py RENAMED Viewed

@@ -18,7 +18,11 @@ from openai import (
     InternalServerError,
 )
-from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
+from ..prompts.ocr import (
+    OCR_TO_MARKDOWN_PROMPT,
+    OCR_TO_PLAIN_TEXT_PROMPT,
+    build_ocr_prompt,
+)
 logger = logging.getLogger(__name__)
@@ -86,7 +90,14 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
         raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
-def get_ocr(file_for_ocr, markdown_output=False, llm_api_key=None, target_size=1, timeout_minutes=None):
+def get_ocr(
+    file_for_ocr,
+    markdown_output=False,
+    llm_api_key=None,
+    target_size=1,
+    timeout_minutes=None,
+    include_image_descriptions: bool = False,
+):
     """
     Convenience function to extract text from an image file using OCR (Azure OpenAI),
     optionally formatted as Markdown.
@@ -96,6 +107,7 @@ def get_ocr(file_for_ocr, markdown_output=False, llm_api_key=None, target_size=1
         llm_api_key=llm_api_key,
         target_size=target_size,
         timeout_minutes=timeout_minutes,
+        include_image_descriptions=include_image_descriptions,
     )
     return converter.get_ocr(file_for_ocr)
@@ -114,6 +126,7 @@ class OCRToTextConverter:
         azure_endpoint=None,       # e.g. https://<resource>.openai.azure.com
         azure_api_version=None,    # e.g. "2024-10-21" (use your resource-supported version)
         max_tokens=4096,           # avoid truncation
+        include_image_descriptions: bool = False,
     ):
         self.ocr_model = ocr_model
         self.ocr_model_provider = ocr_model_provider
@@ -122,6 +135,7 @@ class OCRToTextConverter:
         self.target_size = target_size
         self.timeout_minutes = timeout_minutes
         self.max_tokens = max_tokens
+        self.include_image_descriptions = include_image_descriptions
         # Azure config
         self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
@@ -137,6 +151,13 @@ class OCRToTextConverter:
         if not self.azure_api_version:
             raise ValueError("Missing Azure API version. Set azure_api_version or AZURE_OPENAI_API_VERSION.")
+    def _build_prompt_template(self) -> str:
+        base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
+        return build_ocr_prompt(
+            base_prompt,
+            include_image_descriptions=self.include_image_descriptions,
+        )
     @retry(
         (
             APITimeoutError,
@@ -158,7 +179,7 @@ class OCRToTextConverter:
         temp_file_for_ocr = None
         start_time = time.time()
-        prompt_template = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
+        prompt_template = self._build_prompt_template()
         logger.info("Using prompt for %s format", "markdown" if self.markdown_output else "plain text")
         # Build Azure client
@@ -252,4 +273,4 @@ class OCRToTextConverter:
         finally:
             # Clean up the temporary compressed file (only if we created one)
             if temp_file_for_ocr and temp_file_for_ocr != file_for_ocr and os.path.exists(temp_file_for_ocr):
-                os.remove(temp_file_for_ocr)
+                os.remove(temp_file_for_ocr)

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/text_to_md.py RENAMED Viewed

@@ -54,7 +54,7 @@ class TextToMdConverter:
             overlap_chars: int = 500,
             k: int = 5,
             min_matches: int = 3,
-            model: str = "gemini-3.1-flash-lite-preview",
+            model: str = "gemini-3.1-flash-lite",
             model_provider: str = "google",
     ) -> None:
         """

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/base.py RENAMED Viewed

@@ -39,11 +39,20 @@ dotenv.load_dotenv()
 logger = logging.getLogger(__name__)
 MIN_DOC_TEXT_LENGTH_ACCEPTED = int(os.getenv("MIN_DOC_TEXT_LENGTH_ACCEPTED", "400"))
+OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV = "OCR_INCLUDE_IMAGE_DESCRIPTIONS"
+def _read_bool_env(name: str, default: bool = False) -> bool:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return value.strip().lower() in {"1", "true", "yes", "y", "on"}
 class BaseLoader:
     def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
-                 ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None, **kwargs):
+                 ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
+                 include_image_descriptions: bool | None = None, **kwargs):
         """
         Initialize the BaseLoader with cloud storage and LLM configurations.
@@ -58,6 +67,9 @@ class BaseLoader:
             provider (str, optional): Provider of the model. Default to "google".
             ocr_model (str, optional): OCR model to use for text extraction from images. Defaults to "gpt-5-mini".
             timeout_minutes (int, optional): Timeout in minutes. Defaults to None.
+            include_image_descriptions (bool | None, optional): If True, OCR prompts
+                include brief functional descriptions for meaningful non-text images.
+                If None, defaults from OCR_INCLUDE_IMAGE_DESCRIPTIONS. Defaults to None.
              **kwargs: Additional keyword arguments to pass to the underlying loader or extraction logic.
                 - target_size (int, optional): Target file size in bytes. Defaults to 1MB
                 - source (str): Source of the document. Must be either "cloud" or "local"
@@ -76,6 +88,11 @@ class BaseLoader:
         self.provider = provider
         self.ocr_model = ocr_model
         self.timeout_minutes = timeout_minutes
+        self.include_image_descriptions = (
+            _read_bool_env(OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV)
+            if include_image_descriptions is None
+            else include_image_descriptions
+        )
         self.kwargs = kwargs
         self.target_size = kwargs.get("target_size", 1)
         self.source = kwargs.get("source", "cloud")
@@ -287,7 +304,7 @@ class BaseLoader:
             file_extension = file_extension.lower()
         if is_document_fallback:
-            return DocumentOCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, ocr_provider=self.provider, ocr_model=self.ocr_model, **kwargs)
+            return DocumentOCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, ocr_provider=self.provider, ocr_model=self.ocr_model, include_image_descriptions=self.include_image_descriptions, **kwargs)
         if file_extension in [".xml", ".xbrl"]:
             return XmlXbrlLoader(temp_dir=self.temp_dir, markdown_output=self.markdown_output, **kwargs)
@@ -308,7 +325,7 @@ class BaseLoader:
             elif mime_type.startswith("video/"):
                 return VideoLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
             elif mime_type.startswith("image/"):
-                return OCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
+                return OCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, include_image_descriptions=self.include_image_descriptions, **kwargs)
             elif mime_type.startswith("text/markdown"):
                 return MarkdownLoader(markdown_output=self.markdown_output, temp_dir=self.temp_dir, **kwargs)
             elif mime_type == "text/html":

{polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/document_ocr.py RENAMED Viewed

@@ -45,6 +45,7 @@ class DocumentOCRLoader:
                  timeout_minutes: int = None,
                  ocr_provider: str = "google",
                  ocr_model: str | None = None,
+                 include_image_descriptions: bool = False,
                  **kwargs
                  ):
         """
@@ -78,6 +79,9 @@ class DocumentOCRLoader:
                 - For Google Gemini: optional / usually ignored.
                 - For Azure OpenAI: **deployment name** (e.g. "gpt-5-mini").
                 Defaults to None.
+            include_image_descriptions (bool, optional): If True, OCR prompts include
+                brief functional descriptions for meaningful non-text images.
+                Defaults to False.
             **kwargs:
                 max_output_tokens (int, optional): Maximum Gemini output tokens for
                     Google document OCR generation.
@@ -100,6 +104,7 @@ class DocumentOCRLoader:
         self.ocr_provider = (ocr_provider or "google").lower()
         self.ocr_model = ocr_model
+        self.include_image_descriptions = include_image_descriptions
         self.max_output_tokens = kwargs.get("max_output_tokens")
         # Set up custom temp directory
@@ -248,6 +253,7 @@ class DocumentOCRLoader:
                 page_range=self.page_range,
                 timeout_minutes=self.timeout_minutes,
                 ocr_model=self.ocr_model or None,
+                include_image_descriptions=self.include_image_descriptions,
             )
         else:
             result_dict = ocr_fn(
@@ -263,6 +269,7 @@ class DocumentOCRLoader:
                     else None
                 ),
                 max_output_tokens=self.max_output_tokens,
+                include_image_descriptions=self.include_image_descriptions,
             )
         result_dict["type"] = self.type

polytext 0.2.0__tar.gz → 0.2.2b1__tar.gz

polytext 0.2.0__tar.gz → 0.2.2b1__tar.gz