polytext 0.2.0__tar.gz → 0.2.2b1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. {polytext-0.2.0 → polytext-0.2.2b1}/PKG-INFO +4 -3
  2. {polytext-0.2.0 → polytext-0.2.2b1}/README.md +1 -1
  3. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/audio_to_text.py +32 -8
  4. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/document_ocr_to_text.py +27 -9
  5. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/document_ocr_to_text_azure_oai.py +18 -3
  6. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/gemini_quality_guards.py +21 -0
  7. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/ocr_to_text.py +28 -9
  8. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/ocr_to_text_azure_oai.py +25 -4
  9. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/text_to_md.py +1 -1
  10. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/base.py +20 -3
  11. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/document_ocr.py +7 -0
  12. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/ocr.py +7 -1
  13. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/youtube_llm.py +86 -14
  14. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/text_merger.py +1 -1
  15. polytext-0.2.2b1/polytext/prompts/ocr.py +38 -0
  16. polytext-0.2.2b1/polytext/prompts/transcription.py +305 -0
  17. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/utils/utils.py +2 -0
  18. {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/PKG-INFO +4 -3
  19. {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/SOURCES.txt +2 -0
  20. {polytext-0.2.0 → polytext-0.2.2b1}/pyproject.toml +2 -2
  21. {polytext-0.2.0 → polytext-0.2.2b1}/setup.py +3 -2
  22. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_audio_transcription_model_migration.py +101 -11
  23. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_document_ocr_to_text_models.py +1 -0
  24. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_extracted_text_whitespace.py +18 -19
  25. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_ocr_from_image.py +5 -2
  26. polytext-0.2.2b1/tests/test_ocr_image_descriptions.py +167 -0
  27. polytext-0.2.2b1/tests/test_youtube_gemini_minimal_check.py +175 -0
  28. polytext-0.2.2b1/tests/test_youtube_transcript.py +65 -0
  29. polytext-0.2.0/polytext/prompts/ocr.py +0 -16
  30. polytext-0.2.0/polytext/prompts/transcription.py +0 -190
  31. polytext-0.2.0/tests/test_youtube_transcript.py +0 -45
  32. {polytext-0.2.0 → polytext-0.2.2b1}/LICENSE +0 -0
  33. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/__init__.py +0 -0
  34. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/__init__.py +0 -0
  35. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/base.py +0 -0
  36. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/html_to_md.py +0 -0
  37. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/md_to_text.py +0 -0
  38. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/pdf.py +0 -0
  39. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/video_to_audio.py +0 -0
  40. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/exceptions/__init__.py +0 -0
  41. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/exceptions/base.py +0 -0
  42. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/generator/__init__.py +0 -0
  43. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/generator/pdf.py +0 -0
  44. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/__init__.py +0 -0
  45. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/audio.py +0 -0
  46. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/document.py +0 -0
  47. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/downloader/__init__.py +0 -0
  48. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/downloader/downloader.py +0 -0
  49. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/html.py +0 -0
  50. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/markdown.py +0 -0
  51. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/notebook.py +0 -0
  52. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/plain_text.py +0 -0
  53. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/video.py +0 -0
  54. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/xml_xbrl.py +0 -0
  55. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/youtube.py +0 -0
  56. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/__init__.py +0 -0
  57. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/audio_chunker.py +0 -0
  58. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/transcript_chunker.py +0 -0
  59. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/prompts/__init__.py +0 -0
  60. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/prompts/text_merging.py +0 -0
  61. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/prompts/text_to_md.py +0 -0
  62. {polytext-0.2.0 → polytext-0.2.2b1}/polytext/utils/__init__.py +0 -0
  63. {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/dependency_links.txt +0 -0
  64. {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/not-zip-safe +0 -0
  65. {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/requires.txt +0 -0
  66. {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/top_level.txt +0 -0
  67. {polytext-0.2.0 → polytext-0.2.2b1}/setup.cfg +0 -0
  68. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_audio_chunker.py +0 -0
  69. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_audio_comparison_helpers.py +0 -0
  70. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_audio_models.py +0 -0
  71. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_ocr_to_text_models.py +0 -0
  72. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_youtube_models.py +0 -0
  73. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_dowload_audio_from_youtube.py +0 -0
  74. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  75. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_audio_transcript_from_gcs.py +0 -0
  76. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  77. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_ocr.py +0 -0
  78. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_ocr_azure_oai.py +0 -0
  79. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_text.py +0 -0
  80. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_text_from_gcs.py +0 -0
  81. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_text_from_markdown.py +0 -0
  82. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_video_transcript_from_gcs.py +0 -0
  83. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_library.py +0 -0
  84. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_markdown_loader_gzip.py +0 -0
  85. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_markitdown_html.py +0 -0
  86. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_notebook_loader.py +0 -0
  87. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_ocr_fallbacks.py +0 -0
  88. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_pain_text.py +0 -0
  89. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_split_audio_with_llm.py +0 -0
  90. {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_xml_xbrl_loader.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.0
3
+ Version: 0.2.2b1
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -8,11 +8,12 @@ Author-email: matteo.s@docsity.com
8
8
  License: MIT
9
9
  Classifier: License :: OSI Approved :: MIT License
10
10
  Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.11
11
12
  Classifier: Programming Language :: Python :: 3.12
12
13
  Classifier: Programming Language :: Python :: 3.13
13
14
  Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
14
15
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
- Requires-Python: >=3.12
16
+ Requires-Python: >=3.11
16
17
  Description-Content-Type: text/markdown
17
18
  License-File: LICENSE
18
19
  Requires-Dist: pypdf==5.5.0
@@ -90,7 +91,7 @@ pip install polytext
90
91
 
91
92
  | Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
92
93
  |-------------|---------------------------------------------------------------------------------|------------------|-----------------|
93
- | **Python** | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
94
+ | **Python** | Supported on **3.11 – 3.13**<br> WeasyPrint still requires its native libraries | `brew install python@3.11` | `sudo apt install python3.11` |
94
95
  | **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
95
96
  | **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
96
97
 
@@ -35,7 +35,7 @@ pip install polytext
35
35
 
36
36
  | Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
37
37
  |-------------|---------------------------------------------------------------------------------|------------------|-----------------|
38
- | **Python** | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
38
+ | **Python** | Supported on **3.11 – 3.13**<br> WeasyPrint still requires its native libraries | `brew install python@3.11` | `sudo apt install python3.11` |
39
39
  | **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
40
40
  | **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
41
41
 
@@ -5,6 +5,7 @@ import tempfile
5
5
  import time
6
6
  import mimetypes
7
7
  import uuid
8
+ import re
8
9
  import ffmpeg
9
10
  from retry import retry
10
11
  from google import genai
@@ -42,11 +43,28 @@ INJECTION_GUARD_SYSTEM_INSTRUCTION = (
42
43
  AUDIO_MIN_OUTPUT_TOKENS = 500
43
44
  AUDIO_TAIL_REPETITION_LINES = int(os.getenv("AUDIO_TAIL_REPETITION_LINES", "200"))
44
45
  AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHOLD", "0.35"))
45
- AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
46
+ AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
46
47
  AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
47
48
  AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
48
49
  AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
49
50
  AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
51
+ NO_HUMAN_SPEECH_MARKER = "no human speech detected"
52
+
53
+
54
+ def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
55
+ if not text:
56
+ return "", False
57
+
58
+ marker_line_pattern = re.compile(r"(?im)^\s*no human speech detected\s*$")
59
+ non_empty_lines = [line.strip() for line in text.splitlines() if line.strip()]
60
+ if non_empty_lines and all(line.lower() == NO_HUMAN_SPEECH_MARKER for line in non_empty_lines):
61
+ return "", True
62
+
63
+ cleaned_text = marker_line_pattern.sub("", text)
64
+ cleaned_text = re.sub(r"(?i)\bno human speech detected\b", "", cleaned_text)
65
+ cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
66
+ return cleaned_text, False
67
+
50
68
 
51
69
  def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
52
70
  """
@@ -74,7 +92,7 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
74
92
  logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
75
93
  ffmpeg.input(input_path).output(
76
94
  temp_audio_path,
77
- q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
95
+ q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
78
96
  acodec='libmp3lame',
79
97
  ac=1, # Convert to mono
80
98
  ar=16000, # Lower sample rate
@@ -86,6 +104,7 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
86
104
  logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
87
105
  return temp_audio_path
88
106
 
107
+
89
108
  def transcribe_full_audio(audio_file, markdown_output: bool = False,
90
109
  llm_api_key: str = None,
91
110
  save_transcript_chunks: bool = False, bitrate_quality=9,
@@ -118,16 +137,19 @@ def transcribe_full_audio(audio_file, markdown_output: bool = False,
118
137
  max_llm_tokens=max_llm_tokens, max_output_tokens=max_output_tokens)
119
138
  return converter.transcribe_full_audio(audio_file, save_transcript_chunks)
120
139
 
140
+
121
141
  class AudioToTextConverter:
122
- def __init__(self, transcription_model: str ="gemini-3.1-flash-lite-preview", transcription_model_provider: str ="google",
123
- k: int =5, min_matches: int =3, markdown_output: bool =True, llm_api_key: str =None, max_llm_tokens: int =4250,
124
- max_output_tokens: int | None =None, temp_dir: str ="temp",
125
- bitrate_quality: int =9, timeout_minutes: int =None):
142
+ def __init__(self, transcription_model: str = "gemini-3.1-flash-lite",
143
+ transcription_model_provider: str = "google",
144
+ k: int = 5, min_matches: int = 3, markdown_output: bool = True, llm_api_key: str = None,
145
+ max_llm_tokens: int = 4250,
146
+ max_output_tokens: int | None = None, temp_dir: str = "temp",
147
+ bitrate_quality: int = 9, timeout_minutes: int = None):
126
148
  """
127
149
  Initialize the AudioToTextConverter class with a specified transcription model and provider.
128
150
 
129
151
  Args:
130
- transcription_model (str): Model name for transcription. Defaults to "gemini-3.1-flash-lite-preview".
152
+ transcription_model (str): Model name for transcription. Defaults to "gemini-3.1-flash-lite".
131
153
  transcription_model_provider (str): Provider of transcription service. Defaults to "google".
132
154
  k (int): Number of words to use when searching for overlap between chunks. Defaults to 5.
133
155
  min_matches (int): Minimum matching words for chunk merging. Defaults to 3.
@@ -401,8 +423,10 @@ class AudioToTextConverter:
401
423
  code=997,
402
424
  )
403
425
 
426
+ response_text, marker_only = normalize_no_human_speech_marker(response_text)
427
+
404
428
  response_dict = {
405
- "transcript": response_text if "no human speech detected" not in response_text.lower() else "",
429
+ "transcript": "" if marker_only else response_text,
406
430
  "completion_tokens": completion_tokens,
407
431
  "prompt_tokens": prompt_tokens,
408
432
  "completion_model": self.transcription_model,
@@ -10,7 +10,11 @@ from google import genai
10
10
  from google.genai import types
11
11
  from google.api_core import exceptions as google_exceptions
12
12
 
13
- from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
13
+ from ..prompts.ocr import (
14
+ OCR_TO_MARKDOWN_PROMPT,
15
+ OCR_TO_PLAIN_TEXT_PROMPT,
16
+ build_ocr_prompt,
17
+ )
14
18
  from ..exceptions.base import EmptyDocument, ExceededMaxPages
15
19
  from .gemini_quality_guards import (
16
20
  extract_finish_reason,
@@ -103,6 +107,7 @@ def get_document_ocr(
103
107
  timeout_minutes=None,
104
108
  ocr_model: str | None = None,
105
109
  max_output_tokens: int | None = None,
110
+ include_image_descriptions: bool = False,
106
111
  ):
107
112
  """
108
113
  Convenience function to extract text from an image file using OCR, optionally formatted as Markdown.
@@ -123,26 +128,30 @@ def get_document_ocr(
123
128
  ocr_model (str | None, optional): Gemini OCR model to use. Defaults to the converter default.
124
129
  max_output_tokens (int | None, optional): Maximum Gemini output tokens.
125
130
  Defaults to the converter default.
131
+ include_image_descriptions (bool, optional): If True, OCR prompts include
132
+ brief functional descriptions for meaningful non-text images.
133
+ Defaults to False.
126
134
 
127
135
  Returns:
128
136
  dict: Dictionary containing the OCR results and metadata.
129
137
  """
130
138
  converter = DocumentOCRToTextConverter(
131
- ocr_model=ocr_model or "gemini-3.1-flash-lite-preview",
139
+ ocr_model=ocr_model or "gemini-3.1-flash-lite",
132
140
  markdown_output=markdown_output,
133
141
  llm_api_key=llm_api_key,
134
142
  target_size=target_size,
135
143
  page_range=page_range,
136
144
  timeout_minutes=timeout_minutes,
137
145
  max_output_tokens=max_output_tokens,
146
+ include_image_descriptions=include_image_descriptions,
138
147
  )
139
148
  return converter.get_document_ocr(document_for_ocr)
140
149
 
141
150
  class DocumentOCRToTextConverter:
142
- def __init__(self, ocr_model="gemini-3.1-flash-lite-preview", ocr_model_provider="google",
151
+ def __init__(self, ocr_model="gemini-3.1-flash-lite", ocr_model_provider="google",
143
152
  markdown_output=True, llm_api_key=None, target_size=1, temp_dir="temp",
144
153
  page_range=None, timeout_minutes: int = None, fallback_stage: int = 0,
145
- max_output_tokens: int | None = None):
154
+ max_output_tokens: int | None = None, include_image_descriptions: bool = False):
146
155
  """
147
156
  Initialize the DocumentOCRToTextConverter class with specified OCR model and formatting options.
148
157
 
@@ -150,7 +159,7 @@ class DocumentOCRToTextConverter:
150
159
  It supports various image formats and can output either plain text or markdown.
151
160
 
152
161
  Args:
153
- ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite-preview".
162
+ ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite".
154
163
  ocr_model_provider (str): Provider of OCR service. Defaults to "google".
155
164
  markdown_output (bool): Enable markdown formatting in output. Defaults to True.
156
165
  llm_api_key (str, optional): Override API key for language model. Defaults to None.
@@ -162,6 +171,9 @@ class DocumentOCRToTextConverter:
162
171
  Defaults to 0.
163
172
  max_output_tokens (int | None, optional): Maximum Gemini output tokens.
164
173
  Defaults to `OCR_MAX_OUTPUT_TOKENS`.
174
+ include_image_descriptions (bool, optional): If True, OCR prompts include
175
+ brief functional descriptions for meaningful non-text images.
176
+ Defaults to False.
165
177
 
166
178
  Raises:
167
179
  OSError: If temp directory creation fails
@@ -174,6 +186,7 @@ class DocumentOCRToTextConverter:
174
186
  self.target_size = target_size
175
187
  self.page_range = page_range
176
188
  self.timeout_minutes = timeout_minutes
189
+ self.include_image_descriptions = include_image_descriptions
177
190
  requested_output_tokens = OCR_MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
178
191
  self.max_output_tokens = max(requested_output_tokens, OCR_MIN_OUTPUT_TOKENS)
179
192
  self.fallback_stage = fallback_stage
@@ -187,6 +200,13 @@ class DocumentOCRToTextConverter:
187
200
  os.makedirs(self.temp_dir, exist_ok=True)
188
201
  tempfile.tempdir = self.temp_dir
189
202
 
203
+ def _build_prompt_template(self) -> str:
204
+ base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
205
+ return build_ocr_prompt(
206
+ base_prompt,
207
+ include_image_descriptions=self.include_image_descriptions,
208
+ )
209
+
190
210
  def should_fallback_temperature_retry(self, error: EmptyDocument, temperature: float) -> bool:
191
211
  if self.fallback_stage != 0:
192
212
  return False
@@ -233,6 +253,7 @@ class DocumentOCRToTextConverter:
233
253
  timeout_minutes=self.timeout_minutes,
234
254
  fallback_stage=fallback_stage,
235
255
  max_output_tokens=self.max_output_tokens,
256
+ include_image_descriptions=self.include_image_descriptions,
236
257
  )
237
258
  result = fallback_converter.get_ocr(
238
259
  file_for_ocr=file_for_ocr,
@@ -289,12 +310,9 @@ class DocumentOCRToTextConverter:
289
310
 
290
311
  if self.markdown_output:
291
312
  logger.info("Using prompt for markdown format")
292
- # Convert the text to markdown format
293
- prompt_template = OCR_TO_MARKDOWN_PROMPT
294
313
  else:
295
314
  logger.info("Using prompt for plain text format")
296
- # Convert the text to plain text format
297
- prompt_template = OCR_TO_PLAIN_TEXT_PROMPT
315
+ prompt_template = self._build_prompt_template()
298
316
 
299
317
  try:
300
318
  if self.llm_api_key:
@@ -18,7 +18,11 @@ from openai import (
18
18
  InternalServerError,
19
19
  )
20
20
 
21
- from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
21
+ from ..prompts.ocr import (
22
+ OCR_TO_MARKDOWN_PROMPT,
23
+ OCR_TO_PLAIN_TEXT_PROMPT,
24
+ build_ocr_prompt,
25
+ )
22
26
  from ..exceptions.base import EmptyDocument, ExceededMaxPages
23
27
 
24
28
  logger = logging.getLogger(__name__)
@@ -95,6 +99,7 @@ def get_document_ocr(
95
99
  page_range=None,
96
100
  timeout_minutes=None,
97
101
  ocr_model="gpt-5-mini", # Azure deployment name
102
+ include_image_descriptions: bool = False,
98
103
  ):
99
104
  """
100
105
  Convenience function to OCR a document (PDF) using Azure OpenAI vision.
@@ -106,6 +111,7 @@ def get_document_ocr(
106
111
  target_size=target_size,
107
112
  page_range=page_range,
108
113
  timeout_minutes=timeout_minutes,
114
+ include_image_descriptions=include_image_descriptions,
109
115
  )
110
116
  return converter.get_document_ocr(document_for_ocr)
111
117
 
@@ -127,6 +133,7 @@ class DocumentOCRToTextConverter:
127
133
  azure_api_version=None, # your resource-supported API version
128
134
  max_tokens=4096, # avoid truncation
129
135
  max_workers=None, # ThreadPoolExecutor workers (None = default)
136
+ include_image_descriptions: bool = False,
130
137
  ):
131
138
  if ocr_model is None:
132
139
  ocr_model = "gpt-4.1-mini"
@@ -139,6 +146,7 @@ class DocumentOCRToTextConverter:
139
146
  self.timeout_minutes = timeout_minutes
140
147
  self.max_tokens = max_tokens
141
148
  self.max_workers = max_workers
149
+ self.include_image_descriptions = include_image_descriptions
142
150
 
143
151
  # Azure config
144
152
  self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
@@ -154,6 +162,13 @@ class DocumentOCRToTextConverter:
154
162
  if not self.azure_api_version:
155
163
  raise ValueError("Missing Azure API version. Set azure_api_version or AZURE_OPENAI_API_VERSION.")
156
164
 
165
+ def _build_prompt_template(self) -> str:
166
+ base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
167
+ return build_ocr_prompt(
168
+ base_prompt,
169
+ include_image_descriptions=self.include_image_descriptions,
170
+ )
171
+
157
172
  def _build_client(self) -> AzureOpenAI:
158
173
  azure_api_key = self.llm_api_key or os.getenv("AZURE_OPENAI_API_KEY")
159
174
  if not azure_api_key:
@@ -189,7 +204,7 @@ class DocumentOCRToTextConverter:
189
204
  temp_file_for_ocr = None
190
205
  start_time = time.time()
191
206
 
192
- prompt_template = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
207
+ prompt_template = self._build_prompt_template()
193
208
  logger.info("Using prompt for %s format", "markdown" if self.markdown_output else "plain text")
194
209
 
195
210
  client = self._build_client()
@@ -370,4 +385,4 @@ class DocumentOCRToTextConverter:
370
385
  start_page = 0
371
386
  end_page = total_pages
372
387
 
373
- return start_page, end_page
388
+ return start_page, end_page
@@ -20,6 +20,23 @@ def repetition_ratio(items: list[str], min_occurrences: int = 2) -> float:
20
20
  return repeated_items / len(items)
21
21
 
22
22
 
23
+ def has_consecutive_repetition(items: list[str], min_run_length: int = 3) -> bool:
24
+ previous = None
25
+ run_length = 0
26
+
27
+ for item in items:
28
+ if item == previous:
29
+ run_length += 1
30
+ else:
31
+ previous = item
32
+ run_length = 1
33
+
34
+ if run_length >= min_run_length:
35
+ return True
36
+
37
+ return False
38
+
39
+
23
40
  def tail_has_excessive_repetition(
24
41
  text: str,
25
42
  tail_lines: int,
@@ -30,10 +47,14 @@ def tail_has_excessive_repetition(
30
47
 
31
48
  lines = [normalize_text_line(line) for line in text.splitlines() if normalize_text_line(line)]
32
49
  tail = lines[-tail_lines:] if len(lines) > tail_lines else lines
50
+ if has_consecutive_repetition(tail):
51
+ return True
33
52
  if len(tail) >= 4 and repetition_ratio(tail) >= threshold:
34
53
  return True
35
54
 
36
55
  sentences = split_sentences("\n".join(tail))
56
+ if has_consecutive_repetition(sentences):
57
+ return True
37
58
  if len(sentences) >= 4 and repetition_ratio(sentences) >= threshold:
38
59
  return True
39
60
 
@@ -10,7 +10,11 @@ from google import genai
10
10
  from google.genai import types
11
11
  from google.api_core import exceptions as google_exceptions
12
12
 
13
- from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
13
+ from ..prompts.ocr import (
14
+ OCR_TO_MARKDOWN_PROMPT,
15
+ OCR_TO_PLAIN_TEXT_PROMPT,
16
+ build_ocr_prompt,
17
+ )
14
18
  from ..exceptions import EmptyDocument
15
19
  from .gemini_quality_guards import (
16
20
  extract_finish_reason,
@@ -102,6 +106,7 @@ def get_ocr(
102
106
  timeout_minutes=None,
103
107
  ocr_model: str | None = None,
104
108
  max_output_tokens: int | None = None,
109
+ include_image_descriptions: bool = False,
105
110
  ):
106
111
  """
107
112
  Convenience function to extract text from an image file using OCR, optionally formatted as Markdown.
@@ -121,24 +126,29 @@ def get_ocr(
121
126
  ocr_model (str | None, optional): Gemini OCR model to use. Defaults to the converter default.
122
127
  max_output_tokens (int | None, optional): Maximum Gemini output tokens.
123
128
  Defaults to the converter default.
129
+ include_image_descriptions (bool, optional): If True, OCR prompts include
130
+ brief functional descriptions for meaningful non-text images.
131
+ Defaults to False.
124
132
 
125
133
  Returns:
126
134
  dict: Dictionary containing the OCR results and metadata.
127
135
  """
128
136
  converter = OCRToTextConverter(
129
- ocr_model=ocr_model or "gemini-3.1-flash-lite-preview",
137
+ ocr_model=ocr_model or "gemini-3.1-flash-lite",
130
138
  markdown_output=markdown_output,
131
139
  llm_api_key=llm_api_key,
132
140
  target_size=target_size,
133
141
  timeout_minutes=timeout_minutes,
134
142
  max_output_tokens=max_output_tokens,
143
+ include_image_descriptions=include_image_descriptions,
135
144
  )
136
145
  return converter.get_ocr(file_for_ocr)
137
146
 
138
147
  class OCRToTextConverter:
139
- def __init__(self, ocr_model="gemini-3.1-flash-lite-preview", ocr_model_provider="google",
148
+ def __init__(self, ocr_model="gemini-3.1-flash-lite", ocr_model_provider="google",
140
149
  markdown_output=True, llm_api_key=None, target_size=1, temp_dir="temp",
141
- timeout_minutes=None, fallback_stage: int = 0, max_output_tokens: int | None = None):
150
+ timeout_minutes=None, fallback_stage: int = 0, max_output_tokens: int | None = None,
151
+ include_image_descriptions: bool = False):
142
152
  """
143
153
  Initialize the OCRToTextConverter class with specified OCR model and formatting options.
144
154
 
@@ -146,7 +156,7 @@ class OCRToTextConverter:
146
156
  It supports various image formats and can output either plain text or markdown.
147
157
 
148
158
  Args:
149
- ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite-preview".
159
+ ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite".
150
160
  ocr_model_provider (str): Provider of OCR service. Defaults to "google".
151
161
  markdown_output (bool): Enable markdown formatting in output. Defaults to True.
152
162
  llm_api_key (str, optional): Override API key for language model. Defaults to None.
@@ -157,6 +167,9 @@ class OCRToTextConverter:
157
167
  Defaults to 0.
158
168
  max_output_tokens (int | None, optional): Maximum Gemini output tokens.
159
169
  Defaults to `OCR_MAX_OUTPUT_TOKENS`.
170
+ include_image_descriptions (bool, optional): If True, OCR prompts include
171
+ brief functional descriptions for meaningful non-text images.
172
+ Defaults to False.
160
173
 
161
174
  Raises:
162
175
  OSError: If temp directory creation fails
@@ -168,6 +181,7 @@ class OCRToTextConverter:
168
181
  self.llm_api_key = llm_api_key
169
182
  self.target_size = target_size
170
183
  self.timeout_minutes = timeout_minutes
184
+ self.include_image_descriptions = include_image_descriptions
171
185
  requested_output_tokens = OCR_MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
172
186
  self.max_output_tokens = max(requested_output_tokens, OCR_MIN_OUTPUT_TOKENS)
173
187
  self.fallback_stage = fallback_stage
@@ -181,6 +195,13 @@ class OCRToTextConverter:
181
195
  os.makedirs(self.temp_dir, exist_ok=True)
182
196
  tempfile.tempdir = self.temp_dir
183
197
 
198
+ def _build_prompt_template(self) -> str:
199
+ base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
200
+ return build_ocr_prompt(
201
+ base_prompt,
202
+ include_image_descriptions=self.include_image_descriptions,
203
+ )
204
+
184
205
  def should_fallback_temperature_retry(self, error: EmptyDocument, temperature: float) -> bool:
185
206
  if self.fallback_stage != 0:
186
207
  return False
@@ -226,6 +247,7 @@ class OCRToTextConverter:
226
247
  timeout_minutes=self.timeout_minutes,
227
248
  fallback_stage=fallback_stage,
228
249
  max_output_tokens=self.max_output_tokens,
250
+ include_image_descriptions=self.include_image_descriptions,
229
251
  )
230
252
  result = fallback_converter.get_ocr(
231
253
  file_for_ocr=file_for_ocr,
@@ -282,12 +304,9 @@ class OCRToTextConverter:
282
304
 
283
305
  if self.markdown_output:
284
306
  logger.info("Using prompt for markdown format")
285
- # Convert the text to markdown format
286
- prompt_template = OCR_TO_MARKDOWN_PROMPT
287
307
  else:
288
308
  logger.info("Using prompt for plain text format")
289
- # Convert the text to plain text format
290
- prompt_template = OCR_TO_PLAIN_TEXT_PROMPT
309
+ prompt_template = self._build_prompt_template()
291
310
 
292
311
  try:
293
312
  if self.llm_api_key:
@@ -18,7 +18,11 @@ from openai import (
18
18
  InternalServerError,
19
19
  )
20
20
 
21
- from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
21
+ from ..prompts.ocr import (
22
+ OCR_TO_MARKDOWN_PROMPT,
23
+ OCR_TO_PLAIN_TEXT_PROMPT,
24
+ build_ocr_prompt,
25
+ )
22
26
 
23
27
  logger = logging.getLogger(__name__)
24
28
 
@@ -86,7 +90,14 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
86
90
  raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
87
91
 
88
92
 
89
- def get_ocr(file_for_ocr, markdown_output=False, llm_api_key=None, target_size=1, timeout_minutes=None):
93
+ def get_ocr(
94
+ file_for_ocr,
95
+ markdown_output=False,
96
+ llm_api_key=None,
97
+ target_size=1,
98
+ timeout_minutes=None,
99
+ include_image_descriptions: bool = False,
100
+ ):
90
101
  """
91
102
  Convenience function to extract text from an image file using OCR (Azure OpenAI),
92
103
  optionally formatted as Markdown.
@@ -96,6 +107,7 @@ def get_ocr(file_for_ocr, markdown_output=False, llm_api_key=None, target_size=1
96
107
  llm_api_key=llm_api_key,
97
108
  target_size=target_size,
98
109
  timeout_minutes=timeout_minutes,
110
+ include_image_descriptions=include_image_descriptions,
99
111
  )
100
112
  return converter.get_ocr(file_for_ocr)
101
113
 
@@ -114,6 +126,7 @@ class OCRToTextConverter:
114
126
  azure_endpoint=None, # e.g. https://<resource>.openai.azure.com
115
127
  azure_api_version=None, # e.g. "2024-10-21" (use your resource-supported version)
116
128
  max_tokens=4096, # avoid truncation
129
+ include_image_descriptions: bool = False,
117
130
  ):
118
131
  self.ocr_model = ocr_model
119
132
  self.ocr_model_provider = ocr_model_provider
@@ -122,6 +135,7 @@ class OCRToTextConverter:
122
135
  self.target_size = target_size
123
136
  self.timeout_minutes = timeout_minutes
124
137
  self.max_tokens = max_tokens
138
+ self.include_image_descriptions = include_image_descriptions
125
139
 
126
140
  # Azure config
127
141
  self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
@@ -137,6 +151,13 @@ class OCRToTextConverter:
137
151
  if not self.azure_api_version:
138
152
  raise ValueError("Missing Azure API version. Set azure_api_version or AZURE_OPENAI_API_VERSION.")
139
153
 
154
+ def _build_prompt_template(self) -> str:
155
+ base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
156
+ return build_ocr_prompt(
157
+ base_prompt,
158
+ include_image_descriptions=self.include_image_descriptions,
159
+ )
160
+
140
161
  @retry(
141
162
  (
142
163
  APITimeoutError,
@@ -158,7 +179,7 @@ class OCRToTextConverter:
158
179
  temp_file_for_ocr = None
159
180
  start_time = time.time()
160
181
 
161
- prompt_template = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
182
+ prompt_template = self._build_prompt_template()
162
183
  logger.info("Using prompt for %s format", "markdown" if self.markdown_output else "plain text")
163
184
 
164
185
  # Build Azure client
@@ -252,4 +273,4 @@ class OCRToTextConverter:
252
273
  finally:
253
274
  # Clean up the temporary compressed file (only if we created one)
254
275
  if temp_file_for_ocr and temp_file_for_ocr != file_for_ocr and os.path.exists(temp_file_for_ocr):
255
- os.remove(temp_file_for_ocr)
276
+ os.remove(temp_file_for_ocr)
@@ -54,7 +54,7 @@ class TextToMdConverter:
54
54
  overlap_chars: int = 500,
55
55
  k: int = 5,
56
56
  min_matches: int = 3,
57
- model: str = "gemini-3.1-flash-lite-preview",
57
+ model: str = "gemini-3.1-flash-lite",
58
58
  model_provider: str = "google",
59
59
  ) -> None:
60
60
  """
@@ -39,11 +39,20 @@ dotenv.load_dotenv()
39
39
  logger = logging.getLogger(__name__)
40
40
 
41
41
  MIN_DOC_TEXT_LENGTH_ACCEPTED = int(os.getenv("MIN_DOC_TEXT_LENGTH_ACCEPTED", "400"))
42
+ OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV = "OCR_INCLUDE_IMAGE_DESCRIPTIONS"
43
+
44
+
45
+ def _read_bool_env(name: str, default: bool = False) -> bool:
46
+ value = os.getenv(name)
47
+ if value is None:
48
+ return default
49
+ return value.strip().lower() in {"1", "true", "yes", "y", "on"}
42
50
 
43
51
 
44
52
  class BaseLoader:
45
53
  def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
46
- ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None, **kwargs):
54
+ ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
55
+ include_image_descriptions: bool | None = None, **kwargs):
47
56
  """
48
57
  Initialize the BaseLoader with cloud storage and LLM configurations.
49
58
 
@@ -58,6 +67,9 @@ class BaseLoader:
58
67
  provider (str, optional): Provider of the model. Default to "google".
59
68
  ocr_model (str, optional): OCR model to use for text extraction from images. Defaults to "gpt-5-mini".
60
69
  timeout_minutes (int, optional): Timeout in minutes. Defaults to None.
70
+ include_image_descriptions (bool | None, optional): If True, OCR prompts
71
+ include brief functional descriptions for meaningful non-text images.
72
+ If None, defaults from OCR_INCLUDE_IMAGE_DESCRIPTIONS. Defaults to None.
61
73
  **kwargs: Additional keyword arguments to pass to the underlying loader or extraction logic.
62
74
  - target_size (int, optional): Target file size in bytes. Defaults to 1MB
63
75
  - source (str): Source of the document. Must be either "cloud" or "local"
@@ -76,6 +88,11 @@ class BaseLoader:
76
88
  self.provider = provider
77
89
  self.ocr_model = ocr_model
78
90
  self.timeout_minutes = timeout_minutes
91
+ self.include_image_descriptions = (
92
+ _read_bool_env(OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV)
93
+ if include_image_descriptions is None
94
+ else include_image_descriptions
95
+ )
79
96
  self.kwargs = kwargs
80
97
  self.target_size = kwargs.get("target_size", 1)
81
98
  self.source = kwargs.get("source", "cloud")
@@ -287,7 +304,7 @@ class BaseLoader:
287
304
  file_extension = file_extension.lower()
288
305
 
289
306
  if is_document_fallback:
290
- return DocumentOCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, ocr_provider=self.provider, ocr_model=self.ocr_model, **kwargs)
307
+ return DocumentOCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, ocr_provider=self.provider, ocr_model=self.ocr_model, include_image_descriptions=self.include_image_descriptions, **kwargs)
291
308
 
292
309
  if file_extension in [".xml", ".xbrl"]:
293
310
  return XmlXbrlLoader(temp_dir=self.temp_dir, markdown_output=self.markdown_output, **kwargs)
@@ -308,7 +325,7 @@ class BaseLoader:
308
325
  elif mime_type.startswith("video/"):
309
326
  return VideoLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
310
327
  elif mime_type.startswith("image/"):
311
- return OCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
328
+ return OCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, include_image_descriptions=self.include_image_descriptions, **kwargs)
312
329
  elif mime_type.startswith("text/markdown"):
313
330
  return MarkdownLoader(markdown_output=self.markdown_output, temp_dir=self.temp_dir, **kwargs)
314
331
  elif mime_type == "text/html":
@@ -45,6 +45,7 @@ class DocumentOCRLoader:
45
45
  timeout_minutes: int = None,
46
46
  ocr_provider: str = "google",
47
47
  ocr_model: str | None = None,
48
+ include_image_descriptions: bool = False,
48
49
  **kwargs
49
50
  ):
50
51
  """
@@ -78,6 +79,9 @@ class DocumentOCRLoader:
78
79
  - For Google Gemini: optional / usually ignored.
79
80
  - For Azure OpenAI: **deployment name** (e.g. "gpt-5-mini").
80
81
  Defaults to None.
82
+ include_image_descriptions (bool, optional): If True, OCR prompts include
83
+ brief functional descriptions for meaningful non-text images.
84
+ Defaults to False.
81
85
  **kwargs:
82
86
  max_output_tokens (int, optional): Maximum Gemini output tokens for
83
87
  Google document OCR generation.
@@ -100,6 +104,7 @@ class DocumentOCRLoader:
100
104
 
101
105
  self.ocr_provider = (ocr_provider or "google").lower()
102
106
  self.ocr_model = ocr_model
107
+ self.include_image_descriptions = include_image_descriptions
103
108
  self.max_output_tokens = kwargs.get("max_output_tokens")
104
109
 
105
110
  # Set up custom temp directory
@@ -248,6 +253,7 @@ class DocumentOCRLoader:
248
253
  page_range=self.page_range,
249
254
  timeout_minutes=self.timeout_minutes,
250
255
  ocr_model=self.ocr_model or None,
256
+ include_image_descriptions=self.include_image_descriptions,
251
257
  )
252
258
  else:
253
259
  result_dict = ocr_fn(
@@ -263,6 +269,7 @@ class DocumentOCRLoader:
263
269
  else None
264
270
  ),
265
271
  max_output_tokens=self.max_output_tokens,
272
+ include_image_descriptions=self.include_image_descriptions,
266
273
  )
267
274
 
268
275
  result_dict["type"] = self.type