polytext 0.2.0__tar.gz → 0.2.2b1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.0 → polytext-0.2.2b1}/PKG-INFO +4 -3
- {polytext-0.2.0 → polytext-0.2.2b1}/README.md +1 -1
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/audio_to_text.py +32 -8
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/document_ocr_to_text.py +27 -9
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/document_ocr_to_text_azure_oai.py +18 -3
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/gemini_quality_guards.py +21 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/ocr_to_text.py +28 -9
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/ocr_to_text_azure_oai.py +25 -4
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/text_to_md.py +1 -1
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/base.py +20 -3
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/document_ocr.py +7 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/ocr.py +7 -1
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/youtube_llm.py +86 -14
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/text_merger.py +1 -1
- polytext-0.2.2b1/polytext/prompts/ocr.py +38 -0
- polytext-0.2.2b1/polytext/prompts/transcription.py +305 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/utils/utils.py +2 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/PKG-INFO +4 -3
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/SOURCES.txt +2 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/pyproject.toml +2 -2
- {polytext-0.2.0 → polytext-0.2.2b1}/setup.py +3 -2
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_audio_transcription_model_migration.py +101 -11
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_document_ocr_to_text_models.py +1 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_extracted_text_whitespace.py +18 -19
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_ocr_from_image.py +5 -2
- polytext-0.2.2b1/tests/test_ocr_image_descriptions.py +167 -0
- polytext-0.2.2b1/tests/test_youtube_gemini_minimal_check.py +175 -0
- polytext-0.2.2b1/tests/test_youtube_transcript.py +65 -0
- polytext-0.2.0/polytext/prompts/ocr.py +0 -16
- polytext-0.2.0/polytext/prompts/transcription.py +0 -190
- polytext-0.2.0/tests/test_youtube_transcript.py +0 -45
- {polytext-0.2.0 → polytext-0.2.2b1}/LICENSE +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/base.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/pdf.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/converter/video_to_audio.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/audio.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/document.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/html.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/video.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/setup.cfg +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_audio_transcript_from_gcs.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_library.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_ocr_fallbacks.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_pain_text.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.0 → polytext-0.2.2b1}/tests/test_xml_xbrl_loader.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: polytext
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2b1
|
|
4
4
|
Summary: Python utilities to simplify document files management
|
|
5
5
|
Home-page: https://github.com/docsity/polytext
|
|
6
6
|
Author: Matteo Senardi
|
|
@@ -8,11 +8,12 @@ Author-email: matteo.s@docsity.com
|
|
|
8
8
|
License: MIT
|
|
9
9
|
Classifier: License :: OSI Approved :: MIT License
|
|
10
10
|
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
11
12
|
Classifier: Programming Language :: Python :: 3.12
|
|
12
13
|
Classifier: Programming Language :: Python :: 3.13
|
|
13
14
|
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
14
15
|
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
15
|
-
Requires-Python: >=3.
|
|
16
|
+
Requires-Python: >=3.11
|
|
16
17
|
Description-Content-Type: text/markdown
|
|
17
18
|
License-File: LICENSE
|
|
18
19
|
Requires-Dist: pypdf==5.5.0
|
|
@@ -90,7 +91,7 @@ pip install polytext
|
|
|
90
91
|
|
|
91
92
|
| Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
|
|
92
93
|
|-------------|---------------------------------------------------------------------------------|------------------|-----------------|
|
|
93
|
-
| **Python** |
|
|
94
|
+
| **Python** | Supported on **3.11 – 3.13**<br> WeasyPrint still requires its native libraries | `brew install python@3.11` | `sudo apt install python3.11` |
|
|
94
95
|
| **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
|
|
95
96
|
| **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
|
|
96
97
|
|
|
@@ -35,7 +35,7 @@ pip install polytext
|
|
|
35
35
|
|
|
36
36
|
| Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
|
|
37
37
|
|-------------|---------------------------------------------------------------------------------|------------------|-----------------|
|
|
38
|
-
| **Python** |
|
|
38
|
+
| **Python** | Supported on **3.11 – 3.13**<br> WeasyPrint still requires its native libraries | `brew install python@3.11` | `sudo apt install python3.11` |
|
|
39
39
|
| **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
|
|
40
40
|
| **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
|
|
41
41
|
|
|
@@ -5,6 +5,7 @@ import tempfile
|
|
|
5
5
|
import time
|
|
6
6
|
import mimetypes
|
|
7
7
|
import uuid
|
|
8
|
+
import re
|
|
8
9
|
import ffmpeg
|
|
9
10
|
from retry import retry
|
|
10
11
|
from google import genai
|
|
@@ -42,11 +43,28 @@ INJECTION_GUARD_SYSTEM_INSTRUCTION = (
|
|
|
42
43
|
AUDIO_MIN_OUTPUT_TOKENS = 500
|
|
43
44
|
AUDIO_TAIL_REPETITION_LINES = int(os.getenv("AUDIO_TAIL_REPETITION_LINES", "200"))
|
|
44
45
|
AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHOLD", "0.35"))
|
|
45
|
-
AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite
|
|
46
|
+
AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
|
|
46
47
|
AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
47
48
|
AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
|
|
48
49
|
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
|
|
49
50
|
AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
|
|
51
|
+
NO_HUMAN_SPEECH_MARKER = "no human speech detected"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
|
|
55
|
+
if not text:
|
|
56
|
+
return "", False
|
|
57
|
+
|
|
58
|
+
marker_line_pattern = re.compile(r"(?im)^\s*no human speech detected\s*$")
|
|
59
|
+
non_empty_lines = [line.strip() for line in text.splitlines() if line.strip()]
|
|
60
|
+
if non_empty_lines and all(line.lower() == NO_HUMAN_SPEECH_MARKER for line in non_empty_lines):
|
|
61
|
+
return "", True
|
|
62
|
+
|
|
63
|
+
cleaned_text = marker_line_pattern.sub("", text)
|
|
64
|
+
cleaned_text = re.sub(r"(?i)\bno human speech detected\b", "", cleaned_text)
|
|
65
|
+
cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip()
|
|
66
|
+
return cleaned_text, False
|
|
67
|
+
|
|
50
68
|
|
|
51
69
|
def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
|
|
52
70
|
"""
|
|
@@ -74,7 +92,7 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
|
|
|
74
92
|
logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
|
|
75
93
|
ffmpeg.input(input_path).output(
|
|
76
94
|
temp_audio_path,
|
|
77
|
-
q=bitrate_quality,
|
|
95
|
+
q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
|
|
78
96
|
acodec='libmp3lame',
|
|
79
97
|
ac=1, # Convert to mono
|
|
80
98
|
ar=16000, # Lower sample rate
|
|
@@ -86,6 +104,7 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
|
|
|
86
104
|
logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
|
|
87
105
|
return temp_audio_path
|
|
88
106
|
|
|
107
|
+
|
|
89
108
|
def transcribe_full_audio(audio_file, markdown_output: bool = False,
|
|
90
109
|
llm_api_key: str = None,
|
|
91
110
|
save_transcript_chunks: bool = False, bitrate_quality=9,
|
|
@@ -118,16 +137,19 @@ def transcribe_full_audio(audio_file, markdown_output: bool = False,
|
|
|
118
137
|
max_llm_tokens=max_llm_tokens, max_output_tokens=max_output_tokens)
|
|
119
138
|
return converter.transcribe_full_audio(audio_file, save_transcript_chunks)
|
|
120
139
|
|
|
140
|
+
|
|
121
141
|
class AudioToTextConverter:
|
|
122
|
-
def __init__(self, transcription_model: str ="gemini-3.1-flash-lite
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
142
|
+
def __init__(self, transcription_model: str = "gemini-3.1-flash-lite",
|
|
143
|
+
transcription_model_provider: str = "google",
|
|
144
|
+
k: int = 5, min_matches: int = 3, markdown_output: bool = True, llm_api_key: str = None,
|
|
145
|
+
max_llm_tokens: int = 4250,
|
|
146
|
+
max_output_tokens: int | None = None, temp_dir: str = "temp",
|
|
147
|
+
bitrate_quality: int = 9, timeout_minutes: int = None):
|
|
126
148
|
"""
|
|
127
149
|
Initialize the AudioToTextConverter class with a specified transcription model and provider.
|
|
128
150
|
|
|
129
151
|
Args:
|
|
130
|
-
transcription_model (str): Model name for transcription. Defaults to "gemini-3.1-flash-lite
|
|
152
|
+
transcription_model (str): Model name for transcription. Defaults to "gemini-3.1-flash-lite".
|
|
131
153
|
transcription_model_provider (str): Provider of transcription service. Defaults to "google".
|
|
132
154
|
k (int): Number of words to use when searching for overlap between chunks. Defaults to 5.
|
|
133
155
|
min_matches (int): Minimum matching words for chunk merging. Defaults to 3.
|
|
@@ -401,8 +423,10 @@ class AudioToTextConverter:
|
|
|
401
423
|
code=997,
|
|
402
424
|
)
|
|
403
425
|
|
|
426
|
+
response_text, marker_only = normalize_no_human_speech_marker(response_text)
|
|
427
|
+
|
|
404
428
|
response_dict = {
|
|
405
|
-
"transcript": response_text,
|
|
429
|
+
"transcript": "" if marker_only else response_text,
|
|
406
430
|
"completion_tokens": completion_tokens,
|
|
407
431
|
"prompt_tokens": prompt_tokens,
|
|
408
432
|
"completion_model": self.transcription_model,
|
|
@@ -10,7 +10,11 @@ from google import genai
|
|
|
10
10
|
from google.genai import types
|
|
11
11
|
from google.api_core import exceptions as google_exceptions
|
|
12
12
|
|
|
13
|
-
from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
|
|
13
|
+
from ..prompts.ocr import (
|
|
14
|
+
OCR_TO_MARKDOWN_PROMPT,
|
|
15
|
+
OCR_TO_PLAIN_TEXT_PROMPT,
|
|
16
|
+
build_ocr_prompt,
|
|
17
|
+
)
|
|
14
18
|
from ..exceptions.base import EmptyDocument, ExceededMaxPages
|
|
15
19
|
from .gemini_quality_guards import (
|
|
16
20
|
extract_finish_reason,
|
|
@@ -103,6 +107,7 @@ def get_document_ocr(
|
|
|
103
107
|
timeout_minutes=None,
|
|
104
108
|
ocr_model: str | None = None,
|
|
105
109
|
max_output_tokens: int | None = None,
|
|
110
|
+
include_image_descriptions: bool = False,
|
|
106
111
|
):
|
|
107
112
|
"""
|
|
108
113
|
Convenience function to extract text from an image file using OCR, optionally formatted as Markdown.
|
|
@@ -123,26 +128,30 @@ def get_document_ocr(
|
|
|
123
128
|
ocr_model (str | None, optional): Gemini OCR model to use. Defaults to the converter default.
|
|
124
129
|
max_output_tokens (int | None, optional): Maximum Gemini output tokens.
|
|
125
130
|
Defaults to the converter default.
|
|
131
|
+
include_image_descriptions (bool, optional): If True, OCR prompts include
|
|
132
|
+
brief functional descriptions for meaningful non-text images.
|
|
133
|
+
Defaults to False.
|
|
126
134
|
|
|
127
135
|
Returns:
|
|
128
136
|
dict: Dictionary containing the OCR results and metadata.
|
|
129
137
|
"""
|
|
130
138
|
converter = DocumentOCRToTextConverter(
|
|
131
|
-
ocr_model=ocr_model or "gemini-3.1-flash-lite
|
|
139
|
+
ocr_model=ocr_model or "gemini-3.1-flash-lite",
|
|
132
140
|
markdown_output=markdown_output,
|
|
133
141
|
llm_api_key=llm_api_key,
|
|
134
142
|
target_size=target_size,
|
|
135
143
|
page_range=page_range,
|
|
136
144
|
timeout_minutes=timeout_minutes,
|
|
137
145
|
max_output_tokens=max_output_tokens,
|
|
146
|
+
include_image_descriptions=include_image_descriptions,
|
|
138
147
|
)
|
|
139
148
|
return converter.get_document_ocr(document_for_ocr)
|
|
140
149
|
|
|
141
150
|
class DocumentOCRToTextConverter:
|
|
142
|
-
def __init__(self, ocr_model="gemini-3.1-flash-lite
|
|
151
|
+
def __init__(self, ocr_model="gemini-3.1-flash-lite", ocr_model_provider="google",
|
|
143
152
|
markdown_output=True, llm_api_key=None, target_size=1, temp_dir="temp",
|
|
144
153
|
page_range=None, timeout_minutes: int = None, fallback_stage: int = 0,
|
|
145
|
-
max_output_tokens: int | None = None):
|
|
154
|
+
max_output_tokens: int | None = None, include_image_descriptions: bool = False):
|
|
146
155
|
"""
|
|
147
156
|
Initialize the DocumentOCRToTextConverter class with specified OCR model and formatting options.
|
|
148
157
|
|
|
@@ -150,7 +159,7 @@ class DocumentOCRToTextConverter:
|
|
|
150
159
|
It supports various image formats and can output either plain text or markdown.
|
|
151
160
|
|
|
152
161
|
Args:
|
|
153
|
-
ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite
|
|
162
|
+
ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite".
|
|
154
163
|
ocr_model_provider (str): Provider of OCR service. Defaults to "google".
|
|
155
164
|
markdown_output (bool): Enable markdown formatting in output. Defaults to True.
|
|
156
165
|
llm_api_key (str, optional): Override API key for language model. Defaults to None.
|
|
@@ -162,6 +171,9 @@ class DocumentOCRToTextConverter:
|
|
|
162
171
|
Defaults to 0.
|
|
163
172
|
max_output_tokens (int | None, optional): Maximum Gemini output tokens.
|
|
164
173
|
Defaults to `OCR_MAX_OUTPUT_TOKENS`.
|
|
174
|
+
include_image_descriptions (bool, optional): If True, OCR prompts include
|
|
175
|
+
brief functional descriptions for meaningful non-text images.
|
|
176
|
+
Defaults to False.
|
|
165
177
|
|
|
166
178
|
Raises:
|
|
167
179
|
OSError: If temp directory creation fails
|
|
@@ -174,6 +186,7 @@ class DocumentOCRToTextConverter:
|
|
|
174
186
|
self.target_size = target_size
|
|
175
187
|
self.page_range = page_range
|
|
176
188
|
self.timeout_minutes = timeout_minutes
|
|
189
|
+
self.include_image_descriptions = include_image_descriptions
|
|
177
190
|
requested_output_tokens = OCR_MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
|
|
178
191
|
self.max_output_tokens = max(requested_output_tokens, OCR_MIN_OUTPUT_TOKENS)
|
|
179
192
|
self.fallback_stage = fallback_stage
|
|
@@ -187,6 +200,13 @@ class DocumentOCRToTextConverter:
|
|
|
187
200
|
os.makedirs(self.temp_dir, exist_ok=True)
|
|
188
201
|
tempfile.tempdir = self.temp_dir
|
|
189
202
|
|
|
203
|
+
def _build_prompt_template(self) -> str:
|
|
204
|
+
base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
|
|
205
|
+
return build_ocr_prompt(
|
|
206
|
+
base_prompt,
|
|
207
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
208
|
+
)
|
|
209
|
+
|
|
190
210
|
def should_fallback_temperature_retry(self, error: EmptyDocument, temperature: float) -> bool:
|
|
191
211
|
if self.fallback_stage != 0:
|
|
192
212
|
return False
|
|
@@ -233,6 +253,7 @@ class DocumentOCRToTextConverter:
|
|
|
233
253
|
timeout_minutes=self.timeout_minutes,
|
|
234
254
|
fallback_stage=fallback_stage,
|
|
235
255
|
max_output_tokens=self.max_output_tokens,
|
|
256
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
236
257
|
)
|
|
237
258
|
result = fallback_converter.get_ocr(
|
|
238
259
|
file_for_ocr=file_for_ocr,
|
|
@@ -289,12 +310,9 @@ class DocumentOCRToTextConverter:
|
|
|
289
310
|
|
|
290
311
|
if self.markdown_output:
|
|
291
312
|
logger.info("Using prompt for markdown format")
|
|
292
|
-
# Convert the text to markdown format
|
|
293
|
-
prompt_template = OCR_TO_MARKDOWN_PROMPT
|
|
294
313
|
else:
|
|
295
314
|
logger.info("Using prompt for plain text format")
|
|
296
|
-
|
|
297
|
-
prompt_template = OCR_TO_PLAIN_TEXT_PROMPT
|
|
315
|
+
prompt_template = self._build_prompt_template()
|
|
298
316
|
|
|
299
317
|
try:
|
|
300
318
|
if self.llm_api_key:
|
|
@@ -18,7 +18,11 @@ from openai import (
|
|
|
18
18
|
InternalServerError,
|
|
19
19
|
)
|
|
20
20
|
|
|
21
|
-
from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
|
|
21
|
+
from ..prompts.ocr import (
|
|
22
|
+
OCR_TO_MARKDOWN_PROMPT,
|
|
23
|
+
OCR_TO_PLAIN_TEXT_PROMPT,
|
|
24
|
+
build_ocr_prompt,
|
|
25
|
+
)
|
|
22
26
|
from ..exceptions.base import EmptyDocument, ExceededMaxPages
|
|
23
27
|
|
|
24
28
|
logger = logging.getLogger(__name__)
|
|
@@ -95,6 +99,7 @@ def get_document_ocr(
|
|
|
95
99
|
page_range=None,
|
|
96
100
|
timeout_minutes=None,
|
|
97
101
|
ocr_model="gpt-5-mini", # Azure deployment name
|
|
102
|
+
include_image_descriptions: bool = False,
|
|
98
103
|
):
|
|
99
104
|
"""
|
|
100
105
|
Convenience function to OCR a document (PDF) using Azure OpenAI vision.
|
|
@@ -106,6 +111,7 @@ def get_document_ocr(
|
|
|
106
111
|
target_size=target_size,
|
|
107
112
|
page_range=page_range,
|
|
108
113
|
timeout_minutes=timeout_minutes,
|
|
114
|
+
include_image_descriptions=include_image_descriptions,
|
|
109
115
|
)
|
|
110
116
|
return converter.get_document_ocr(document_for_ocr)
|
|
111
117
|
|
|
@@ -127,6 +133,7 @@ class DocumentOCRToTextConverter:
|
|
|
127
133
|
azure_api_version=None, # your resource-supported API version
|
|
128
134
|
max_tokens=4096, # avoid truncation
|
|
129
135
|
max_workers=None, # ThreadPoolExecutor workers (None = default)
|
|
136
|
+
include_image_descriptions: bool = False,
|
|
130
137
|
):
|
|
131
138
|
if ocr_model is None:
|
|
132
139
|
ocr_model = "gpt-4.1-mini"
|
|
@@ -139,6 +146,7 @@ class DocumentOCRToTextConverter:
|
|
|
139
146
|
self.timeout_minutes = timeout_minutes
|
|
140
147
|
self.max_tokens = max_tokens
|
|
141
148
|
self.max_workers = max_workers
|
|
149
|
+
self.include_image_descriptions = include_image_descriptions
|
|
142
150
|
|
|
143
151
|
# Azure config
|
|
144
152
|
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
|
|
@@ -154,6 +162,13 @@ class DocumentOCRToTextConverter:
|
|
|
154
162
|
if not self.azure_api_version:
|
|
155
163
|
raise ValueError("Missing Azure API version. Set azure_api_version or AZURE_OPENAI_API_VERSION.")
|
|
156
164
|
|
|
165
|
+
def _build_prompt_template(self) -> str:
|
|
166
|
+
base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
|
|
167
|
+
return build_ocr_prompt(
|
|
168
|
+
base_prompt,
|
|
169
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
170
|
+
)
|
|
171
|
+
|
|
157
172
|
def _build_client(self) -> AzureOpenAI:
|
|
158
173
|
azure_api_key = self.llm_api_key or os.getenv("AZURE_OPENAI_API_KEY")
|
|
159
174
|
if not azure_api_key:
|
|
@@ -189,7 +204,7 @@ class DocumentOCRToTextConverter:
|
|
|
189
204
|
temp_file_for_ocr = None
|
|
190
205
|
start_time = time.time()
|
|
191
206
|
|
|
192
|
-
prompt_template = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
|
|
207
|
+
prompt_template = self._build_prompt_template()
|
|
193
208
|
logger.info("Using prompt for %s format", "markdown" if self.markdown_output else "plain text")
|
|
194
209
|
|
|
195
210
|
client = self._build_client()
|
|
@@ -370,4 +385,4 @@ class DocumentOCRToTextConverter:
|
|
|
370
385
|
start_page = 0
|
|
371
386
|
end_page = total_pages
|
|
372
387
|
|
|
373
|
-
return start_page, end_page
|
|
388
|
+
return start_page, end_page
|
|
@@ -20,6 +20,23 @@ def repetition_ratio(items: list[str], min_occurrences: int = 2) -> float:
|
|
|
20
20
|
return repeated_items / len(items)
|
|
21
21
|
|
|
22
22
|
|
|
23
|
+
def has_consecutive_repetition(items: list[str], min_run_length: int = 3) -> bool:
|
|
24
|
+
previous = None
|
|
25
|
+
run_length = 0
|
|
26
|
+
|
|
27
|
+
for item in items:
|
|
28
|
+
if item == previous:
|
|
29
|
+
run_length += 1
|
|
30
|
+
else:
|
|
31
|
+
previous = item
|
|
32
|
+
run_length = 1
|
|
33
|
+
|
|
34
|
+
if run_length >= min_run_length:
|
|
35
|
+
return True
|
|
36
|
+
|
|
37
|
+
return False
|
|
38
|
+
|
|
39
|
+
|
|
23
40
|
def tail_has_excessive_repetition(
|
|
24
41
|
text: str,
|
|
25
42
|
tail_lines: int,
|
|
@@ -30,10 +47,14 @@ def tail_has_excessive_repetition(
|
|
|
30
47
|
|
|
31
48
|
lines = [normalize_text_line(line) for line in text.splitlines() if normalize_text_line(line)]
|
|
32
49
|
tail = lines[-tail_lines:] if len(lines) > tail_lines else lines
|
|
50
|
+
if has_consecutive_repetition(tail):
|
|
51
|
+
return True
|
|
33
52
|
if len(tail) >= 4 and repetition_ratio(tail) >= threshold:
|
|
34
53
|
return True
|
|
35
54
|
|
|
36
55
|
sentences = split_sentences("\n".join(tail))
|
|
56
|
+
if has_consecutive_repetition(sentences):
|
|
57
|
+
return True
|
|
37
58
|
if len(sentences) >= 4 and repetition_ratio(sentences) >= threshold:
|
|
38
59
|
return True
|
|
39
60
|
|
|
@@ -10,7 +10,11 @@ from google import genai
|
|
|
10
10
|
from google.genai import types
|
|
11
11
|
from google.api_core import exceptions as google_exceptions
|
|
12
12
|
|
|
13
|
-
from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
|
|
13
|
+
from ..prompts.ocr import (
|
|
14
|
+
OCR_TO_MARKDOWN_PROMPT,
|
|
15
|
+
OCR_TO_PLAIN_TEXT_PROMPT,
|
|
16
|
+
build_ocr_prompt,
|
|
17
|
+
)
|
|
14
18
|
from ..exceptions import EmptyDocument
|
|
15
19
|
from .gemini_quality_guards import (
|
|
16
20
|
extract_finish_reason,
|
|
@@ -102,6 +106,7 @@ def get_ocr(
|
|
|
102
106
|
timeout_minutes=None,
|
|
103
107
|
ocr_model: str | None = None,
|
|
104
108
|
max_output_tokens: int | None = None,
|
|
109
|
+
include_image_descriptions: bool = False,
|
|
105
110
|
):
|
|
106
111
|
"""
|
|
107
112
|
Convenience function to extract text from an image file using OCR, optionally formatted as Markdown.
|
|
@@ -121,24 +126,29 @@ def get_ocr(
|
|
|
121
126
|
ocr_model (str | None, optional): Gemini OCR model to use. Defaults to the converter default.
|
|
122
127
|
max_output_tokens (int | None, optional): Maximum Gemini output tokens.
|
|
123
128
|
Defaults to the converter default.
|
|
129
|
+
include_image_descriptions (bool, optional): If True, OCR prompts include
|
|
130
|
+
brief functional descriptions for meaningful non-text images.
|
|
131
|
+
Defaults to False.
|
|
124
132
|
|
|
125
133
|
Returns:
|
|
126
134
|
dict: Dictionary containing the OCR results and metadata.
|
|
127
135
|
"""
|
|
128
136
|
converter = OCRToTextConverter(
|
|
129
|
-
ocr_model=ocr_model or "gemini-3.1-flash-lite
|
|
137
|
+
ocr_model=ocr_model or "gemini-3.1-flash-lite",
|
|
130
138
|
markdown_output=markdown_output,
|
|
131
139
|
llm_api_key=llm_api_key,
|
|
132
140
|
target_size=target_size,
|
|
133
141
|
timeout_minutes=timeout_minutes,
|
|
134
142
|
max_output_tokens=max_output_tokens,
|
|
143
|
+
include_image_descriptions=include_image_descriptions,
|
|
135
144
|
)
|
|
136
145
|
return converter.get_ocr(file_for_ocr)
|
|
137
146
|
|
|
138
147
|
class OCRToTextConverter:
|
|
139
|
-
def __init__(self, ocr_model="gemini-3.1-flash-lite
|
|
148
|
+
def __init__(self, ocr_model="gemini-3.1-flash-lite", ocr_model_provider="google",
|
|
140
149
|
markdown_output=True, llm_api_key=None, target_size=1, temp_dir="temp",
|
|
141
|
-
timeout_minutes=None, fallback_stage: int = 0, max_output_tokens: int | None = None
|
|
150
|
+
timeout_minutes=None, fallback_stage: int = 0, max_output_tokens: int | None = None,
|
|
151
|
+
include_image_descriptions: bool = False):
|
|
142
152
|
"""
|
|
143
153
|
Initialize the OCRToTextConverter class with specified OCR model and formatting options.
|
|
144
154
|
|
|
@@ -146,7 +156,7 @@ class OCRToTextConverter:
|
|
|
146
156
|
It supports various image formats and can output either plain text or markdown.
|
|
147
157
|
|
|
148
158
|
Args:
|
|
149
|
-
ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite
|
|
159
|
+
ocr_model (str): Model name for OCR processing. Defaults to "gemini-3.1-flash-lite".
|
|
150
160
|
ocr_model_provider (str): Provider of OCR service. Defaults to "google".
|
|
151
161
|
markdown_output (bool): Enable markdown formatting in output. Defaults to True.
|
|
152
162
|
llm_api_key (str, optional): Override API key for language model. Defaults to None.
|
|
@@ -157,6 +167,9 @@ class OCRToTextConverter:
|
|
|
157
167
|
Defaults to 0.
|
|
158
168
|
max_output_tokens (int | None, optional): Maximum Gemini output tokens.
|
|
159
169
|
Defaults to `OCR_MAX_OUTPUT_TOKENS`.
|
|
170
|
+
include_image_descriptions (bool, optional): If True, OCR prompts include
|
|
171
|
+
brief functional descriptions for meaningful non-text images.
|
|
172
|
+
Defaults to False.
|
|
160
173
|
|
|
161
174
|
Raises:
|
|
162
175
|
OSError: If temp directory creation fails
|
|
@@ -168,6 +181,7 @@ class OCRToTextConverter:
|
|
|
168
181
|
self.llm_api_key = llm_api_key
|
|
169
182
|
self.target_size = target_size
|
|
170
183
|
self.timeout_minutes = timeout_minutes
|
|
184
|
+
self.include_image_descriptions = include_image_descriptions
|
|
171
185
|
requested_output_tokens = OCR_MAX_OUTPUT_TOKENS if max_output_tokens is None else max_output_tokens
|
|
172
186
|
self.max_output_tokens = max(requested_output_tokens, OCR_MIN_OUTPUT_TOKENS)
|
|
173
187
|
self.fallback_stage = fallback_stage
|
|
@@ -181,6 +195,13 @@ class OCRToTextConverter:
|
|
|
181
195
|
os.makedirs(self.temp_dir, exist_ok=True)
|
|
182
196
|
tempfile.tempdir = self.temp_dir
|
|
183
197
|
|
|
198
|
+
def _build_prompt_template(self) -> str:
|
|
199
|
+
base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
|
|
200
|
+
return build_ocr_prompt(
|
|
201
|
+
base_prompt,
|
|
202
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
203
|
+
)
|
|
204
|
+
|
|
184
205
|
def should_fallback_temperature_retry(self, error: EmptyDocument, temperature: float) -> bool:
|
|
185
206
|
if self.fallback_stage != 0:
|
|
186
207
|
return False
|
|
@@ -226,6 +247,7 @@ class OCRToTextConverter:
|
|
|
226
247
|
timeout_minutes=self.timeout_minutes,
|
|
227
248
|
fallback_stage=fallback_stage,
|
|
228
249
|
max_output_tokens=self.max_output_tokens,
|
|
250
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
229
251
|
)
|
|
230
252
|
result = fallback_converter.get_ocr(
|
|
231
253
|
file_for_ocr=file_for_ocr,
|
|
@@ -282,12 +304,9 @@ class OCRToTextConverter:
|
|
|
282
304
|
|
|
283
305
|
if self.markdown_output:
|
|
284
306
|
logger.info("Using prompt for markdown format")
|
|
285
|
-
# Convert the text to markdown format
|
|
286
|
-
prompt_template = OCR_TO_MARKDOWN_PROMPT
|
|
287
307
|
else:
|
|
288
308
|
logger.info("Using prompt for plain text format")
|
|
289
|
-
|
|
290
|
-
prompt_template = OCR_TO_PLAIN_TEXT_PROMPT
|
|
309
|
+
prompt_template = self._build_prompt_template()
|
|
291
310
|
|
|
292
311
|
try:
|
|
293
312
|
if self.llm_api_key:
|
|
@@ -18,7 +18,11 @@ from openai import (
|
|
|
18
18
|
InternalServerError,
|
|
19
19
|
)
|
|
20
20
|
|
|
21
|
-
from ..prompts.ocr import OCR_TO_MARKDOWN_PROMPT, OCR_TO_PLAIN_TEXT_PROMPT
|
|
21
|
+
from ..prompts.ocr import (
|
|
22
|
+
OCR_TO_MARKDOWN_PROMPT,
|
|
23
|
+
OCR_TO_PLAIN_TEXT_PROMPT,
|
|
24
|
+
build_ocr_prompt,
|
|
25
|
+
)
|
|
22
26
|
|
|
23
27
|
logger = logging.getLogger(__name__)
|
|
24
28
|
|
|
@@ -86,7 +90,14 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
|
|
|
86
90
|
raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
|
|
87
91
|
|
|
88
92
|
|
|
89
|
-
def get_ocr(file_for_ocr, markdown_output=False, llm_api_key=None, target_size=1, timeout_minutes=None):
|
|
93
|
+
def get_ocr(
|
|
94
|
+
file_for_ocr,
|
|
95
|
+
markdown_output=False,
|
|
96
|
+
llm_api_key=None,
|
|
97
|
+
target_size=1,
|
|
98
|
+
timeout_minutes=None,
|
|
99
|
+
include_image_descriptions: bool = False,
|
|
100
|
+
):
|
|
90
101
|
"""
|
|
91
102
|
Convenience function to extract text from an image file using OCR (Azure OpenAI),
|
|
92
103
|
optionally formatted as Markdown.
|
|
@@ -96,6 +107,7 @@ def get_ocr(file_for_ocr, markdown_output=False, llm_api_key=None, target_size=1
|
|
|
96
107
|
llm_api_key=llm_api_key,
|
|
97
108
|
target_size=target_size,
|
|
98
109
|
timeout_minutes=timeout_minutes,
|
|
110
|
+
include_image_descriptions=include_image_descriptions,
|
|
99
111
|
)
|
|
100
112
|
return converter.get_ocr(file_for_ocr)
|
|
101
113
|
|
|
@@ -114,6 +126,7 @@ class OCRToTextConverter:
|
|
|
114
126
|
azure_endpoint=None, # e.g. https://<resource>.openai.azure.com
|
|
115
127
|
azure_api_version=None, # e.g. "2024-10-21" (use your resource-supported version)
|
|
116
128
|
max_tokens=4096, # avoid truncation
|
|
129
|
+
include_image_descriptions: bool = False,
|
|
117
130
|
):
|
|
118
131
|
self.ocr_model = ocr_model
|
|
119
132
|
self.ocr_model_provider = ocr_model_provider
|
|
@@ -122,6 +135,7 @@ class OCRToTextConverter:
|
|
|
122
135
|
self.target_size = target_size
|
|
123
136
|
self.timeout_minutes = timeout_minutes
|
|
124
137
|
self.max_tokens = max_tokens
|
|
138
|
+
self.include_image_descriptions = include_image_descriptions
|
|
125
139
|
|
|
126
140
|
# Azure config
|
|
127
141
|
self.azure_endpoint = azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT")
|
|
@@ -137,6 +151,13 @@ class OCRToTextConverter:
|
|
|
137
151
|
if not self.azure_api_version:
|
|
138
152
|
raise ValueError("Missing Azure API version. Set azure_api_version or AZURE_OPENAI_API_VERSION.")
|
|
139
153
|
|
|
154
|
+
def _build_prompt_template(self) -> str:
|
|
155
|
+
base_prompt = OCR_TO_MARKDOWN_PROMPT if self.markdown_output else OCR_TO_PLAIN_TEXT_PROMPT
|
|
156
|
+
return build_ocr_prompt(
|
|
157
|
+
base_prompt,
|
|
158
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
159
|
+
)
|
|
160
|
+
|
|
140
161
|
@retry(
|
|
141
162
|
(
|
|
142
163
|
APITimeoutError,
|
|
@@ -158,7 +179,7 @@ class OCRToTextConverter:
|
|
|
158
179
|
temp_file_for_ocr = None
|
|
159
180
|
start_time = time.time()
|
|
160
181
|
|
|
161
|
-
prompt_template =
|
|
182
|
+
prompt_template = self._build_prompt_template()
|
|
162
183
|
logger.info("Using prompt for %s format", "markdown" if self.markdown_output else "plain text")
|
|
163
184
|
|
|
164
185
|
# Build Azure client
|
|
@@ -252,4 +273,4 @@ class OCRToTextConverter:
|
|
|
252
273
|
finally:
|
|
253
274
|
# Clean up the temporary compressed file (only if we created one)
|
|
254
275
|
if temp_file_for_ocr and temp_file_for_ocr != file_for_ocr and os.path.exists(temp_file_for_ocr):
|
|
255
|
-
os.remove(temp_file_for_ocr)
|
|
276
|
+
os.remove(temp_file_for_ocr)
|
|
@@ -39,11 +39,20 @@ dotenv.load_dotenv()
|
|
|
39
39
|
logger = logging.getLogger(__name__)
|
|
40
40
|
|
|
41
41
|
MIN_DOC_TEXT_LENGTH_ACCEPTED = int(os.getenv("MIN_DOC_TEXT_LENGTH_ACCEPTED", "400"))
|
|
42
|
+
OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV = "OCR_INCLUDE_IMAGE_DESCRIPTIONS"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
46
|
+
value = os.getenv(name)
|
|
47
|
+
if value is None:
|
|
48
|
+
return default
|
|
49
|
+
return value.strip().lower() in {"1", "true", "yes", "y", "on"}
|
|
42
50
|
|
|
43
51
|
|
|
44
52
|
class BaseLoader:
|
|
45
53
|
def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
|
|
46
|
-
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
54
|
+
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
55
|
+
include_image_descriptions: bool | None = None, **kwargs):
|
|
47
56
|
"""
|
|
48
57
|
Initialize the BaseLoader with cloud storage and LLM configurations.
|
|
49
58
|
|
|
@@ -58,6 +67,9 @@ class BaseLoader:
|
|
|
58
67
|
provider (str, optional): Provider of the model. Default to "google".
|
|
59
68
|
ocr_model (str, optional): OCR model to use for text extraction from images. Defaults to "gpt-5-mini".
|
|
60
69
|
timeout_minutes (int, optional): Timeout in minutes. Defaults to None.
|
|
70
|
+
include_image_descriptions (bool | None, optional): If True, OCR prompts
|
|
71
|
+
include brief functional descriptions for meaningful non-text images.
|
|
72
|
+
If None, defaults from OCR_INCLUDE_IMAGE_DESCRIPTIONS. Defaults to None.
|
|
61
73
|
**kwargs: Additional keyword arguments to pass to the underlying loader or extraction logic.
|
|
62
74
|
- target_size (int, optional): Target file size in bytes. Defaults to 1MB
|
|
63
75
|
- source (str): Source of the document. Must be either "cloud" or "local"
|
|
@@ -76,6 +88,11 @@ class BaseLoader:
|
|
|
76
88
|
self.provider = provider
|
|
77
89
|
self.ocr_model = ocr_model
|
|
78
90
|
self.timeout_minutes = timeout_minutes
|
|
91
|
+
self.include_image_descriptions = (
|
|
92
|
+
_read_bool_env(OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV)
|
|
93
|
+
if include_image_descriptions is None
|
|
94
|
+
else include_image_descriptions
|
|
95
|
+
)
|
|
79
96
|
self.kwargs = kwargs
|
|
80
97
|
self.target_size = kwargs.get("target_size", 1)
|
|
81
98
|
self.source = kwargs.get("source", "cloud")
|
|
@@ -287,7 +304,7 @@ class BaseLoader:
|
|
|
287
304
|
file_extension = file_extension.lower()
|
|
288
305
|
|
|
289
306
|
if is_document_fallback:
|
|
290
|
-
return DocumentOCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, ocr_provider=self.provider, ocr_model=self.ocr_model, **kwargs)
|
|
307
|
+
return DocumentOCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, ocr_provider=self.provider, ocr_model=self.ocr_model, include_image_descriptions=self.include_image_descriptions, **kwargs)
|
|
291
308
|
|
|
292
309
|
if file_extension in [".xml", ".xbrl"]:
|
|
293
310
|
return XmlXbrlLoader(temp_dir=self.temp_dir, markdown_output=self.markdown_output, **kwargs)
|
|
@@ -308,7 +325,7 @@ class BaseLoader:
|
|
|
308
325
|
elif mime_type.startswith("video/"):
|
|
309
326
|
return VideoLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
|
|
310
327
|
elif mime_type.startswith("image/"):
|
|
311
|
-
return OCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
|
|
328
|
+
return OCRLoader(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, include_image_descriptions=self.include_image_descriptions, **kwargs)
|
|
312
329
|
elif mime_type.startswith("text/markdown"):
|
|
313
330
|
return MarkdownLoader(markdown_output=self.markdown_output, temp_dir=self.temp_dir, **kwargs)
|
|
314
331
|
elif mime_type == "text/html":
|
|
@@ -45,6 +45,7 @@ class DocumentOCRLoader:
|
|
|
45
45
|
timeout_minutes: int = None,
|
|
46
46
|
ocr_provider: str = "google",
|
|
47
47
|
ocr_model: str | None = None,
|
|
48
|
+
include_image_descriptions: bool = False,
|
|
48
49
|
**kwargs
|
|
49
50
|
):
|
|
50
51
|
"""
|
|
@@ -78,6 +79,9 @@ class DocumentOCRLoader:
|
|
|
78
79
|
- For Google Gemini: optional / usually ignored.
|
|
79
80
|
- For Azure OpenAI: **deployment name** (e.g. "gpt-5-mini").
|
|
80
81
|
Defaults to None.
|
|
82
|
+
include_image_descriptions (bool, optional): If True, OCR prompts include
|
|
83
|
+
brief functional descriptions for meaningful non-text images.
|
|
84
|
+
Defaults to False.
|
|
81
85
|
**kwargs:
|
|
82
86
|
max_output_tokens (int, optional): Maximum Gemini output tokens for
|
|
83
87
|
Google document OCR generation.
|
|
@@ -100,6 +104,7 @@ class DocumentOCRLoader:
|
|
|
100
104
|
|
|
101
105
|
self.ocr_provider = (ocr_provider or "google").lower()
|
|
102
106
|
self.ocr_model = ocr_model
|
|
107
|
+
self.include_image_descriptions = include_image_descriptions
|
|
103
108
|
self.max_output_tokens = kwargs.get("max_output_tokens")
|
|
104
109
|
|
|
105
110
|
# Set up custom temp directory
|
|
@@ -248,6 +253,7 @@ class DocumentOCRLoader:
|
|
|
248
253
|
page_range=self.page_range,
|
|
249
254
|
timeout_minutes=self.timeout_minutes,
|
|
250
255
|
ocr_model=self.ocr_model or None,
|
|
256
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
251
257
|
)
|
|
252
258
|
else:
|
|
253
259
|
result_dict = ocr_fn(
|
|
@@ -263,6 +269,7 @@ class DocumentOCRLoader:
|
|
|
263
269
|
else None
|
|
264
270
|
),
|
|
265
271
|
max_output_tokens=self.max_output_tokens,
|
|
272
|
+
include_image_descriptions=self.include_image_descriptions,
|
|
266
273
|
)
|
|
267
274
|
|
|
268
275
|
result_dict["type"] = self.type
|