polytext 0.2.2b2__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.2b2 → polytext-0.2.4}/PKG-INFO +1 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/audio_to_text.py +22 -12
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/document_ocr_to_text.py +6 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/document_ocr_to_text_azure_oai.py +7 -2
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/ocr_to_text.py +7 -2
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/ocr_to_text_azure_oai.py +6 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/video_to_audio.py +3 -3
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/base.py +31 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/youtube_llm.py +1 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/PKG-INFO +1 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/SOURCES.txt +4 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/setup.py +1 -1
- polytext-0.2.4/tests/test_base_loader_error_mapping.py +81 -0
- polytext-0.2.4/tests/test_gemini_quality_guards.py +31 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_ocr_from_image.py +1 -1
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_ocr_fallbacks.py +28 -5
- polytext-0.2.4/tests/test_python_version_metadata.py +45 -0
- polytext-0.2.4/tests/test_youtube_llm_fallbacks.py +103 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_youtube_transcript.py +2 -2
- {polytext-0.2.2b2 → polytext-0.2.4}/LICENSE +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/README.md +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/base.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/gemini_quality_guards.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/pdf.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/text_to_md.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/audio.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/document.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/document_ocr.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/html.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/ocr.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/video.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/text_merger.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/ocr.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/transcription.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext/utils/utils.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/pyproject.toml +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/setup.cfg +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_audio_transcription_model_migration.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_document_ocr_to_text_models.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_extracted_text_whitespace.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_audio_transcript_from_gcs.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_library.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_ocr_image_descriptions.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_pain_text.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_xml_xbrl_loader.py +0 -0
- {polytext-0.2.2b2 → polytext-0.2.4}/tests/test_youtube_gemini_minimal_check.py +0 -0
|
@@ -46,7 +46,7 @@ AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHO
|
|
|
46
46
|
AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
|
|
47
47
|
AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
48
48
|
AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
|
|
49
|
-
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-
|
|
49
|
+
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
|
|
50
50
|
AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
|
|
51
51
|
NO_HUMAN_SPEECH_MARKER = "no human speech detected"
|
|
52
52
|
|
|
@@ -90,16 +90,22 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
|
|
|
90
90
|
os.close(fd)
|
|
91
91
|
|
|
92
92
|
logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
93
|
+
try:
|
|
94
|
+
ffmpeg.input(input_path).output(
|
|
95
|
+
temp_audio_path,
|
|
96
|
+
q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
|
|
97
|
+
acodec='libmp3lame',
|
|
98
|
+
ac=1, # Convert to mono
|
|
99
|
+
ar=16000, # Lower sample rate
|
|
100
|
+
vn=None,
|
|
101
|
+
threads=0, # Use maximum available threads
|
|
102
|
+
loglevel='error', # Reduce logging overhead
|
|
103
|
+
).run(quiet=True, overwrite_output=True)
|
|
104
|
+
except Exception:
|
|
105
|
+
logger.exception("FFmpeg error during audio processing for %s", input_path)
|
|
106
|
+
if os.path.exists(temp_audio_path):
|
|
107
|
+
os.unlink(temp_audio_path)
|
|
108
|
+
raise
|
|
103
109
|
|
|
104
110
|
logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
|
|
105
111
|
return temp_audio_path
|
|
@@ -313,7 +319,11 @@ class AudioToTextConverter:
|
|
|
313
319
|
|
|
314
320
|
mime_type, _ = mimetypes.guess_type(audio_file)
|
|
315
321
|
if mime_type is None:
|
|
316
|
-
|
|
322
|
+
try:
|
|
323
|
+
raise ValueError("Audio format not recognized")
|
|
324
|
+
except ValueError:
|
|
325
|
+
logger.exception("Unsupported audio format for %s", audio_file)
|
|
326
|
+
raise
|
|
317
327
|
|
|
318
328
|
return client.models.generate_content(
|
|
319
329
|
model=self.transcription_model,
|
|
@@ -96,6 +96,7 @@ def compress_and_convert_image(input_path: str, target_size=1):
|
|
|
96
96
|
return temp_image_path
|
|
97
97
|
|
|
98
98
|
except Exception as e:
|
|
99
|
+
logger.exception("FFmpeg error during image processing for %s", input_path)
|
|
99
100
|
raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
|
|
100
101
|
|
|
101
102
|
def get_document_ocr(
|
|
@@ -389,7 +390,11 @@ class DocumentOCRToTextConverter:
|
|
|
389
390
|
# Determine mimetype
|
|
390
391
|
mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
|
|
391
392
|
if mime_type is None:
|
|
392
|
-
|
|
393
|
+
try:
|
|
394
|
+
raise ValueError("Image format not recognized")
|
|
395
|
+
except ValueError:
|
|
396
|
+
logger.exception("Unsupported image format for %s", temp_file_for_ocr)
|
|
397
|
+
raise
|
|
393
398
|
|
|
394
399
|
response = client.models.generate_content(
|
|
395
400
|
model=self.ocr_model,
|
|
@@ -88,6 +88,7 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
|
|
|
88
88
|
return temp_image_path
|
|
89
89
|
|
|
90
90
|
except Exception as e:
|
|
91
|
+
logger.exception("FFmpeg error during image processing for %s", input_path)
|
|
91
92
|
raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
|
|
92
93
|
|
|
93
94
|
|
|
@@ -233,7 +234,11 @@ class DocumentOCRToTextConverter:
|
|
|
233
234
|
|
|
234
235
|
mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
|
|
235
236
|
if mime_type is None:
|
|
236
|
-
|
|
237
|
+
try:
|
|
238
|
+
raise ValueError("Image format not recognized")
|
|
239
|
+
except ValueError:
|
|
240
|
+
logger.exception("Unsupported image format for %s", temp_file_for_ocr)
|
|
241
|
+
raise
|
|
237
242
|
|
|
238
243
|
with open(temp_file_for_ocr, "rb") as f:
|
|
239
244
|
image_b64 = base64.b64encode(f.read()).decode("utf-8")
|
|
@@ -308,7 +313,7 @@ class DocumentOCRToTextConverter:
|
|
|
308
313
|
pdf = fitz.open(document_for_ocr)
|
|
309
314
|
total_pages = len(pdf)
|
|
310
315
|
if total_pages == 0:
|
|
311
|
-
raise EmptyDocument(message="The document has no pages.", code=
|
|
316
|
+
raise EmptyDocument(message="The document has no pages.", code=998)
|
|
312
317
|
|
|
313
318
|
start_page, end_page = self.validate_page_range(total_pages)
|
|
314
319
|
|
|
@@ -33,7 +33,7 @@ OCR_TAIL_REPETITION_THRESHOLD = float(os.getenv("OCR_TAIL_REPETITION_THRESHOLD",
|
|
|
33
33
|
OCR_FALLBACK_SOURCE_PATTERN = os.getenv("OCR_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
|
|
34
34
|
OCR_FALLBACK_MODEL = os.getenv("OCR_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
35
35
|
OCR_FALLBACK_TEMPERATURE = float(os.getenv("OCR_FALLBACK_TEMPERATURE", "1.0"))
|
|
36
|
-
OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-
|
|
36
|
+
OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
def compress_and_convert_image(input_path: str, target_size=1):
|
|
@@ -96,6 +96,7 @@ def compress_and_convert_image(input_path: str, target_size=1):
|
|
|
96
96
|
return temp_image_path
|
|
97
97
|
|
|
98
98
|
except Exception as e:
|
|
99
|
+
logger.exception("FFmpeg error during image processing for %s", input_path)
|
|
99
100
|
raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
|
|
100
101
|
|
|
101
102
|
def get_ocr(
|
|
@@ -383,7 +384,11 @@ class OCRToTextConverter:
|
|
|
383
384
|
# Determine mimetype
|
|
384
385
|
mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
|
|
385
386
|
if mime_type is None:
|
|
386
|
-
|
|
387
|
+
try:
|
|
388
|
+
raise ValueError("Image format not recognized")
|
|
389
|
+
except ValueError:
|
|
390
|
+
logger.exception("Unsupported image format for %s", temp_file_for_ocr)
|
|
391
|
+
raise
|
|
387
392
|
|
|
388
393
|
response = client.models.generate_content(
|
|
389
394
|
model=self.ocr_model,
|
|
@@ -87,6 +87,7 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
|
|
|
87
87
|
return temp_image_path
|
|
88
88
|
|
|
89
89
|
except Exception as e:
|
|
90
|
+
logger.exception("FFmpeg error during image processing for %s", input_path)
|
|
90
91
|
raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
|
|
91
92
|
|
|
92
93
|
|
|
@@ -224,7 +225,11 @@ class OCRToTextConverter:
|
|
|
224
225
|
# We'll use base64 data-URL.
|
|
225
226
|
mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
|
|
226
227
|
if mime_type is None:
|
|
227
|
-
|
|
228
|
+
try:
|
|
229
|
+
raise ValueError("Image format not recognized")
|
|
230
|
+
except ValueError:
|
|
231
|
+
logger.exception("Unsupported image format for %s", temp_file_for_ocr)
|
|
232
|
+
raise
|
|
228
233
|
|
|
229
234
|
with open(temp_file_for_ocr, "rb") as f:
|
|
230
235
|
image_b64 = base64.b64encode(f.read()).decode("utf-8")
|
|
@@ -52,12 +52,12 @@ def convert_video_to_audio(video_file: str , bitrate_quality: int =9) -> str:
|
|
|
52
52
|
return temp_audio_path
|
|
53
53
|
|
|
54
54
|
except ffmpeg.Error as e:
|
|
55
|
-
logger.
|
|
55
|
+
logger.exception("FFmpeg conversion failed: %s", e.stderr.decode())
|
|
56
56
|
if os.path.exists(temp_audio_path):
|
|
57
57
|
os.unlink(temp_audio_path)
|
|
58
58
|
raise
|
|
59
59
|
except Exception as e:
|
|
60
|
-
logger.
|
|
60
|
+
logger.exception("Failed to convert video to audio: %s", str(e))
|
|
61
61
|
if os.path.exists(temp_audio_path):
|
|
62
62
|
os.unlink(temp_audio_path)
|
|
63
|
-
raise
|
|
63
|
+
raise
|
|
@@ -40,6 +40,12 @@ logger = logging.getLogger(__name__)
|
|
|
40
40
|
|
|
41
41
|
MIN_DOC_TEXT_LENGTH_ACCEPTED = int(os.getenv("MIN_DOC_TEXT_LENGTH_ACCEPTED", "400"))
|
|
42
42
|
OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV = "OCR_INCLUDE_IMAGE_DESCRIPTIONS"
|
|
43
|
+
LLM_OUTPUT_ERROR_CODES = {
|
|
44
|
+
995: "INVALID_ARGUMENT",
|
|
45
|
+
996: "RECITATION",
|
|
46
|
+
997: "REPETITIVE_OUTPUT",
|
|
47
|
+
999: "MAX_TOKENS",
|
|
48
|
+
}
|
|
43
49
|
|
|
44
50
|
|
|
45
51
|
def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
@@ -49,6 +55,18 @@ def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
|
49
55
|
return value.strip().lower() in {"1", "true", "yes", "y", "on"}
|
|
50
56
|
|
|
51
57
|
|
|
58
|
+
def _capture_exception_for_sentry(error: Exception) -> None:
|
|
59
|
+
try:
|
|
60
|
+
import sentry_sdk
|
|
61
|
+
except ImportError:
|
|
62
|
+
return
|
|
63
|
+
|
|
64
|
+
try:
|
|
65
|
+
sentry_sdk.capture_exception(error)
|
|
66
|
+
except Exception:
|
|
67
|
+
return
|
|
68
|
+
|
|
69
|
+
|
|
52
70
|
class BaseLoader:
|
|
53
71
|
def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
|
|
54
72
|
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
@@ -147,6 +165,13 @@ class BaseLoader:
|
|
|
147
165
|
try:
|
|
148
166
|
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
149
167
|
except EmptyDocument as e:
|
|
168
|
+
if e.code in LLM_OUTPUT_ERROR_CODES:
|
|
169
|
+
_capture_exception_for_sentry(e)
|
|
170
|
+
raise LoaderError(
|
|
171
|
+
message=e.message,
|
|
172
|
+
status=422,
|
|
173
|
+
code=LLM_OUTPUT_ERROR_CODES[e.code],
|
|
174
|
+
) from e
|
|
150
175
|
logger.info(f"Empty document encountered: {e.message}")
|
|
151
176
|
if self.fallback_ocr:
|
|
152
177
|
loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
|
|
@@ -317,6 +342,7 @@ class BaseLoader:
|
|
|
317
342
|
return YoutubeTranscriptLoaderWithLlm(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
|
|
318
343
|
else:
|
|
319
344
|
return HtmlLoader(markdown_output=self.markdown_output)
|
|
345
|
+
# Handle markdown files based on extension or MIME type
|
|
320
346
|
if file_extension in [".md", ".markdown"] or (
|
|
321
347
|
mime_type and mime_type.startswith("text/markdown")
|
|
322
348
|
):
|
|
@@ -344,7 +370,11 @@ class BaseLoader:
|
|
|
344
370
|
**kwargs,
|
|
345
371
|
)
|
|
346
372
|
else:
|
|
347
|
-
|
|
373
|
+
try:
|
|
374
|
+
raise ValueError(f"Unsupported MIME type: {mime_type}")
|
|
375
|
+
except ValueError:
|
|
376
|
+
logger.exception("Unsupported media type while initializing loader: %s", mime_type)
|
|
377
|
+
raise
|
|
348
378
|
|
|
349
379
|
elif self.validate_user_text(text=input):
|
|
350
380
|
return PlainTextLoader(
|
|
@@ -384,7 +384,7 @@ class YoutubeTranscriptLoaderWithLlm:
|
|
|
384
384
|
fallback_model=self.final_fallback_model,
|
|
385
385
|
fallback_temperature=self.final_fallback_temperature,
|
|
386
386
|
)
|
|
387
|
-
raise
|
|
387
|
+
raise e_tmp from e
|
|
388
388
|
|
|
389
389
|
except errors.ServerError as e:
|
|
390
390
|
logger.info("ServerError occurred with status %s and message: %s", e.status, e.message)
|
|
@@ -56,6 +56,7 @@ polytext/utils/utils.py
|
|
|
56
56
|
tests/test_audio_chunker.py
|
|
57
57
|
tests/test_audio_comparison_helpers.py
|
|
58
58
|
tests/test_audio_transcription_model_migration.py
|
|
59
|
+
tests/test_base_loader_error_mapping.py
|
|
59
60
|
tests/test_compare_audio_models.py
|
|
60
61
|
tests/test_compare_document_ocr_to_text_models.py
|
|
61
62
|
tests/test_compare_ocr_to_text_models.py
|
|
@@ -63,6 +64,7 @@ tests/test_compare_youtube_models.py
|
|
|
63
64
|
tests/test_dowload_audio_from_youtube.py
|
|
64
65
|
tests/test_dowload_audio_from_youtube_helpers.py
|
|
65
66
|
tests/test_extracted_text_whitespace.py
|
|
67
|
+
tests/test_gemini_quality_guards.py
|
|
66
68
|
tests/test_get_audio_transcript_from_gcs.py
|
|
67
69
|
tests/test_get_customized_pdf_from_markdown.py
|
|
68
70
|
tests/test_get_document_ocr.py
|
|
@@ -79,7 +81,9 @@ tests/test_notebook_loader.py
|
|
|
79
81
|
tests/test_ocr_fallbacks.py
|
|
80
82
|
tests/test_ocr_image_descriptions.py
|
|
81
83
|
tests/test_pain_text.py
|
|
84
|
+
tests/test_python_version_metadata.py
|
|
82
85
|
tests/test_split_audio_with_llm.py
|
|
83
86
|
tests/test_xml_xbrl_loader.py
|
|
84
87
|
tests/test_youtube_gemini_minimal_check.py
|
|
88
|
+
tests/test_youtube_llm_fallbacks.py
|
|
85
89
|
tests/test_youtube_transcript.py
|
|
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
|
|
|
51
51
|
|
|
52
52
|
setup(
|
|
53
53
|
name='polytext',
|
|
54
|
-
version='0.2.
|
|
54
|
+
version='0.2.4',
|
|
55
55
|
url='https://github.com/docsity/polytext',
|
|
56
56
|
# download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
|
|
57
57
|
license='MIT',
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from unittest.mock import Mock, patch
|
|
3
|
+
|
|
4
|
+
from polytext.exceptions import EmptyDocument, LoaderError
|
|
5
|
+
from polytext.loader.base import BaseLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _FailingLoader:
|
|
9
|
+
def __init__(self, error):
|
|
10
|
+
self.error = error
|
|
11
|
+
|
|
12
|
+
def load(self, input_path):
|
|
13
|
+
raise self.error
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _FakeBaseLoader(BaseLoader):
|
|
17
|
+
def __init__(self, error, **kwargs):
|
|
18
|
+
super().__init__(**kwargs)
|
|
19
|
+
self.error = error
|
|
20
|
+
|
|
21
|
+
def initiate_storage(self, input):
|
|
22
|
+
return {}
|
|
23
|
+
|
|
24
|
+
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
25
|
+
return _FailingLoader(self.error)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class TestBaseLoaderErrorMapping(unittest.TestCase):
|
|
29
|
+
def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
|
|
30
|
+
cases = [
|
|
31
|
+
(995, "INVALID_ARGUMENT"),
|
|
32
|
+
(996, "RECITATION"),
|
|
33
|
+
(997, "REPETITIVE_OUTPUT"),
|
|
34
|
+
(999, "MAX_TOKENS"),
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
for empty_document_code, expected_loader_code in cases:
|
|
38
|
+
with self.subTest(empty_document_code=empty_document_code):
|
|
39
|
+
loader = _FakeBaseLoader(
|
|
40
|
+
EmptyDocument(
|
|
41
|
+
message=f"diagnostic failure {empty_document_code}",
|
|
42
|
+
code=empty_document_code,
|
|
43
|
+
)
|
|
44
|
+
)
|
|
45
|
+
|
|
46
|
+
sentry_sdk = Mock()
|
|
47
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
48
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
49
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
50
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
51
|
+
loader.get_text(["dummy.txt"])
|
|
52
|
+
|
|
53
|
+
error = error_context.exception
|
|
54
|
+
self.assertEqual(error.status, 422)
|
|
55
|
+
self.assertEqual(error.code, expected_loader_code)
|
|
56
|
+
self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
|
|
57
|
+
mock_info.assert_not_called()
|
|
58
|
+
mock_exception.assert_not_called()
|
|
59
|
+
sentry_sdk.capture_exception.assert_called_once()
|
|
60
|
+
self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
|
|
61
|
+
|
|
62
|
+
def test_empty_or_too_short_documents_still_return_empty_response(self):
|
|
63
|
+
loader = _FakeBaseLoader(
|
|
64
|
+
EmptyDocument(
|
|
65
|
+
message="Document text with less than 400 characters",
|
|
66
|
+
code=998,
|
|
67
|
+
)
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
71
|
+
response = loader.get_text(["empty.txt"])
|
|
72
|
+
|
|
73
|
+
self.assertEqual(response["text"], "")
|
|
74
|
+
self.assertEqual(response["completion_tokens"], 0)
|
|
75
|
+
self.assertEqual(response["prompt_tokens"], 0)
|
|
76
|
+
self.assertEqual(response["output_list"][0]["input"], "empty.txt")
|
|
77
|
+
mock_exception.assert_not_called()
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
if __name__ == "__main__":
|
|
81
|
+
unittest.main()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
|
|
3
|
+
from polytext.converter.gemini_quality_guards import tail_has_excessive_repetition
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class TestGeminiQualityGuards(unittest.TestCase):
|
|
7
|
+
def test_detects_consecutive_repeated_sentences_below_ratio_threshold(self):
|
|
8
|
+
text = (
|
|
9
|
+
"gli davamo nomi veri e falsi. "
|
|
10
|
+
"_Elio_ all'anagrafe e io gli gli dicevo _Roberto Gustativi_. "
|
|
11
|
+
"E questi scrivevano Roberto Gustativi. "
|
|
12
|
+
"E la soddisfazione perversa era andare a comprare il giornale. "
|
|
13
|
+
"È successo. "
|
|
14
|
+
"Sono stato. "
|
|
15
|
+
"Che è successo? "
|
|
16
|
+
"Siamo passati dal basement. "
|
|
17
|
+
"Siamo passati dal basement. "
|
|
18
|
+
"Siamo passati dal basement. "
|
|
19
|
+
"Siamo passati dal basement. "
|
|
20
|
+
"Il miglior finale di sempre. "
|
|
21
|
+
"Grazie, grazie, grazie, grazie, grazie, grazie, grazie. "
|
|
22
|
+
"E vi grazie."
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
self.assertTrue(
|
|
26
|
+
tail_has_excessive_repetition(text, tail_lines=200, threshold=0.35)
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
if __name__ == "__main__":
|
|
31
|
+
unittest.main()
|
|
@@ -38,7 +38,7 @@ def main():
|
|
|
38
38
|
|
|
39
39
|
# local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9695.jpg"
|
|
40
40
|
# local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9701.jpg"
|
|
41
|
-
local_file_path = "/Users/marcodelgiudice/Projects/polytext/
|
|
41
|
+
local_file_path = "/Users/marcodelgiudice/Projects/polytext/gm1.png"
|
|
42
42
|
|
|
43
43
|
try:
|
|
44
44
|
start = time.time()
|
|
@@ -4,7 +4,11 @@ from types import SimpleNamespace
|
|
|
4
4
|
from unittest.mock import patch
|
|
5
5
|
|
|
6
6
|
from polytext.converter.document_ocr_to_text import DocumentOCRToTextConverter
|
|
7
|
+
from polytext.converter.document_ocr_to_text_azure_oai import (
|
|
8
|
+
DocumentOCRToTextConverter as AzureDocumentOCRToTextConverter,
|
|
9
|
+
)
|
|
7
10
|
from polytext.converter.ocr_to_text import OCRToTextConverter
|
|
11
|
+
from polytext.exceptions import EmptyDocument
|
|
8
12
|
|
|
9
13
|
|
|
10
14
|
def _make_response(
|
|
@@ -208,11 +212,12 @@ class TestOcrFallbacks(unittest.TestCase):
|
|
|
208
212
|
)
|
|
209
213
|
mock_client_cls.return_value = fake_client
|
|
210
214
|
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
215
|
+
with patch("polytext.converter.ocr_to_text.OCR_FINAL_FALLBACK_MODEL", "gemini-2.0-flash"):
|
|
216
|
+
converter = OCRToTextConverter(ocr_model="gemini-3.1-flash-lite-preview")
|
|
217
|
+
with tempfile.NamedTemporaryFile(suffix=".png") as temp_image:
|
|
218
|
+
temp_image.write(b"fake-image")
|
|
219
|
+
temp_image.flush()
|
|
220
|
+
result = converter.get_ocr(temp_image.name)
|
|
216
221
|
|
|
217
222
|
self.assertEqual(result["text"], "final fallback text")
|
|
218
223
|
self.assertEqual(
|
|
@@ -263,6 +268,24 @@ class TestOcrFallbacks(unittest.TestCase):
|
|
|
263
268
|
)
|
|
264
269
|
self.assertEqual(fake_client.models.generate_content_temperatures, [0.0, 0.0, 1.0])
|
|
265
270
|
|
|
271
|
+
@patch("fitz.open")
|
|
272
|
+
def test_azure_document_ocr_no_pages_is_empty_or_too_short(self, mock_fitz_open):
|
|
273
|
+
mock_fitz_open.return_value = _FakePdf([])
|
|
274
|
+
|
|
275
|
+
converter = AzureDocumentOCRToTextConverter(
|
|
276
|
+
azure_endpoint="https://example.openai.azure.com",
|
|
277
|
+
azure_api_version="2024-10-21",
|
|
278
|
+
)
|
|
279
|
+
|
|
280
|
+
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
|
|
281
|
+
temp_pdf.write(b"%PDF-1.4\n")
|
|
282
|
+
temp_pdf.flush()
|
|
283
|
+
with self.assertRaises(EmptyDocument) as error_context:
|
|
284
|
+
converter.get_document_ocr(temp_pdf.name)
|
|
285
|
+
|
|
286
|
+
self.assertEqual(error_context.exception.code, 998)
|
|
287
|
+
self.assertEqual(error_context.exception.message, "The document has no pages.")
|
|
288
|
+
|
|
266
289
|
|
|
267
290
|
if __name__ == "__main__":
|
|
268
291
|
unittest.main()
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import ast
|
|
2
|
+
import tomllib
|
|
3
|
+
import unittest
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
ROOT = Path(__file__).resolve().parents[1]
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _setup_keyword(name):
|
|
11
|
+
tree = ast.parse((ROOT / "setup.py").read_text())
|
|
12
|
+
setup_call = next(
|
|
13
|
+
node
|
|
14
|
+
for node in ast.walk(tree)
|
|
15
|
+
if isinstance(node, ast.Call)
|
|
16
|
+
and getattr(node.func, "id", None) == "setup"
|
|
17
|
+
)
|
|
18
|
+
return next(
|
|
19
|
+
keyword.value
|
|
20
|
+
for keyword in setup_call.keywords
|
|
21
|
+
if keyword.arg == name
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class PythonVersionMetadataTest(unittest.TestCase):
|
|
26
|
+
def test_packaging_metadata_allows_python_311(self):
|
|
27
|
+
setup_python_requires = ast.literal_eval(_setup_keyword("python_requires"))
|
|
28
|
+
pyproject = tomllib.loads((ROOT / "pyproject.toml").read_text())
|
|
29
|
+
|
|
30
|
+
self.assertEqual(setup_python_requires, ">=3.11")
|
|
31
|
+
self.assertEqual(
|
|
32
|
+
pyproject["tool"]["poetry"]["dependencies"]["python"],
|
|
33
|
+
">=3.11,<3.14",
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
def test_setup_classifiers_include_supported_python_versions(self):
|
|
37
|
+
classifiers = ast.literal_eval(_setup_keyword("classifiers"))
|
|
38
|
+
|
|
39
|
+
self.assertIn("Programming Language :: Python :: 3.11", classifiers)
|
|
40
|
+
self.assertIn("Programming Language :: Python :: 3.12", classifiers)
|
|
41
|
+
self.assertIn("Programming Language :: Python :: 3.13", classifiers)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
if __name__ == "__main__":
|
|
45
|
+
unittest.main()
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from types import SimpleNamespace
|
|
3
|
+
from unittest.mock import patch
|
|
4
|
+
|
|
5
|
+
from google.genai import errors as genai_errors
|
|
6
|
+
from google.genai import types
|
|
7
|
+
|
|
8
|
+
from polytext.loader.youtube_llm import YoutubeTranscriptLoaderWithLlm
|
|
9
|
+
from polytext.exceptions import EmptyDocument
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _make_response(text="full transcript"):
|
|
13
|
+
return SimpleNamespace(
|
|
14
|
+
text=text,
|
|
15
|
+
candidates=[SimpleNamespace(finish_reason="STOP")],
|
|
16
|
+
usage_metadata=SimpleNamespace(
|
|
17
|
+
candidates_token_count=3,
|
|
18
|
+
prompt_token_count=2,
|
|
19
|
+
total_token_count=5,
|
|
20
|
+
),
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class _FakeModels:
|
|
25
|
+
def __init__(self, response):
|
|
26
|
+
self.response = response
|
|
27
|
+
self.generate_content_config = None
|
|
28
|
+
self.generate_content_model = None
|
|
29
|
+
|
|
30
|
+
def generate_content(self, model, contents, config):
|
|
31
|
+
self.generate_content_model = model
|
|
32
|
+
self.generate_content_config = config
|
|
33
|
+
if isinstance(self.response, Exception):
|
|
34
|
+
raise self.response
|
|
35
|
+
return self.response
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class _FakeClient:
|
|
39
|
+
def __init__(self, response):
|
|
40
|
+
self.models = _FakeModels(response)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _invalid_argument_error():
|
|
44
|
+
return genai_errors.ClientError(
|
|
45
|
+
400,
|
|
46
|
+
{
|
|
47
|
+
"error": {
|
|
48
|
+
"code": 400,
|
|
49
|
+
"message": "Request contains an invalid argument.",
|
|
50
|
+
"status": "INVALID_ARGUMENT",
|
|
51
|
+
}
|
|
52
|
+
},
|
|
53
|
+
None,
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _long_transcript():
|
|
58
|
+
return " ".join(
|
|
59
|
+
f"This is transcript sentence number {index} with unique content."
|
|
60
|
+
for index in range(20)
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class TestYoutubeLlmFallbacks(unittest.TestCase):
|
|
65
|
+
@patch("polytext.loader.youtube_llm.genai.Client")
|
|
66
|
+
def test_invalid_argument_final_fallback_uses_original_temperature(self, mock_client_cls):
|
|
67
|
+
clients = [
|
|
68
|
+
_FakeClient(_invalid_argument_error()),
|
|
69
|
+
_FakeClient(_invalid_argument_error()),
|
|
70
|
+
_FakeClient(_make_response(_long_transcript())),
|
|
71
|
+
]
|
|
72
|
+
mock_client_cls.side_effect = clients
|
|
73
|
+
|
|
74
|
+
loader = YoutubeTranscriptLoaderWithLlm()
|
|
75
|
+
result = loader.get_text_from_youtube("https://www.youtube.com/watch?v=example")
|
|
76
|
+
|
|
77
|
+
self.assertEqual(result["completion_model"], "models/gemini-2.5-flash")
|
|
78
|
+
self.assertEqual(clients[2].models.generate_content_config.temperature, 0.0)
|
|
79
|
+
self.assertEqual(
|
|
80
|
+
clients[2].models.generate_content_config.media_resolution,
|
|
81
|
+
types.MediaResolution.MEDIA_RESOLUTION_LOW,
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
@patch("polytext.loader.youtube_llm.genai.Client")
|
|
85
|
+
def test_invalid_argument_after_fallbacks_raises_empty_document_code_995(self, mock_client_cls):
|
|
86
|
+
clients = [
|
|
87
|
+
_FakeClient(_invalid_argument_error()),
|
|
88
|
+
_FakeClient(_invalid_argument_error()),
|
|
89
|
+
_FakeClient(_invalid_argument_error()),
|
|
90
|
+
]
|
|
91
|
+
mock_client_cls.side_effect = clients
|
|
92
|
+
|
|
93
|
+
loader = YoutubeTranscriptLoaderWithLlm()
|
|
94
|
+
|
|
95
|
+
with self.assertRaises(EmptyDocument) as error_context:
|
|
96
|
+
loader.get_text_from_youtube("https://www.youtube.com/watch?v=example")
|
|
97
|
+
|
|
98
|
+
self.assertEqual(error_context.exception.code, 995)
|
|
99
|
+
self.assertIn("INVALID_ARGUMENT", error_context.exception.message)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
if __name__ == "__main__":
|
|
103
|
+
unittest.main()
|
|
@@ -32,9 +32,9 @@ url = 'https://www.youtube.com/watch?v=L4as3tks4Js' # basement alberto angela
|
|
|
32
32
|
|
|
33
33
|
# url = 'https://www.youtube.com/watch?v=UabBYexBD4k' # INM RAG 11 minuti, completato in 26 secondi con successo con gemini-3.1-flash-lite
|
|
34
34
|
|
|
35
|
-
url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
|
|
35
|
+
#url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
|
|
38
38
|
|
|
39
39
|
# url = 'https://www.youtube.com/watch?v=CM2CkNU9xR0' # google antigravity 27 minuti, completato in 39 secondi con successo con gemini-3.1-flash-lite (146k token in input, 6k token in output)
|
|
40
40
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|