PyPI - polytext - Versions diffs - 0.2.2b2__tar.gz → 0.2.4__tar.gz - Mend

polytext 0.2.2b2.tar.gz → 0.2.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

{polytext-0.2.2b2 → polytext-0.2.4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: polytext
-Version: 0.2.2b2
+Version: 0.2.4
 Summary: Python utilities to simplify document files management
 Home-page: https://github.com/docsity/polytext
 Author: Matteo Senardi

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/audio_to_text.py RENAMED Viewed

@@ -46,7 +46,7 @@ AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHO
 AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
 AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
 AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
-AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
+AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
 AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
 NO_HUMAN_SPEECH_MARKER = "no human speech detected"
@@ -90,16 +90,22 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
     os.close(fd)
     logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
-    ffmpeg.input(input_path).output(
-        temp_audio_path,
-        q=bitrate_quality,  # Variable bitrate quality (0-9, 9 being lowest)
-        acodec='libmp3lame',
-        ac=1,  # Convert to mono
-        ar=16000,  # Lower sample rate
-        vn=None,
-        threads=0,  # Use maximum available threads
-        loglevel='error',  # Reduce logging overhead
-    ).run(quiet=True, overwrite_output=True)
+    try:
+        ffmpeg.input(input_path).output(
+            temp_audio_path,
+            q=bitrate_quality,  # Variable bitrate quality (0-9, 9 being lowest)
+            acodec='libmp3lame',
+            ac=1,  # Convert to mono
+            ar=16000,  # Lower sample rate
+            vn=None,
+            threads=0,  # Use maximum available threads
+            loglevel='error',  # Reduce logging overhead
+        ).run(quiet=True, overwrite_output=True)
+    except Exception:
+        logger.exception("FFmpeg error during audio processing for %s", input_path)
+        if os.path.exists(temp_audio_path):
+            os.unlink(temp_audio_path)
+        raise
     logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
     return temp_audio_path
@@ -313,7 +319,11 @@ class AudioToTextConverter:
         mime_type, _ = mimetypes.guess_type(audio_file)
         if mime_type is None:
-            raise ValueError("Audio format not recognized")
+            try:
+                raise ValueError("Audio format not recognized")
+            except ValueError:
+                logger.exception("Unsupported audio format for %s", audio_file)
+                raise
         return client.models.generate_content(
             model=self.transcription_model,

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/document_ocr_to_text.py RENAMED Viewed

@@ -96,6 +96,7 @@ def compress_and_convert_image(input_path: str, target_size=1):
         return temp_image_path
     except Exception as e:
+        logger.exception("FFmpeg error during image processing for %s", input_path)
         raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
 def get_document_ocr(
@@ -389,7 +390,11 @@ class DocumentOCRToTextConverter:
                 # Determine mimetype
                 mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
                 if mime_type is None:
-                    raise ValueError("Image format not recognized")
+                    try:
+                        raise ValueError("Image format not recognized")
+                    except ValueError:
+                        logger.exception("Unsupported image format for %s", temp_file_for_ocr)
+                        raise
                 response = client.models.generate_content(
                     model=self.ocr_model,

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/document_ocr_to_text_azure_oai.py RENAMED Viewed

@@ -88,6 +88,7 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
         return temp_image_path
     except Exception as e:
+        logger.exception("FFmpeg error during image processing for %s", input_path)
         raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
@@ -233,7 +234,11 @@ class DocumentOCRToTextConverter:
             mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
             if mime_type is None:
-                raise ValueError("Image format not recognized")
+                try:
+                    raise ValueError("Image format not recognized")
+                except ValueError:
+                    logger.exception("Unsupported image format for %s", temp_file_for_ocr)
+                    raise
             with open(temp_file_for_ocr, "rb") as f:
                 image_b64 = base64.b64encode(f.read()).decode("utf-8")
@@ -308,7 +313,7 @@ class DocumentOCRToTextConverter:
             pdf = fitz.open(document_for_ocr)
             total_pages = len(pdf)
             if total_pages == 0:
-                raise EmptyDocument(message="The document has no pages.", code=997)
+                raise EmptyDocument(message="The document has no pages.", code=998)
             start_page, end_page = self.validate_page_range(total_pages)

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/ocr_to_text.py RENAMED Viewed

@@ -33,7 +33,7 @@ OCR_TAIL_REPETITION_THRESHOLD = float(os.getenv("OCR_TAIL_REPETITION_THRESHOLD",
 OCR_FALLBACK_SOURCE_PATTERN = os.getenv("OCR_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
 OCR_FALLBACK_MODEL = os.getenv("OCR_FALLBACK_MODEL", "gemini-3-flash-preview")
 OCR_FALLBACK_TEMPERATURE = float(os.getenv("OCR_FALLBACK_TEMPERATURE", "1.0"))
-OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
+OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
 def compress_and_convert_image(input_path: str, target_size=1):
@@ -96,6 +96,7 @@ def compress_and_convert_image(input_path: str, target_size=1):
         return temp_image_path
     except Exception as e:
+        logger.exception("FFmpeg error during image processing for %s", input_path)
         raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
 def get_ocr(
@@ -383,7 +384,11 @@ class OCRToTextConverter:
                 # Determine mimetype
                 mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
                 if mime_type is None:
-                    raise ValueError("Image format not recognized")
+                    try:
+                        raise ValueError("Image format not recognized")
+                    except ValueError:
+                        logger.exception("Unsupported image format for %s", temp_file_for_ocr)
+                        raise
                 response = client.models.generate_content(
                     model=self.ocr_model,

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/ocr_to_text_azure_oai.py RENAMED Viewed

@@ -87,6 +87,7 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
         return temp_image_path
     except Exception as e:
+        logger.exception("FFmpeg error during image processing for %s", input_path)
         raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
@@ -224,7 +225,11 @@ class OCRToTextConverter:
             # We'll use base64 data-URL.
             mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
             if mime_type is None:
-                raise ValueError("Image format not recognized")
+                try:
+                    raise ValueError("Image format not recognized")
+                except ValueError:
+                    logger.exception("Unsupported image format for %s", temp_file_for_ocr)
+                    raise
             with open(temp_file_for_ocr, "rb") as f:
                 image_b64 = base64.b64encode(f.read()).decode("utf-8")

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/video_to_audio.py RENAMED Viewed

@@ -52,12 +52,12 @@ def convert_video_to_audio(video_file: str , bitrate_quality: int =9) -> str:
         return temp_audio_path
     except ffmpeg.Error as e:
-        logger.info(f"FFmpeg conversion failed: {e.stderr.decode()}")
+        logger.exception("FFmpeg conversion failed: %s", e.stderr.decode())
         if os.path.exists(temp_audio_path):
             os.unlink(temp_audio_path)
         raise
     except Exception as e:
-        logger.info(f"Failed to convert video to audio: {str(e)}")
+        logger.exception("Failed to convert video to audio: %s", str(e))
         if os.path.exists(temp_audio_path):
             os.unlink(temp_audio_path)
-        raise
+        raise

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/base.py RENAMED Viewed

@@ -40,6 +40,12 @@ logger = logging.getLogger(__name__)
 MIN_DOC_TEXT_LENGTH_ACCEPTED = int(os.getenv("MIN_DOC_TEXT_LENGTH_ACCEPTED", "400"))
 OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV = "OCR_INCLUDE_IMAGE_DESCRIPTIONS"
+LLM_OUTPUT_ERROR_CODES = {
+    995: "INVALID_ARGUMENT",
+    996: "RECITATION",
+    997: "REPETITIVE_OUTPUT",
+    999: "MAX_TOKENS",
+}
 def _read_bool_env(name: str, default: bool = False) -> bool:
@@ -49,6 +55,18 @@ def _read_bool_env(name: str, default: bool = False) -> bool:
     return value.strip().lower() in {"1", "true", "yes", "y", "on"}
+def _capture_exception_for_sentry(error: Exception) -> None:
+    try:
+        import sentry_sdk
+    except ImportError:
+        return
+    try:
+        sentry_sdk.capture_exception(error)
+    except Exception:
+        return
 class BaseLoader:
     def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
                  ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
@@ -147,6 +165,13 @@ class BaseLoader:
         try:
             response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
         except EmptyDocument as e:
+            if e.code in LLM_OUTPUT_ERROR_CODES:
+                _capture_exception_for_sentry(e)
+                raise LoaderError(
+                    message=e.message,
+                    status=422,
+                    code=LLM_OUTPUT_ERROR_CODES[e.code],
+                ) from e
             logger.info(f"Empty document encountered: {e.message}")
             if self.fallback_ocr:
                 loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
@@ -317,6 +342,7 @@ class BaseLoader:
                 return YoutubeTranscriptLoaderWithLlm(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
             else:
                 return HtmlLoader(markdown_output=self.markdown_output)
+        # Handle markdown files based on extension or MIME type
         if file_extension in [".md", ".markdown"] or (
                 mime_type and mime_type.startswith("text/markdown")
         ):
@@ -344,7 +370,11 @@ class BaseLoader:
                     **kwargs,
                 )
             else:
-                raise ValueError(f"Unsupported MIME type: {mime_type}")
+                try:
+                    raise ValueError(f"Unsupported MIME type: {mime_type}")
+                except ValueError:
+                    logger.exception("Unsupported media type while initializing loader: %s", mime_type)
+                    raise
         elif self.validate_user_text(text=input):
             return PlainTextLoader(

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/youtube_llm.py RENAMED Viewed

@@ -384,7 +384,7 @@ class YoutubeTranscriptLoaderWithLlm:
                     fallback_model=self.final_fallback_model,
                     fallback_temperature=self.final_fallback_temperature,
                 )
-            raise Exception(f"Invalid argument: {e.message}; details={getattr(e, 'details', None)}")
+            raise e_tmp from e
         except errors.ServerError as e:
             logger.info("ServerError occurred with status %s and message: %s", e.status, e.message)

{polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: polytext
-Version: 0.2.2b2
+Version: 0.2.4
 Summary: Python utilities to simplify document files management
 Home-page: https://github.com/docsity/polytext
 Author: Matteo Senardi

{polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/SOURCES.txt RENAMED Viewed

@@ -56,6 +56,7 @@ polytext/utils/utils.py
 tests/test_audio_chunker.py
 tests/test_audio_comparison_helpers.py
 tests/test_audio_transcription_model_migration.py
+tests/test_base_loader_error_mapping.py
 tests/test_compare_audio_models.py
 tests/test_compare_document_ocr_to_text_models.py
 tests/test_compare_ocr_to_text_models.py
@@ -63,6 +64,7 @@ tests/test_compare_youtube_models.py
 tests/test_dowload_audio_from_youtube.py
 tests/test_dowload_audio_from_youtube_helpers.py
 tests/test_extracted_text_whitespace.py
+tests/test_gemini_quality_guards.py
 tests/test_get_audio_transcript_from_gcs.py
 tests/test_get_customized_pdf_from_markdown.py
 tests/test_get_document_ocr.py
@@ -79,7 +81,9 @@ tests/test_notebook_loader.py
 tests/test_ocr_fallbacks.py
 tests/test_ocr_image_descriptions.py
 tests/test_pain_text.py
+tests/test_python_version_metadata.py
 tests/test_split_audio_with_llm.py
 tests/test_xml_xbrl_loader.py
 tests/test_youtube_gemini_minimal_check.py
+tests/test_youtube_llm_fallbacks.py
 tests/test_youtube_transcript.py

{polytext-0.2.2b2 → polytext-0.2.4}/setup.py RENAMED Viewed

@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
 setup(
     name='polytext',
-    version='0.2.2b2',
+    version='0.2.4',
     url='https://github.com/docsity/polytext',
     # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
     license='MIT',

polytext-0.2.4/tests/test_base_loader_error_mapping.py ADDED Viewed

@@ -0,0 +1,81 @@
+import unittest
+from unittest.mock import Mock, patch
+from polytext.exceptions import EmptyDocument, LoaderError
+from polytext.loader.base import BaseLoader
+class _FailingLoader:
+    def __init__(self, error):
+        self.error = error
+    def load(self, input_path):
+        raise self.error
+class _FakeBaseLoader(BaseLoader):
+    def __init__(self, error, **kwargs):
+        super().__init__(**kwargs)
+        self.error = error
+    def initiate_storage(self, input):
+        return {}
+    def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
+        return _FailingLoader(self.error)
+class TestBaseLoaderErrorMapping(unittest.TestCase):
+    def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
+        cases = [
+            (995, "INVALID_ARGUMENT"),
+            (996, "RECITATION"),
+            (997, "REPETITIVE_OUTPUT"),
+            (999, "MAX_TOKENS"),
+        ]
+        for empty_document_code, expected_loader_code in cases:
+            with self.subTest(empty_document_code=empty_document_code):
+                loader = _FakeBaseLoader(
+                    EmptyDocument(
+                        message=f"diagnostic failure {empty_document_code}",
+                        code=empty_document_code,
+                    )
+                )
+                sentry_sdk = Mock()
+                with patch("polytext.loader.base.logger.info") as mock_info:
+                    with patch("polytext.loader.base.logger.exception") as mock_exception:
+                        with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
+                            with self.assertRaises(LoaderError) as error_context:
+                                loader.get_text(["dummy.txt"])
+                error = error_context.exception
+                self.assertEqual(error.status, 422)
+                self.assertEqual(error.code, expected_loader_code)
+                self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
+                mock_info.assert_not_called()
+                mock_exception.assert_not_called()
+                sentry_sdk.capture_exception.assert_called_once()
+                self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
+    def test_empty_or_too_short_documents_still_return_empty_response(self):
+        loader = _FakeBaseLoader(
+            EmptyDocument(
+                message="Document text with less than 400 characters",
+                code=998,
+            )
+        )
+        with patch("polytext.loader.base.logger.exception") as mock_exception:
+            response = loader.get_text(["empty.txt"])
+        self.assertEqual(response["text"], "")
+        self.assertEqual(response["completion_tokens"], 0)
+        self.assertEqual(response["prompt_tokens"], 0)
+        self.assertEqual(response["output_list"][0]["input"], "empty.txt")
+        mock_exception.assert_not_called()
+if __name__ == "__main__":
+    unittest.main()

polytext-0.2.4/tests/test_gemini_quality_guards.py ADDED Viewed

@@ -0,0 +1,31 @@
+import unittest
+from polytext.converter.gemini_quality_guards import tail_has_excessive_repetition
+class TestGeminiQualityGuards(unittest.TestCase):
+    def test_detects_consecutive_repeated_sentences_below_ratio_threshold(self):
+        text = (
+            "gli davamo nomi veri e falsi. "
+            "_Elio_ all'anagrafe e io gli gli dicevo _Roberto Gustativi_. "
+            "E questi scrivevano Roberto Gustativi. "
+            "E la soddisfazione perversa era andare a comprare il giornale. "
+            "È successo. "
+            "Sono stato. "
+            "Che è successo? "
+            "Siamo passati dal basement. "
+            "Siamo passati dal basement. "
+            "Siamo passati dal basement. "
+            "Siamo passati dal basement. "
+            "Il miglior finale di sempre. "
+            "Grazie, grazie, grazie, grazie, grazie, grazie, grazie. "
+            "E vi grazie."
+        )
+        self.assertTrue(
+            tail_has_excessive_repetition(text, tail_lines=200, threshold=0.35)
+        )
+if __name__ == "__main__":
+    unittest.main()

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_ocr_from_image.py RENAMED Viewed

@@ -38,7 +38,7 @@ def main():
     # local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9695.jpg"
     # local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9701.jpg"
-    local_file_path = "/Users/marcodelgiudice/Projects/polytext/chimicaformula.png"
+    local_file_path = "/Users/marcodelgiudice/Projects/polytext/gm1.png"
     try:
         start = time.time()

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_ocr_fallbacks.py RENAMED Viewed

@@ -4,7 +4,11 @@ from types import SimpleNamespace
 from unittest.mock import patch
 from polytext.converter.document_ocr_to_text import DocumentOCRToTextConverter
+from polytext.converter.document_ocr_to_text_azure_oai import (
+    DocumentOCRToTextConverter as AzureDocumentOCRToTextConverter,
+)
 from polytext.converter.ocr_to_text import OCRToTextConverter
+from polytext.exceptions import EmptyDocument
 def _make_response(
@@ -208,11 +212,12 @@ class TestOcrFallbacks(unittest.TestCase):
         )
         mock_client_cls.return_value = fake_client
-        converter = OCRToTextConverter(ocr_model="gemini-3.1-flash-lite-preview")
-        with tempfile.NamedTemporaryFile(suffix=".png") as temp_image:
-            temp_image.write(b"fake-image")
-            temp_image.flush()
-            result = converter.get_ocr(temp_image.name)
+        with patch("polytext.converter.ocr_to_text.OCR_FINAL_FALLBACK_MODEL", "gemini-2.0-flash"):
+            converter = OCRToTextConverter(ocr_model="gemini-3.1-flash-lite-preview")
+            with tempfile.NamedTemporaryFile(suffix=".png") as temp_image:
+                temp_image.write(b"fake-image")
+                temp_image.flush()
+                result = converter.get_ocr(temp_image.name)
         self.assertEqual(result["text"], "final fallback text")
         self.assertEqual(
@@ -263,6 +268,24 @@ class TestOcrFallbacks(unittest.TestCase):
         )
         self.assertEqual(fake_client.models.generate_content_temperatures, [0.0, 0.0, 1.0])
+    @patch("fitz.open")
+    def test_azure_document_ocr_no_pages_is_empty_or_too_short(self, mock_fitz_open):
+        mock_fitz_open.return_value = _FakePdf([])
+        converter = AzureDocumentOCRToTextConverter(
+            azure_endpoint="https://example.openai.azure.com",
+            azure_api_version="2024-10-21",
+        )
+        with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
+            temp_pdf.write(b"%PDF-1.4\n")
+            temp_pdf.flush()
+            with self.assertRaises(EmptyDocument) as error_context:
+                converter.get_document_ocr(temp_pdf.name)
+        self.assertEqual(error_context.exception.code, 998)
+        self.assertEqual(error_context.exception.message, "The document has no pages.")
 if __name__ == "__main__":
     unittest.main()

polytext-0.2.4/tests/test_python_version_metadata.py ADDED Viewed

@@ -0,0 +1,45 @@
+import ast
+import tomllib
+import unittest
+from pathlib import Path
+ROOT = Path(__file__).resolve().parents[1]
+def _setup_keyword(name):
+    tree = ast.parse((ROOT / "setup.py").read_text())
+    setup_call = next(
+        node
+        for node in ast.walk(tree)
+        if isinstance(node, ast.Call)
+        and getattr(node.func, "id", None) == "setup"
+    )
+    return next(
+        keyword.value
+        for keyword in setup_call.keywords
+        if keyword.arg == name
+    )
+class PythonVersionMetadataTest(unittest.TestCase):
+    def test_packaging_metadata_allows_python_311(self):
+        setup_python_requires = ast.literal_eval(_setup_keyword("python_requires"))
+        pyproject = tomllib.loads((ROOT / "pyproject.toml").read_text())
+        self.assertEqual(setup_python_requires, ">=3.11")
+        self.assertEqual(
+            pyproject["tool"]["poetry"]["dependencies"]["python"],
+            ">=3.11,<3.14",
+        )
+    def test_setup_classifiers_include_supported_python_versions(self):
+        classifiers = ast.literal_eval(_setup_keyword("classifiers"))
+        self.assertIn("Programming Language :: Python :: 3.11", classifiers)
+        self.assertIn("Programming Language :: Python :: 3.12", classifiers)
+        self.assertIn("Programming Language :: Python :: 3.13", classifiers)
+if __name__ == "__main__":
+    unittest.main()

polytext-0.2.4/tests/test_youtube_llm_fallbacks.py ADDED Viewed

@@ -0,0 +1,103 @@
+import unittest
+from types import SimpleNamespace
+from unittest.mock import patch
+from google.genai import errors as genai_errors
+from google.genai import types
+from polytext.loader.youtube_llm import YoutubeTranscriptLoaderWithLlm
+from polytext.exceptions import EmptyDocument
+def _make_response(text="full transcript"):
+    return SimpleNamespace(
+        text=text,
+        candidates=[SimpleNamespace(finish_reason="STOP")],
+        usage_metadata=SimpleNamespace(
+            candidates_token_count=3,
+            prompt_token_count=2,
+            total_token_count=5,
+        ),
+    )
+class _FakeModels:
+    def __init__(self, response):
+        self.response = response
+        self.generate_content_config = None
+        self.generate_content_model = None
+    def generate_content(self, model, contents, config):
+        self.generate_content_model = model
+        self.generate_content_config = config
+        if isinstance(self.response, Exception):
+            raise self.response
+        return self.response
+class _FakeClient:
+    def __init__(self, response):
+        self.models = _FakeModels(response)
+def _invalid_argument_error():
+    return genai_errors.ClientError(
+        400,
+        {
+            "error": {
+                "code": 400,
+                "message": "Request contains an invalid argument.",
+                "status": "INVALID_ARGUMENT",
+            }
+        },
+        None,
+    )
+def _long_transcript():
+    return " ".join(
+        f"This is transcript sentence number {index} with unique content."
+        for index in range(20)
+    )
+class TestYoutubeLlmFallbacks(unittest.TestCase):
+    @patch("polytext.loader.youtube_llm.genai.Client")
+    def test_invalid_argument_final_fallback_uses_original_temperature(self, mock_client_cls):
+        clients = [
+            _FakeClient(_invalid_argument_error()),
+            _FakeClient(_invalid_argument_error()),
+            _FakeClient(_make_response(_long_transcript())),
+        ]
+        mock_client_cls.side_effect = clients
+        loader = YoutubeTranscriptLoaderWithLlm()
+        result = loader.get_text_from_youtube("https://www.youtube.com/watch?v=example")
+        self.assertEqual(result["completion_model"], "models/gemini-2.5-flash")
+        self.assertEqual(clients[2].models.generate_content_config.temperature, 0.0)
+        self.assertEqual(
+            clients[2].models.generate_content_config.media_resolution,
+            types.MediaResolution.MEDIA_RESOLUTION_LOW,
+        )
+    @patch("polytext.loader.youtube_llm.genai.Client")
+    def test_invalid_argument_after_fallbacks_raises_empty_document_code_995(self, mock_client_cls):
+        clients = [
+            _FakeClient(_invalid_argument_error()),
+            _FakeClient(_invalid_argument_error()),
+            _FakeClient(_invalid_argument_error()),
+        ]
+        mock_client_cls.side_effect = clients
+        loader = YoutubeTranscriptLoaderWithLlm()
+        with self.assertRaises(EmptyDocument) as error_context:
+            loader.get_text_from_youtube("https://www.youtube.com/watch?v=example")
+        self.assertEqual(error_context.exception.code, 995)
+        self.assertIn("INVALID_ARGUMENT", error_context.exception.message)
+if __name__ == "__main__":
+    unittest.main()

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_youtube_transcript.py RENAMED Viewed

@@ -32,9 +32,9 @@ url = 'https://www.youtube.com/watch?v=L4as3tks4Js'  # basement alberto angela
 # url = 'https://www.youtube.com/watch?v=UabBYexBD4k'  # INM RAG 11 minuti, completato in 26 secondi con successo con gemini-3.1-flash-lite
-url = 'https://www.youtube.com/watch?v=96jN2OCOfLs'  # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
+#url = 'https://www.youtube.com/watch?v=96jN2OCOfLs'  # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
-# url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw'  # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
+url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw'  # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
 # url = 'https://www.youtube.com/watch?v=CM2CkNU9xR0'  # google antigravity 27 minuti, completato in 39 secondi con successo con gemini-3.1-flash-lite (146k token in input, 6k token in output)

{polytext-0.2.2b2 → polytext-0.2.4}/LICENSE RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/README.md RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/base.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/gemini_quality_guards.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/html_to_md.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/md_to_text.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/pdf.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/converter/text_to_md.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/exceptions/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/exceptions/base.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/generator/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/generator/pdf.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/audio.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/document.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/document_ocr.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/downloader/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/downloader/downloader.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/html.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/markdown.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/notebook.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/ocr.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/plain_text.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/video.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/xml_xbrl.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/loader/youtube.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/audio_chunker.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/text_merger.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/processor/transcript_chunker.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/ocr.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/text_merging.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/text_to_md.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/prompts/transcription.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/utils/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext/utils/utils.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/not-zip-safe RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/requires.txt RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/polytext.egg-info/top_level.txt RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/pyproject.toml RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/setup.cfg RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_audio_chunker.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_audio_comparison_helpers.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_audio_transcription_model_migration.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_audio_models.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_document_ocr_to_text_models.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_ocr_to_text_models.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_compare_youtube_models.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube_helpers.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_extracted_text_whitespace.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_audio_transcript_from_gcs.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_customized_pdf_from_markdown.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_ocr.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_ocr_azure_oai.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_text.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_document_text_from_gcs.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_text_from_markdown.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_get_video_transcript_from_gcs.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_library.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_markdown_loader_gzip.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_markitdown_html.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_notebook_loader.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_ocr_image_descriptions.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_pain_text.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_split_audio_with_llm.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_xml_xbrl_loader.py RENAMED Viewed

File without changes

{polytext-0.2.2b2 → polytext-0.2.4}/tests/test_youtube_gemini_minimal_check.py RENAMED Viewed

File without changes

polytext 0.2.2b2__tar.gz → 0.2.4__tar.gz

polytext 0.2.2b2.tar.gz → 0.2.4.tar.gz