polytext 0.2.3__tar.gz → 0.2.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.3 → polytext-0.2.4}/PKG-INFO +1 -1
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/audio_to_text.py +1 -1
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/ocr_to_text.py +1 -1
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/base.py +14 -7
- {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/PKG-INFO +1 -1
- {polytext-0.2.3 → polytext-0.2.4}/setup.py +1 -1
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_base_loader_error_mapping.py +11 -9
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_ocr_from_image.py +1 -1
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_youtube_transcript.py +2 -2
- {polytext-0.2.3 → polytext-0.2.4}/LICENSE +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/README.md +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/base.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/document_ocr_to_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/gemini_quality_guards.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/pdf.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/text_to_md.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/video_to_audio.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/audio.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/document.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/document_ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/html.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/video.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/youtube_llm.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/text_merger.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/transcription.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext/utils/utils.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/SOURCES.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/pyproject.toml +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/setup.cfg +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_audio_transcription_model_migration.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_document_ocr_to_text_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_extracted_text_whitespace.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_gemini_quality_guards.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_audio_transcript_from_gcs.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_library.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_ocr_fallbacks.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_ocr_image_descriptions.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_pain_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_python_version_metadata.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_xml_xbrl_loader.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_youtube_gemini_minimal_check.py +0 -0
- {polytext-0.2.3 → polytext-0.2.4}/tests/test_youtube_llm_fallbacks.py +0 -0
|
@@ -46,7 +46,7 @@ AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHO
|
|
|
46
46
|
AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
|
|
47
47
|
AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
48
48
|
AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
|
|
49
|
-
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-
|
|
49
|
+
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
|
|
50
50
|
AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
|
|
51
51
|
NO_HUMAN_SPEECH_MARKER = "no human speech detected"
|
|
52
52
|
|
|
@@ -33,7 +33,7 @@ OCR_TAIL_REPETITION_THRESHOLD = float(os.getenv("OCR_TAIL_REPETITION_THRESHOLD",
|
|
|
33
33
|
OCR_FALLBACK_SOURCE_PATTERN = os.getenv("OCR_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
|
|
34
34
|
OCR_FALLBACK_MODEL = os.getenv("OCR_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
35
35
|
OCR_FALLBACK_TEMPERATURE = float(os.getenv("OCR_FALLBACK_TEMPERATURE", "1.0"))
|
|
36
|
-
OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-
|
|
36
|
+
OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
def compress_and_convert_image(input_path: str, target_size=1):
|
|
@@ -55,6 +55,18 @@ def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
|
55
55
|
return value.strip().lower() in {"1", "true", "yes", "y", "on"}
|
|
56
56
|
|
|
57
57
|
|
|
58
|
+
def _capture_exception_for_sentry(error: Exception) -> None:
|
|
59
|
+
try:
|
|
60
|
+
import sentry_sdk
|
|
61
|
+
except ImportError:
|
|
62
|
+
return
|
|
63
|
+
|
|
64
|
+
try:
|
|
65
|
+
sentry_sdk.capture_exception(error)
|
|
66
|
+
except Exception:
|
|
67
|
+
return
|
|
68
|
+
|
|
69
|
+
|
|
58
70
|
class BaseLoader:
|
|
59
71
|
def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
|
|
60
72
|
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
@@ -153,19 +165,14 @@ class BaseLoader:
|
|
|
153
165
|
try:
|
|
154
166
|
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
155
167
|
except EmptyDocument as e:
|
|
156
|
-
logger.info(f"Empty document encountered: {e.message}")
|
|
157
168
|
if e.code in LLM_OUTPUT_ERROR_CODES:
|
|
158
|
-
logger.exception(
|
159
|
-
"Raising LoaderError: status=422 code=%s original_empty_document_code=%s message=%s",
|
|
160
|
-
LLM_OUTPUT_ERROR_CODES[e.code],
|
|
161
|
-
e.code,
|
|
162
|
-
e.message,
|
|
163
|
-
)
|
|
169
|
+
_capture_exception_for_sentry(e)
|
|
164
170
|
raise LoaderError(
|
|
165
171
|
message=e.message,
|
|
166
172
|
status=422,
|
|
167
173
|
code=LLM_OUTPUT_ERROR_CODES[e.code],
|
|
168
174
|
) from e
|
|
175
|
+
logger.info(f"Empty document encountered: {e.message}")
|
|
169
176
|
if self.fallback_ocr:
|
|
170
177
|
loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
|
|
171
178
|
llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
|
|
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
|
|
|
51
51
|
|
|
52
52
|
setup(
|
|
53
53
|
name='polytext',
|
|
54
|
-
version='0.2.3',
|
|
54
|
+
version='0.2.4',
|
|
55
55
|
url='https://github.com/docsity/polytext',
|
|
56
56
|
# download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
|
|
57
57
|
license='MIT',
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import unittest
|
|
2
|
-
from unittest.mock import patch
|
|
2
|
+
from unittest.mock import Mock, patch
|
|
3
3
|
|
|
4
4
|
from polytext.exceptions import EmptyDocument, LoaderError
|
|
5
5
|
from polytext.loader.base import BaseLoader
|
|
@@ -43,19 +43,21 @@ class TestBaseLoaderErrorMapping(unittest.TestCase):
|
|
|
43
43
|
)
|
|
44
44
|
)
|
|
45
45
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
46
|
+
sentry_sdk = Mock()
|
|
47
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
48
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
49
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
50
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
51
|
+
loader.get_text(["dummy.txt"])
|
|
49
52
|
|
|
50
53
|
error = error_context.exception
|
|
51
54
|
self.assertEqual(error.status, 422)
|
|
52
55
|
self.assertEqual(error.code, expected_loader_code)
|
|
53
56
|
self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
self.
|
|
58
|
-
self.assertEqual(mock_exception.call_args.args[3], f"diagnostic failure {empty_document_code}")
|
|
57
|
+
mock_info.assert_not_called()
|
|
58
|
+
mock_exception.assert_not_called()
|
|
59
|
+
sentry_sdk.capture_exception.assert_called_once()
|
|
60
|
+
self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
|
|
59
61
|
|
|
60
62
|
def test_empty_or_too_short_documents_still_return_empty_response(self):
|
|
61
63
|
loader = _FakeBaseLoader(
|
|
@@ -38,7 +38,7 @@ def main():
|
|
|
38
38
|
|
|
39
39
|
# local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9695.jpg"
|
|
40
40
|
# local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9701.jpg"
|
|
41
|
-
local_file_path = "/Users/marcodelgiudice/Projects/polytext/
|
|
41
|
+
local_file_path = "/Users/marcodelgiudice/Projects/polytext/gm1.png"
|
|
42
42
|
|
|
43
43
|
try:
|
|
44
44
|
start = time.time()
|
|
@@ -32,9 +32,9 @@ url = 'https://www.youtube.com/watch?v=L4as3tks4Js' # basement alberto angela
|
|
|
32
32
|
|
|
33
33
|
# url = 'https://www.youtube.com/watch?v=UabBYexBD4k' # INM RAG 11 minuti, completato in 26 secondi con successo con gemini-3.1-flash-lite
|
|
34
34
|
|
|
35
|
-
url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
|
|
35
|
+
#url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
|
|
38
38
|
|
|
39
39
|
# url = 'https://www.youtube.com/watch?v=CM2CkNU9xR0' # google antigravity 27 minuti, completato in 39 secondi con successo con gemini-3.1-flash-lite (146k token in input, 6k token in output)
|
|
40
40
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|