polytext 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.3 → polytext-0.2.5}/PKG-INFO +1 -1
- {polytext-0.2.3 → polytext-0.2.5}/polytext/__init__.py +27 -2
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/audio_to_text.py +1 -1
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/ocr_to_text.py +1 -1
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/pdf.py +16 -3
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/base.py +44 -18
- {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/PKG-INFO +1 -1
- {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/SOURCES.txt +1 -0
- {polytext-0.2.3 → polytext-0.2.5}/setup.py +1 -1
- polytext-0.2.5/tests/test_base_loader_error_mapping.py +148 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_audio_transcript_from_gcs.py +1 -1
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_ocr_from_image.py +1 -1
- polytext-0.2.5/tests/test_pdf_conversion_error.py +43 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_youtube_transcript.py +2 -2
- polytext-0.2.3/tests/test_base_loader_error_mapping.py +0 -79
- {polytext-0.2.3 → polytext-0.2.5}/LICENSE +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/README.md +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/base.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/document_ocr_to_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/gemini_quality_guards.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/text_to_md.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/video_to_audio.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/audio.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/document.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/document_ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/html.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/video.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/youtube_llm.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/text_merger.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/transcription.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext/utils/utils.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/pyproject.toml +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/setup.cfg +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_audio_transcription_model_migration.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_document_ocr_to_text_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_extracted_text_whitespace.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_gemini_quality_guards.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_library.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_ocr_fallbacks.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_ocr_image_descriptions.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_pain_text.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_python_version_metadata.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_xml_xbrl_loader.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_youtube_gemini_minimal_check.py +0 -0
- {polytext-0.2.3 → polytext-0.2.5}/tests/test_youtube_llm_fallbacks.py +0 -0
|
@@ -3,11 +3,36 @@ import os
|
|
|
3
3
|
import logging
|
|
4
4
|
import dotenv
|
|
5
5
|
|
|
6
|
+
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError, LoaderError
|
|
7
|
+
|
|
6
8
|
logger = logging.getLogger(__name__)
|
|
7
9
|
|
|
8
10
|
# Load environment variables
|
|
9
11
|
dotenv.load_dotenv()
|
|
10
12
|
|
|
13
|
+
|
|
14
|
+
def _filter_expected_loader_errors(event, hint):
|
|
15
|
+
error = None
|
|
16
|
+
if hint:
|
|
17
|
+
exc_info = hint.get("exc_info")
|
|
18
|
+
if exc_info:
|
|
19
|
+
error = exc_info[1]
|
|
20
|
+
else:
|
|
21
|
+
error = hint.get("original_exception")
|
|
22
|
+
|
|
23
|
+
if isinstance(error, LoaderError) and error.code == "NO_TEXT_DETECTED":
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
exception_values = (event or {}).get("exception", {}).get("values", [])
|
|
27
|
+
for exception_value in exception_values:
|
|
28
|
+
exception_type = exception_value.get("type") or ""
|
|
29
|
+
exception_message = exception_value.get("value")
|
|
30
|
+
if exception_type.endswith("LoaderError") and exception_message == "No text detected":
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
return event
|
|
34
|
+
|
|
35
|
+
|
|
11
36
|
# Initialize Sentry if DSN is configured
|
|
12
37
|
sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
|
|
13
38
|
if sentry_dsn:
|
|
@@ -18,6 +43,7 @@ if sentry_dsn:
|
|
|
18
43
|
environment=os.getenv('ENV', 'prod'),
|
|
19
44
|
traces_sample_rate=1.0,
|
|
20
45
|
profiles_sample_rate=1.0,
|
|
46
|
+
before_send=_filter_expected_loader_errors,
|
|
21
47
|
)
|
|
22
48
|
logger.info("Sentry monitoring initialized")
|
|
23
49
|
except ImportError:
|
|
@@ -26,7 +52,6 @@ if sentry_dsn:
|
|
|
26
52
|
|
|
27
53
|
from .converter.pdf import convert_to_pdf, DocumentConverter
|
|
28
54
|
from .loader.document import DocumentLoader
|
|
29
|
-
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
|
|
30
55
|
from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
|
|
31
56
|
|
|
32
57
|
__all__ = [
|
|
@@ -38,4 +63,4 @@ __all__ = [
|
|
|
38
63
|
'ConversionError',
|
|
39
64
|
'get_customized_pdf_from_markdown',
|
|
40
65
|
'PDFGenerator'
|
|
41
|
-
]
|
|
66
|
+
]
|
|
@@ -46,7 +46,7 @@ AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHO
|
|
|
46
46
|
AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
|
|
47
47
|
AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
48
48
|
AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
|
|
49
|
-
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-
|
|
49
|
+
AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
|
|
50
50
|
AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
|
|
51
51
|
NO_HUMAN_SPEECH_MARKER = "no human speech detected"
|
|
52
52
|
|
|
@@ -33,7 +33,7 @@ OCR_TAIL_REPETITION_THRESHOLD = float(os.getenv("OCR_TAIL_REPETITION_THRESHOLD",
|
|
|
33
33
|
OCR_FALLBACK_SOURCE_PATTERN = os.getenv("OCR_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
|
|
34
34
|
OCR_FALLBACK_MODEL = os.getenv("OCR_FALLBACK_MODEL", "gemini-3-flash-preview")
|
|
35
35
|
OCR_FALLBACK_TEMPERATURE = float(os.getenv("OCR_FALLBACK_TEMPERATURE", "1.0"))
|
|
36
|
-
OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-
|
|
36
|
+
OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
def compress_and_convert_image(input_path: str, target_size=1):
|
|
@@ -127,11 +127,24 @@ class DocumentConverter:
|
|
|
127
127
|
]
|
|
128
128
|
|
|
129
129
|
try:
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
subprocess.run(
|
|
131
|
+
command,
|
|
132
|
+
stdout=subprocess.PIPE,
|
|
133
|
+
stderr=subprocess.PIPE,
|
|
134
|
+
text=True,
|
|
135
|
+
check=True,
|
|
136
|
+
)
|
|
132
137
|
logger.info(f"Conversion successful: '{output_file}'")
|
|
133
138
|
except subprocess.CalledProcessError as e:
|
|
139
|
+
output_parts = []
|
|
140
|
+
if e.stdout:
|
|
141
|
+
output_parts.append(f"stdout: {e.stdout.strip()}")
|
|
142
|
+
if e.stderr:
|
|
143
|
+
output_parts.append(f"stderr: {e.stderr.strip()}")
|
|
144
|
+
details = "\n".join(output_parts)
|
|
134
145
|
error_msg = f"Error during conversion: {e}"
|
|
146
|
+
if details:
|
|
147
|
+
error_msg = f"{error_msg}\n{details}"
|
|
135
148
|
logger.info(error_msg)
|
|
136
149
|
raise ConversionError(error_msg, e)
|
|
137
150
|
|
|
@@ -253,4 +266,4 @@ class DocumentConverter:
|
|
|
253
266
|
# except Exception as e:
|
|
254
267
|
# error_msg = f"Error during PDF conversion: {str(e)}"
|
|
255
268
|
# logger.error(error_msg)
|
|
256
|
-
# raise ConversionError(error_msg)
|
|
269
|
+
# raise ConversionError(error_msg)
|
|
@@ -25,7 +25,7 @@ from ..loader import (
|
|
|
25
25
|
XmlXbrlLoader,
|
|
26
26
|
NotebookLoader
|
|
27
27
|
)
|
|
28
|
-
from ..exceptions import EmptyDocument, LoaderTimeoutError, LoaderError
|
|
28
|
+
from ..exceptions import ConversionError, EmptyDocument, LoaderTimeoutError, LoaderError
|
|
29
29
|
from ..utils.utils import clean_extracted_text_whitespace, remove_markdown_strip
|
|
30
30
|
|
|
31
31
|
# External imports
|
|
@@ -46,6 +46,10 @@ LLM_OUTPUT_ERROR_CODES = {
|
|
|
46
46
|
997: "REPETITIVE_OUTPUT",
|
|
47
47
|
999: "MAX_TOKENS",
|
|
48
48
|
}
|
|
49
|
+
EMPTY_DOCUMENT_LOADER_ERROR_CODES = {
|
|
50
|
+
**LLM_OUTPUT_ERROR_CODES,
|
|
51
|
+
998: "NO_TEXT_DETECTED",
|
|
52
|
+
}
|
|
49
53
|
|
|
50
54
|
|
|
51
55
|
def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
@@ -55,6 +59,32 @@ def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
|
55
59
|
return value.strip().lower() in {"1", "true", "yes", "y", "on"}
|
|
56
60
|
|
|
57
61
|
|
|
62
|
+
def _capture_exception_for_sentry(error: Exception) -> None:
|
|
63
|
+
try:
|
|
64
|
+
import sentry_sdk
|
|
65
|
+
except ImportError:
|
|
66
|
+
return
|
|
67
|
+
|
|
68
|
+
try:
|
|
69
|
+
sentry_sdk.capture_exception(error)
|
|
70
|
+
except Exception:
|
|
71
|
+
return
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _raise_empty_document_loader_error(error: EmptyDocument) -> None:
|
|
75
|
+
loader_error_code = EMPTY_DOCUMENT_LOADER_ERROR_CODES.get(error.code, "NO_TEXT_DETECTED")
|
|
76
|
+
message = error.message
|
|
77
|
+
if loader_error_code == "NO_TEXT_DETECTED":
|
|
78
|
+
message = "No text detected"
|
|
79
|
+
else:
|
|
80
|
+
_capture_exception_for_sentry(error)
|
|
81
|
+
raise LoaderError(
|
|
82
|
+
message=message,
|
|
83
|
+
status=422,
|
|
84
|
+
code=loader_error_code,
|
|
85
|
+
) from error
|
|
86
|
+
|
|
87
|
+
|
|
58
88
|
class BaseLoader:
|
|
59
89
|
def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
|
|
60
90
|
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
@@ -153,28 +183,24 @@ class BaseLoader:
|
|
|
153
183
|
try:
|
|
154
184
|
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
155
185
|
except EmptyDocument as e:
|
|
156
|
-
logger.info(f"Empty document encountered: {e.message}")
|
|
157
186
|
if e.code in LLM_OUTPUT_ERROR_CODES:
|
|
158
|
-
|
|
159
|
-
"Raising LoaderError: status=422 code=%s original_empty_document_code=%s message=%s",
|
|
160
|
-
LLM_OUTPUT_ERROR_CODES[e.code],
|
|
161
|
-
e.code,
|
|
162
|
-
e.message,
|
|
163
|
-
)
|
|
164
|
-
raise LoaderError(
|
|
165
|
-
message=e.message,
|
|
166
|
-
status=422,
|
|
167
|
-
code=LLM_OUTPUT_ERROR_CODES[e.code],
|
|
168
|
-
) from e
|
|
187
|
+
_raise_empty_document_loader_error(e)
|
|
169
188
|
if self.fallback_ocr:
|
|
170
189
|
loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
|
|
171
190
|
llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
|
|
172
|
-
|
|
191
|
+
try:
|
|
192
|
+
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
193
|
+
except EmptyDocument as fallback_error:
|
|
194
|
+
_raise_empty_document_loader_error(fallback_error)
|
|
173
195
|
else:
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
196
|
+
_raise_empty_document_loader_error(e)
|
|
197
|
+
except ConversionError as e:
|
|
198
|
+
_capture_exception_for_sentry(e)
|
|
199
|
+
raise LoaderError(
|
|
200
|
+
message=e.message,
|
|
201
|
+
status=422,
|
|
202
|
+
code="CONVERSION_ERROR",
|
|
203
|
+
) from e
|
|
178
204
|
except LoaderTimeoutError:
|
|
179
205
|
raise LoaderError(message="timeout gemini", status=504, code="TIMEOUT")
|
|
180
206
|
except (httpx.ReadTimeout,
|
|
@@ -81,6 +81,7 @@ tests/test_notebook_loader.py
|
|
|
81
81
|
tests/test_ocr_fallbacks.py
|
|
82
82
|
tests/test_ocr_image_descriptions.py
|
|
83
83
|
tests/test_pain_text.py
|
|
84
|
+
tests/test_pdf_conversion_error.py
|
|
84
85
|
tests/test_python_version_metadata.py
|
|
85
86
|
tests/test_split_audio_with_llm.py
|
|
86
87
|
tests/test_xml_xbrl_loader.py
|
|
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
|
|
|
51
51
|
|
|
52
52
|
setup(
|
|
53
53
|
name='polytext',
|
|
54
|
-
version='0.2.
|
|
54
|
+
version='0.2.5',
|
|
55
55
|
url='https://github.com/docsity/polytext',
|
|
56
56
|
# download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
|
|
57
57
|
license='MIT',
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from unittest.mock import Mock, patch
|
|
3
|
+
|
|
4
|
+
from polytext.exceptions import ConversionError, EmptyDocument, LoaderError
|
|
5
|
+
from polytext.loader.base import BaseLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _FailingLoader:
|
|
9
|
+
def __init__(self, error):
|
|
10
|
+
self.error = error
|
|
11
|
+
|
|
12
|
+
def load(self, input_path):
|
|
13
|
+
raise self.error
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _FakeBaseLoader(BaseLoader):
|
|
17
|
+
def __init__(self, error, **kwargs):
|
|
18
|
+
super().__init__(**kwargs)
|
|
19
|
+
self.error = error
|
|
20
|
+
|
|
21
|
+
def initiate_storage(self, input):
|
|
22
|
+
return {}
|
|
23
|
+
|
|
24
|
+
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
25
|
+
return _FailingLoader(self.error)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _FallbackFailingBaseLoader(BaseLoader):
|
|
29
|
+
def __init__(self, initial_error, fallback_error, **kwargs):
|
|
30
|
+
super().__init__(**kwargs)
|
|
31
|
+
self.initial_error = initial_error
|
|
32
|
+
self.fallback_error = fallback_error
|
|
33
|
+
|
|
34
|
+
def initiate_storage(self, input):
|
|
35
|
+
return {}
|
|
36
|
+
|
|
37
|
+
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
38
|
+
if is_document_fallback:
|
|
39
|
+
return _FailingLoader(self.fallback_error)
|
|
40
|
+
return _FailingLoader(self.initial_error)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TestBaseLoaderErrorMapping(unittest.TestCase):
|
|
44
|
+
def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
|
|
45
|
+
cases = [
|
|
46
|
+
(995, "INVALID_ARGUMENT"),
|
|
47
|
+
(996, "RECITATION"),
|
|
48
|
+
(997, "REPETITIVE_OUTPUT"),
|
|
49
|
+
(999, "MAX_TOKENS"),
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
for empty_document_code, expected_loader_code in cases:
|
|
53
|
+
with self.subTest(empty_document_code=empty_document_code):
|
|
54
|
+
loader = _FakeBaseLoader(
|
|
55
|
+
EmptyDocument(
|
|
56
|
+
message=f"diagnostic failure {empty_document_code}",
|
|
57
|
+
code=empty_document_code,
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
sentry_sdk = Mock()
|
|
62
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
63
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
64
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
65
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
66
|
+
loader.get_text(["dummy.txt"])
|
|
67
|
+
|
|
68
|
+
error = error_context.exception
|
|
69
|
+
self.assertEqual(error.status, 422)
|
|
70
|
+
self.assertEqual(error.code, expected_loader_code)
|
|
71
|
+
self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
|
|
72
|
+
mock_info.assert_not_called()
|
|
73
|
+
mock_exception.assert_not_called()
|
|
74
|
+
sentry_sdk.capture_exception.assert_called_once()
|
|
75
|
+
self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
|
|
76
|
+
|
|
77
|
+
def test_empty_or_too_short_documents_are_raised_as_loader_errors(self):
|
|
78
|
+
loader = _FakeBaseLoader(
|
|
79
|
+
EmptyDocument(
|
|
80
|
+
message="Document text with less than 400 characters",
|
|
81
|
+
code=998,
|
|
82
|
+
)
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
sentry_sdk = Mock()
|
|
86
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
87
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
88
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
89
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
90
|
+
loader.get_text(["empty.txt"])
|
|
91
|
+
|
|
92
|
+
error = error_context.exception
|
|
93
|
+
self.assertEqual(error.status, 422)
|
|
94
|
+
self.assertEqual(error.code, "NO_TEXT_DETECTED")
|
|
95
|
+
self.assertEqual(error.message, "No text detected")
|
|
96
|
+
mock_info.assert_not_called()
|
|
97
|
+
mock_exception.assert_not_called()
|
|
98
|
+
sentry_sdk.capture_exception.assert_not_called()
|
|
99
|
+
|
|
100
|
+
def test_empty_document_after_fallback_ocr_is_raised_as_loader_error(self):
|
|
101
|
+
loader = _FallbackFailingBaseLoader(
|
|
102
|
+
initial_error=EmptyDocument(
|
|
103
|
+
message="No text detected",
|
|
104
|
+
code=998,
|
|
105
|
+
),
|
|
106
|
+
fallback_error=EmptyDocument(
|
|
107
|
+
message="No text extracted from OCR fallback",
|
|
108
|
+
),
|
|
109
|
+
fallback_ocr=True,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
sentry_sdk = Mock()
|
|
113
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
114
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
115
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
116
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
117
|
+
loader.get_text(["empty.pdf"])
|
|
118
|
+
|
|
119
|
+
error = error_context.exception
|
|
120
|
+
self.assertEqual(error.status, 422)
|
|
121
|
+
self.assertEqual(error.code, "NO_TEXT_DETECTED")
|
|
122
|
+
self.assertEqual(error.message, "No text detected")
|
|
123
|
+
mock_info.assert_not_called()
|
|
124
|
+
mock_exception.assert_not_called()
|
|
125
|
+
sentry_sdk.capture_exception.assert_not_called()
|
|
126
|
+
|
|
127
|
+
def test_conversion_error_is_raised_as_loader_error(self):
|
|
128
|
+
conversion_error = ConversionError("LibreOffice failed")
|
|
129
|
+
loader = _FakeBaseLoader(conversion_error)
|
|
130
|
+
|
|
131
|
+
sentry_sdk = Mock()
|
|
132
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
133
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
134
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
135
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
136
|
+
loader.get_text(["document.docx"])
|
|
137
|
+
|
|
138
|
+
error = error_context.exception
|
|
139
|
+
self.assertEqual(error.status, 422)
|
|
140
|
+
self.assertEqual(error.code, "CONVERSION_ERROR")
|
|
141
|
+
self.assertEqual(error.message, "LibreOffice failed")
|
|
142
|
+
mock_info.assert_not_called()
|
|
143
|
+
mock_exception.assert_not_called()
|
|
144
|
+
sentry_sdk.capture_exception.assert_called_once_with(conversion_error)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
if __name__ == "__main__":
|
|
148
|
+
unittest.main()
|
|
@@ -33,7 +33,7 @@ def main():
|
|
|
33
33
|
# Define document data
|
|
34
34
|
file_path = "gcs://opit-da-test-ml-ai-store-bucket/learning_resources/course_id=406/module_id=2658/id=31427/8434.mp4"
|
|
35
35
|
|
|
36
|
-
local_file_path = "/Users/marcodelgiudice/
|
|
36
|
+
local_file_path = "/Users/marcodelgiudice/Downloads/mq0264a5-5a073da227de0ee462bd6de8731d586a1dcc635f.pdf"
|
|
37
37
|
|
|
38
38
|
# Call get_text method
|
|
39
39
|
start = time.time()
|
|
@@ -38,7 +38,7 @@ def main():
|
|
|
38
38
|
|
|
39
39
|
# local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9695.jpg"
|
|
40
40
|
# local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9701.jpg"
|
|
41
|
-
local_file_path = "/Users/marcodelgiudice/Projects/polytext/
|
|
41
|
+
local_file_path = "/Users/marcodelgiudice/Projects/polytext/gm1.png"
|
|
42
42
|
|
|
43
43
|
try:
|
|
44
44
|
start = time.time()
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import tempfile
|
|
3
|
+
import unittest
|
|
4
|
+
from unittest.mock import patch
|
|
5
|
+
|
|
6
|
+
from polytext.converter.pdf import DocumentConverter
|
|
7
|
+
from polytext.exceptions import ConversionError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestPdfConversionError(unittest.TestCase):
|
|
11
|
+
@patch.object(DocumentConverter, "check_libreoffice_installed", return_value=True)
|
|
12
|
+
@patch("polytext.converter.pdf.subprocess.run")
|
|
13
|
+
@patch("polytext.converter.pdf.subprocess.check_call")
|
|
14
|
+
def test_conversion_error_includes_libreoffice_output(
|
|
15
|
+
self,
|
|
16
|
+
mock_check_call,
|
|
17
|
+
mock_run,
|
|
18
|
+
_mock_check_libreoffice,
|
|
19
|
+
):
|
|
20
|
+
libreoffice_error = subprocess.CalledProcessError(
|
|
21
|
+
returncode=1,
|
|
22
|
+
cmd=["libreoffice", "--convert-to", "pdf"],
|
|
23
|
+
output="convert input.docx -> output.pdf",
|
|
24
|
+
stderr="Unspecified Application Error",
|
|
25
|
+
)
|
|
26
|
+
mock_check_call.side_effect = libreoffice_error
|
|
27
|
+
mock_run.side_effect = libreoffice_error
|
|
28
|
+
|
|
29
|
+
with tempfile.NamedTemporaryFile(suffix=".docx") as input_file:
|
|
30
|
+
with tempfile.NamedTemporaryFile(suffix=".pdf") as output_file:
|
|
31
|
+
with self.assertRaises(ConversionError) as error_context:
|
|
32
|
+
DocumentConverter().convert_to_pdf(
|
|
33
|
+
input_file=input_file.name,
|
|
34
|
+
original_file=input_file.name,
|
|
35
|
+
output_file=output_file.name,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
self.assertIn("Unspecified Application Error", error_context.exception.message)
|
|
39
|
+
self.assertIn("convert input.docx -> output.pdf", error_context.exception.message)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
if __name__ == "__main__":
|
|
43
|
+
unittest.main()
|
|
@@ -32,9 +32,9 @@ url = 'https://www.youtube.com/watch?v=L4as3tks4Js' # basement alberto angela
|
|
|
32
32
|
|
|
33
33
|
# url = 'https://www.youtube.com/watch?v=UabBYexBD4k' # INM RAG 11 minuti, completato in 26 secondi con successo con gemini-3.1-flash-lite
|
|
34
34
|
|
|
35
|
-
url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
|
|
35
|
+
#url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
|
|
36
36
|
|
|
37
|
-
|
|
37
|
+
url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
|
|
38
38
|
|
|
39
39
|
# url = 'https://www.youtube.com/watch?v=CM2CkNU9xR0' # google antigravity 27 minuti, completato in 39 secondi con successo con gemini-3.1-flash-lite (146k token in input, 6k token in output)
|
|
40
40
|
|
|
@@ -1,79 +0,0 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
from unittest.mock import patch
|
|
3
|
-
|
|
4
|
-
from polytext.exceptions import EmptyDocument, LoaderError
|
|
5
|
-
from polytext.loader.base import BaseLoader
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class _FailingLoader:
|
|
9
|
-
def __init__(self, error):
|
|
10
|
-
self.error = error
|
|
11
|
-
|
|
12
|
-
def load(self, input_path):
|
|
13
|
-
raise self.error
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class _FakeBaseLoader(BaseLoader):
|
|
17
|
-
def __init__(self, error, **kwargs):
|
|
18
|
-
super().__init__(**kwargs)
|
|
19
|
-
self.error = error
|
|
20
|
-
|
|
21
|
-
def initiate_storage(self, input):
|
|
22
|
-
return {}
|
|
23
|
-
|
|
24
|
-
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
25
|
-
return _FailingLoader(self.error)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class TestBaseLoaderErrorMapping(unittest.TestCase):
|
|
29
|
-
def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
|
|
30
|
-
cases = [
|
|
31
|
-
(995, "INVALID_ARGUMENT"),
|
|
32
|
-
(996, "RECITATION"),
|
|
33
|
-
(997, "REPETITIVE_OUTPUT"),
|
|
34
|
-
(999, "MAX_TOKENS"),
|
|
35
|
-
]
|
|
36
|
-
|
|
37
|
-
for empty_document_code, expected_loader_code in cases:
|
|
38
|
-
with self.subTest(empty_document_code=empty_document_code):
|
|
39
|
-
loader = _FakeBaseLoader(
|
|
40
|
-
EmptyDocument(
|
|
41
|
-
message=f"diagnostic failure {empty_document_code}",
|
|
42
|
-
code=empty_document_code,
|
|
43
|
-
)
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
47
|
-
with self.assertRaises(LoaderError) as error_context:
|
|
48
|
-
loader.get_text(["dummy.txt"])
|
|
49
|
-
|
|
50
|
-
error = error_context.exception
|
|
51
|
-
self.assertEqual(error.status, 422)
|
|
52
|
-
self.assertEqual(error.code, expected_loader_code)
|
|
53
|
-
self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
|
|
54
|
-
mock_exception.assert_called_once()
|
|
55
|
-
self.assertIn("Raising LoaderError", mock_exception.call_args.args[0])
|
|
56
|
-
self.assertEqual(mock_exception.call_args.args[1], expected_loader_code)
|
|
57
|
-
self.assertEqual(mock_exception.call_args.args[2], empty_document_code)
|
|
58
|
-
self.assertEqual(mock_exception.call_args.args[3], f"diagnostic failure {empty_document_code}")
|
|
59
|
-
|
|
60
|
-
def test_empty_or_too_short_documents_still_return_empty_response(self):
|
|
61
|
-
loader = _FakeBaseLoader(
|
|
62
|
-
EmptyDocument(
|
|
63
|
-
message="Document text with less than 400 characters",
|
|
64
|
-
code=998,
|
|
65
|
-
)
|
|
66
|
-
)
|
|
67
|
-
|
|
68
|
-
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
69
|
-
response = loader.get_text(["empty.txt"])
|
|
70
|
-
|
|
71
|
-
self.assertEqual(response["text"], "")
|
|
72
|
-
self.assertEqual(response["completion_tokens"], 0)
|
|
73
|
-
self.assertEqual(response["prompt_tokens"], 0)
|
|
74
|
-
self.assertEqual(response["output_list"][0]["input"], "empty.txt")
|
|
75
|
-
mock_exception.assert_not_called()
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
if __name__ == "__main__":
|
|
79
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|