polytext 0.2.4__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.4 → polytext-0.2.5}/PKG-INFO +1 -1
- {polytext-0.2.4 → polytext-0.2.5}/polytext/__init__.py +27 -2
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/pdf.py +16 -3
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/base.py +32 -13
- {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/PKG-INFO +1 -1
- {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/SOURCES.txt +1 -0
- {polytext-0.2.4 → polytext-0.2.5}/setup.py +1 -1
- polytext-0.2.5/tests/test_base_loader_error_mapping.py +148 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_audio_transcript_from_gcs.py +1 -1
- polytext-0.2.5/tests/test_pdf_conversion_error.py +43 -0
- polytext-0.2.4/tests/test_base_loader_error_mapping.py +0 -81
- {polytext-0.2.4 → polytext-0.2.5}/LICENSE +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/README.md +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/audio_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/base.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/document_ocr_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/gemini_quality_guards.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/ocr_to_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/text_to_md.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/video_to_audio.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/audio.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/document.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/document_ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/html.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/video.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/youtube_llm.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/text_merger.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/transcription.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext/utils/utils.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/pyproject.toml +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/setup.cfg +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_audio_transcription_model_migration.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_document_ocr_to_text_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_extracted_text_whitespace.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_gemini_quality_guards.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_ocr_from_image.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_library.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_ocr_fallbacks.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_ocr_image_descriptions.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_pain_text.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_python_version_metadata.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_xml_xbrl_loader.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_youtube_gemini_minimal_check.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_youtube_llm_fallbacks.py +0 -0
- {polytext-0.2.4 → polytext-0.2.5}/tests/test_youtube_transcript.py +0 -0
|
@@ -3,11 +3,36 @@ import os
|
|
|
3
3
|
import logging
|
|
4
4
|
import dotenv
|
|
5
5
|
|
|
6
|
+
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError, LoaderError
|
|
7
|
+
|
|
6
8
|
logger = logging.getLogger(__name__)
|
|
7
9
|
|
|
8
10
|
# Load environment variables
|
|
9
11
|
dotenv.load_dotenv()
|
|
10
12
|
|
|
13
|
+
|
|
14
|
+
def _filter_expected_loader_errors(event, hint):
|
|
15
|
+
error = None
|
|
16
|
+
if hint:
|
|
17
|
+
exc_info = hint.get("exc_info")
|
|
18
|
+
if exc_info:
|
|
19
|
+
error = exc_info[1]
|
|
20
|
+
else:
|
|
21
|
+
error = hint.get("original_exception")
|
|
22
|
+
|
|
23
|
+
if isinstance(error, LoaderError) and error.code == "NO_TEXT_DETECTED":
|
|
24
|
+
return None
|
|
25
|
+
|
|
26
|
+
exception_values = (event or {}).get("exception", {}).get("values", [])
|
|
27
|
+
for exception_value in exception_values:
|
|
28
|
+
exception_type = exception_value.get("type") or ""
|
|
29
|
+
exception_message = exception_value.get("value")
|
|
30
|
+
if exception_type.endswith("LoaderError") and exception_message == "No text detected":
|
|
31
|
+
return None
|
|
32
|
+
|
|
33
|
+
return event
|
|
34
|
+
|
|
35
|
+
|
|
11
36
|
# Initialize Sentry if DSN is configured
|
|
12
37
|
sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
|
|
13
38
|
if sentry_dsn:
|
|
@@ -18,6 +43,7 @@ if sentry_dsn:
|
|
|
18
43
|
environment=os.getenv('ENV', 'prod'),
|
|
19
44
|
traces_sample_rate=1.0,
|
|
20
45
|
profiles_sample_rate=1.0,
|
|
46
|
+
before_send=_filter_expected_loader_errors,
|
|
21
47
|
)
|
|
22
48
|
logger.info("Sentry monitoring initialized")
|
|
23
49
|
except ImportError:
|
|
@@ -26,7 +52,6 @@ if sentry_dsn:
|
|
|
26
52
|
|
|
27
53
|
from .converter.pdf import convert_to_pdf, DocumentConverter
|
|
28
54
|
from .loader.document import DocumentLoader
|
|
29
|
-
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
|
|
30
55
|
from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
|
|
31
56
|
|
|
32
57
|
__all__ = [
|
|
@@ -38,4 +63,4 @@ __all__ = [
|
|
|
38
63
|
'ConversionError',
|
|
39
64
|
'get_customized_pdf_from_markdown',
|
|
40
65
|
'PDFGenerator'
|
|
41
|
-
]
|
|
66
|
+
]
|
|
@@ -127,11 +127,24 @@ class DocumentConverter:
|
|
|
127
127
|
]
|
|
128
128
|
|
|
129
129
|
try:
|
|
130
|
-
|
|
131
|
-
|
|
130
|
+
subprocess.run(
|
|
131
|
+
command,
|
|
132
|
+
stdout=subprocess.PIPE,
|
|
133
|
+
stderr=subprocess.PIPE,
|
|
134
|
+
text=True,
|
|
135
|
+
check=True,
|
|
136
|
+
)
|
|
132
137
|
logger.info(f"Conversion successful: '{output_file}'")
|
|
133
138
|
except subprocess.CalledProcessError as e:
|
|
139
|
+
output_parts = []
|
|
140
|
+
if e.stdout:
|
|
141
|
+
output_parts.append(f"stdout: {e.stdout.strip()}")
|
|
142
|
+
if e.stderr:
|
|
143
|
+
output_parts.append(f"stderr: {e.stderr.strip()}")
|
|
144
|
+
details = "\n".join(output_parts)
|
|
134
145
|
error_msg = f"Error during conversion: {e}"
|
|
146
|
+
if details:
|
|
147
|
+
error_msg = f"{error_msg}\n{details}"
|
|
135
148
|
logger.info(error_msg)
|
|
136
149
|
raise ConversionError(error_msg, e)
|
|
137
150
|
|
|
@@ -253,4 +266,4 @@ class DocumentConverter:
|
|
|
253
266
|
# except Exception as e:
|
|
254
267
|
# error_msg = f"Error during PDF conversion: {str(e)}"
|
|
255
268
|
# logger.error(error_msg)
|
|
256
|
-
# raise ConversionError(error_msg)
|
|
269
|
+
# raise ConversionError(error_msg)
|
|
@@ -25,7 +25,7 @@ from ..loader import (
|
|
|
25
25
|
XmlXbrlLoader,
|
|
26
26
|
NotebookLoader
|
|
27
27
|
)
|
|
28
|
-
from ..exceptions import EmptyDocument, LoaderTimeoutError, LoaderError
|
|
28
|
+
from ..exceptions import ConversionError, EmptyDocument, LoaderTimeoutError, LoaderError
|
|
29
29
|
from ..utils.utils import clean_extracted_text_whitespace, remove_markdown_strip
|
|
30
30
|
|
|
31
31
|
# External imports
|
|
@@ -46,6 +46,10 @@ LLM_OUTPUT_ERROR_CODES = {
|
|
|
46
46
|
997: "REPETITIVE_OUTPUT",
|
|
47
47
|
999: "MAX_TOKENS",
|
|
48
48
|
}
|
|
49
|
+
EMPTY_DOCUMENT_LOADER_ERROR_CODES = {
|
|
50
|
+
**LLM_OUTPUT_ERROR_CODES,
|
|
51
|
+
998: "NO_TEXT_DETECTED",
|
|
52
|
+
}
|
|
49
53
|
|
|
50
54
|
|
|
51
55
|
def _read_bool_env(name: str, default: bool = False) -> bool:
|
|
@@ -67,6 +71,20 @@ def _capture_exception_for_sentry(error: Exception) -> None:
|
|
|
67
71
|
return
|
|
68
72
|
|
|
69
73
|
|
|
74
|
+
def _raise_empty_document_loader_error(error: EmptyDocument) -> None:
|
|
75
|
+
loader_error_code = EMPTY_DOCUMENT_LOADER_ERROR_CODES.get(error.code, "NO_TEXT_DETECTED")
|
|
76
|
+
message = error.message
|
|
77
|
+
if loader_error_code == "NO_TEXT_DETECTED":
|
|
78
|
+
message = "No text detected"
|
|
79
|
+
else:
|
|
80
|
+
_capture_exception_for_sentry(error)
|
|
81
|
+
raise LoaderError(
|
|
82
|
+
message=message,
|
|
83
|
+
status=422,
|
|
84
|
+
code=loader_error_code,
|
|
85
|
+
) from error
|
|
86
|
+
|
|
87
|
+
|
|
70
88
|
class BaseLoader:
|
|
71
89
|
def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
|
|
72
90
|
ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
|
|
@@ -166,22 +184,23 @@ class BaseLoader:
|
|
|
166
184
|
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
167
185
|
except EmptyDocument as e:
|
|
168
186
|
if e.code in LLM_OUTPUT_ERROR_CODES:
|
|
169
|
-
|
|
170
|
-
raise LoaderError(
|
|
171
|
-
message=e.message,
|
|
172
|
-
status=422,
|
|
173
|
-
code=LLM_OUTPUT_ERROR_CODES[e.code],
|
|
174
|
-
) from e
|
|
175
|
-
logger.info(f"Empty document encountered: {e.message}")
|
|
187
|
+
_raise_empty_document_loader_error(e)
|
|
176
188
|
if self.fallback_ocr:
|
|
177
189
|
loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
|
|
178
190
|
llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
|
|
179
|
-
|
|
191
|
+
try:
|
|
192
|
+
response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
|
|
193
|
+
except EmptyDocument as fallback_error:
|
|
194
|
+
_raise_empty_document_loader_error(fallback_error)
|
|
180
195
|
else:
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
196
|
+
_raise_empty_document_loader_error(e)
|
|
197
|
+
except ConversionError as e:
|
|
198
|
+
_capture_exception_for_sentry(e)
|
|
199
|
+
raise LoaderError(
|
|
200
|
+
message=e.message,
|
|
201
|
+
status=422,
|
|
202
|
+
code="CONVERSION_ERROR",
|
|
203
|
+
) from e
|
|
185
204
|
except LoaderTimeoutError:
|
|
186
205
|
raise LoaderError(message="timeout gemini", status=504, code="TIMEOUT")
|
|
187
206
|
except (httpx.ReadTimeout,
|
|
@@ -81,6 +81,7 @@ tests/test_notebook_loader.py
|
|
|
81
81
|
tests/test_ocr_fallbacks.py
|
|
82
82
|
tests/test_ocr_image_descriptions.py
|
|
83
83
|
tests/test_pain_text.py
|
|
84
|
+
tests/test_pdf_conversion_error.py
|
|
84
85
|
tests/test_python_version_metadata.py
|
|
85
86
|
tests/test_split_audio_with_llm.py
|
|
86
87
|
tests/test_xml_xbrl_loader.py
|
|
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
|
|
|
51
51
|
|
|
52
52
|
setup(
|
|
53
53
|
name='polytext',
|
|
54
|
-
version='0.2.4',
|
|
54
|
+
version='0.2.5',
|
|
55
55
|
url='https://github.com/docsity/polytext',
|
|
56
56
|
# download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
|
|
57
57
|
license='MIT',
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
from unittest.mock import Mock, patch
|
|
3
|
+
|
|
4
|
+
from polytext.exceptions import ConversionError, EmptyDocument, LoaderError
|
|
5
|
+
from polytext.loader.base import BaseLoader
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class _FailingLoader:
|
|
9
|
+
def __init__(self, error):
|
|
10
|
+
self.error = error
|
|
11
|
+
|
|
12
|
+
def load(self, input_path):
|
|
13
|
+
raise self.error
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class _FakeBaseLoader(BaseLoader):
|
|
17
|
+
def __init__(self, error, **kwargs):
|
|
18
|
+
super().__init__(**kwargs)
|
|
19
|
+
self.error = error
|
|
20
|
+
|
|
21
|
+
def initiate_storage(self, input):
|
|
22
|
+
return {}
|
|
23
|
+
|
|
24
|
+
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
25
|
+
return _FailingLoader(self.error)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _FallbackFailingBaseLoader(BaseLoader):
|
|
29
|
+
def __init__(self, initial_error, fallback_error, **kwargs):
|
|
30
|
+
super().__init__(**kwargs)
|
|
31
|
+
self.initial_error = initial_error
|
|
32
|
+
self.fallback_error = fallback_error
|
|
33
|
+
|
|
34
|
+
def initiate_storage(self, input):
|
|
35
|
+
return {}
|
|
36
|
+
|
|
37
|
+
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
38
|
+
if is_document_fallback:
|
|
39
|
+
return _FailingLoader(self.fallback_error)
|
|
40
|
+
return _FailingLoader(self.initial_error)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class TestBaseLoaderErrorMapping(unittest.TestCase):
|
|
44
|
+
def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
|
|
45
|
+
cases = [
|
|
46
|
+
(995, "INVALID_ARGUMENT"),
|
|
47
|
+
(996, "RECITATION"),
|
|
48
|
+
(997, "REPETITIVE_OUTPUT"),
|
|
49
|
+
(999, "MAX_TOKENS"),
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
for empty_document_code, expected_loader_code in cases:
|
|
53
|
+
with self.subTest(empty_document_code=empty_document_code):
|
|
54
|
+
loader = _FakeBaseLoader(
|
|
55
|
+
EmptyDocument(
|
|
56
|
+
message=f"diagnostic failure {empty_document_code}",
|
|
57
|
+
code=empty_document_code,
|
|
58
|
+
)
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
sentry_sdk = Mock()
|
|
62
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
63
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
64
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
65
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
66
|
+
loader.get_text(["dummy.txt"])
|
|
67
|
+
|
|
68
|
+
error = error_context.exception
|
|
69
|
+
self.assertEqual(error.status, 422)
|
|
70
|
+
self.assertEqual(error.code, expected_loader_code)
|
|
71
|
+
self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
|
|
72
|
+
mock_info.assert_not_called()
|
|
73
|
+
mock_exception.assert_not_called()
|
|
74
|
+
sentry_sdk.capture_exception.assert_called_once()
|
|
75
|
+
self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
|
|
76
|
+
|
|
77
|
+
def test_empty_or_too_short_documents_are_raised_as_loader_errors(self):
|
|
78
|
+
loader = _FakeBaseLoader(
|
|
79
|
+
EmptyDocument(
|
|
80
|
+
message="Document text with less than 400 characters",
|
|
81
|
+
code=998,
|
|
82
|
+
)
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
sentry_sdk = Mock()
|
|
86
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
87
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
88
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
89
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
90
|
+
loader.get_text(["empty.txt"])
|
|
91
|
+
|
|
92
|
+
error = error_context.exception
|
|
93
|
+
self.assertEqual(error.status, 422)
|
|
94
|
+
self.assertEqual(error.code, "NO_TEXT_DETECTED")
|
|
95
|
+
self.assertEqual(error.message, "No text detected")
|
|
96
|
+
mock_info.assert_not_called()
|
|
97
|
+
mock_exception.assert_not_called()
|
|
98
|
+
sentry_sdk.capture_exception.assert_not_called()
|
|
99
|
+
|
|
100
|
+
def test_empty_document_after_fallback_ocr_is_raised_as_loader_error(self):
|
|
101
|
+
loader = _FallbackFailingBaseLoader(
|
|
102
|
+
initial_error=EmptyDocument(
|
|
103
|
+
message="No text detected",
|
|
104
|
+
code=998,
|
|
105
|
+
),
|
|
106
|
+
fallback_error=EmptyDocument(
|
|
107
|
+
message="No text extracted from OCR fallback",
|
|
108
|
+
),
|
|
109
|
+
fallback_ocr=True,
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
sentry_sdk = Mock()
|
|
113
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
114
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
115
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
116
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
117
|
+
loader.get_text(["empty.pdf"])
|
|
118
|
+
|
|
119
|
+
error = error_context.exception
|
|
120
|
+
self.assertEqual(error.status, 422)
|
|
121
|
+
self.assertEqual(error.code, "NO_TEXT_DETECTED")
|
|
122
|
+
self.assertEqual(error.message, "No text detected")
|
|
123
|
+
mock_info.assert_not_called()
|
|
124
|
+
mock_exception.assert_not_called()
|
|
125
|
+
sentry_sdk.capture_exception.assert_not_called()
|
|
126
|
+
|
|
127
|
+
def test_conversion_error_is_raised_as_loader_error(self):
|
|
128
|
+
conversion_error = ConversionError("LibreOffice failed")
|
|
129
|
+
loader = _FakeBaseLoader(conversion_error)
|
|
130
|
+
|
|
131
|
+
sentry_sdk = Mock()
|
|
132
|
+
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
133
|
+
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
134
|
+
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
135
|
+
with self.assertRaises(LoaderError) as error_context:
|
|
136
|
+
loader.get_text(["document.docx"])
|
|
137
|
+
|
|
138
|
+
error = error_context.exception
|
|
139
|
+
self.assertEqual(error.status, 422)
|
|
140
|
+
self.assertEqual(error.code, "CONVERSION_ERROR")
|
|
141
|
+
self.assertEqual(error.message, "LibreOffice failed")
|
|
142
|
+
mock_info.assert_not_called()
|
|
143
|
+
mock_exception.assert_not_called()
|
|
144
|
+
sentry_sdk.capture_exception.assert_called_once_with(conversion_error)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
if __name__ == "__main__":
|
|
148
|
+
unittest.main()
|
|
@@ -33,7 +33,7 @@ def main():
|
|
|
33
33
|
# Define document data
|
|
34
34
|
file_path = "gcs://opit-da-test-ml-ai-store-bucket/learning_resources/course_id=406/module_id=2658/id=31427/8434.mp4"
|
|
35
35
|
|
|
36
|
-
local_file_path = "/Users/marcodelgiudice/
|
|
36
|
+
local_file_path = "/Users/marcodelgiudice/Downloads/mq0264a5-5a073da227de0ee462bd6de8731d586a1dcc635f.pdf"
|
|
37
37
|
|
|
38
38
|
# Call get_text method
|
|
39
39
|
start = time.time()
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import tempfile
|
|
3
|
+
import unittest
|
|
4
|
+
from unittest.mock import patch
|
|
5
|
+
|
|
6
|
+
from polytext.converter.pdf import DocumentConverter
|
|
7
|
+
from polytext.exceptions import ConversionError
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestPdfConversionError(unittest.TestCase):
|
|
11
|
+
@patch.object(DocumentConverter, "check_libreoffice_installed", return_value=True)
|
|
12
|
+
@patch("polytext.converter.pdf.subprocess.run")
|
|
13
|
+
@patch("polytext.converter.pdf.subprocess.check_call")
|
|
14
|
+
def test_conversion_error_includes_libreoffice_output(
|
|
15
|
+
self,
|
|
16
|
+
mock_check_call,
|
|
17
|
+
mock_run,
|
|
18
|
+
_mock_check_libreoffice,
|
|
19
|
+
):
|
|
20
|
+
libreoffice_error = subprocess.CalledProcessError(
|
|
21
|
+
returncode=1,
|
|
22
|
+
cmd=["libreoffice", "--convert-to", "pdf"],
|
|
23
|
+
output="convert input.docx -> output.pdf",
|
|
24
|
+
stderr="Unspecified Application Error",
|
|
25
|
+
)
|
|
26
|
+
mock_check_call.side_effect = libreoffice_error
|
|
27
|
+
mock_run.side_effect = libreoffice_error
|
|
28
|
+
|
|
29
|
+
with tempfile.NamedTemporaryFile(suffix=".docx") as input_file:
|
|
30
|
+
with tempfile.NamedTemporaryFile(suffix=".pdf") as output_file:
|
|
31
|
+
with self.assertRaises(ConversionError) as error_context:
|
|
32
|
+
DocumentConverter().convert_to_pdf(
|
|
33
|
+
input_file=input_file.name,
|
|
34
|
+
original_file=input_file.name,
|
|
35
|
+
output_file=output_file.name,
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
self.assertIn("Unspecified Application Error", error_context.exception.message)
|
|
39
|
+
self.assertIn("convert input.docx -> output.pdf", error_context.exception.message)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
if __name__ == "__main__":
|
|
43
|
+
unittest.main()
|
|
@@ -1,81 +0,0 @@
|
|
|
1
|
-
import unittest
|
|
2
|
-
from unittest.mock import Mock, patch
|
|
3
|
-
|
|
4
|
-
from polytext.exceptions import EmptyDocument, LoaderError
|
|
5
|
-
from polytext.loader.base import BaseLoader
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class _FailingLoader:
|
|
9
|
-
def __init__(self, error):
|
|
10
|
-
self.error = error
|
|
11
|
-
|
|
12
|
-
def load(self, input_path):
|
|
13
|
-
raise self.error
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
class _FakeBaseLoader(BaseLoader):
|
|
17
|
-
def __init__(self, error, **kwargs):
|
|
18
|
-
super().__init__(**kwargs)
|
|
19
|
-
self.error = error
|
|
20
|
-
|
|
21
|
-
def initiate_storage(self, input):
|
|
22
|
-
return {}
|
|
23
|
-
|
|
24
|
-
def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
|
|
25
|
-
return _FailingLoader(self.error)
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
class TestBaseLoaderErrorMapping(unittest.TestCase):
|
|
29
|
-
def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
|
|
30
|
-
cases = [
|
|
31
|
-
(995, "INVALID_ARGUMENT"),
|
|
32
|
-
(996, "RECITATION"),
|
|
33
|
-
(997, "REPETITIVE_OUTPUT"),
|
|
34
|
-
(999, "MAX_TOKENS"),
|
|
35
|
-
]
|
|
36
|
-
|
|
37
|
-
for empty_document_code, expected_loader_code in cases:
|
|
38
|
-
with self.subTest(empty_document_code=empty_document_code):
|
|
39
|
-
loader = _FakeBaseLoader(
|
|
40
|
-
EmptyDocument(
|
|
41
|
-
message=f"diagnostic failure {empty_document_code}",
|
|
42
|
-
code=empty_document_code,
|
|
43
|
-
)
|
|
44
|
-
)
|
|
45
|
-
|
|
46
|
-
sentry_sdk = Mock()
|
|
47
|
-
with patch("polytext.loader.base.logger.info") as mock_info:
|
|
48
|
-
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
49
|
-
with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
|
|
50
|
-
with self.assertRaises(LoaderError) as error_context:
|
|
51
|
-
loader.get_text(["dummy.txt"])
|
|
52
|
-
|
|
53
|
-
error = error_context.exception
|
|
54
|
-
self.assertEqual(error.status, 422)
|
|
55
|
-
self.assertEqual(error.code, expected_loader_code)
|
|
56
|
-
self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
|
|
57
|
-
mock_info.assert_not_called()
|
|
58
|
-
mock_exception.assert_not_called()
|
|
59
|
-
sentry_sdk.capture_exception.assert_called_once()
|
|
60
|
-
self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
|
|
61
|
-
|
|
62
|
-
def test_empty_or_too_short_documents_still_return_empty_response(self):
|
|
63
|
-
loader = _FakeBaseLoader(
|
|
64
|
-
EmptyDocument(
|
|
65
|
-
message="Document text with less than 400 characters",
|
|
66
|
-
code=998,
|
|
67
|
-
)
|
|
68
|
-
)
|
|
69
|
-
|
|
70
|
-
with patch("polytext.loader.base.logger.exception") as mock_exception:
|
|
71
|
-
response = loader.get_text(["empty.txt"])
|
|
72
|
-
|
|
73
|
-
self.assertEqual(response["text"], "")
|
|
74
|
-
self.assertEqual(response["completion_tokens"], 0)
|
|
75
|
-
self.assertEqual(response["prompt_tokens"], 0)
|
|
76
|
-
self.assertEqual(response["output_list"][0]["input"], "empty.txt")
|
|
77
|
-
mock_exception.assert_not_called()
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
if __name__ == "__main__":
|
|
81
|
-
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|