polytext 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. {polytext-0.2.3 → polytext-0.2.5}/PKG-INFO +1 -1
  2. {polytext-0.2.3 → polytext-0.2.5}/polytext/__init__.py +27 -2
  3. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/audio_to_text.py +1 -1
  4. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/ocr_to_text.py +1 -1
  5. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/pdf.py +16 -3
  6. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/base.py +44 -18
  7. {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/PKG-INFO +1 -1
  8. {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/SOURCES.txt +1 -0
  9. {polytext-0.2.3 → polytext-0.2.5}/setup.py +1 -1
  10. polytext-0.2.5/tests/test_base_loader_error_mapping.py +148 -0
  11. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_audio_transcript_from_gcs.py +1 -1
  12. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_ocr_from_image.py +1 -1
  13. polytext-0.2.5/tests/test_pdf_conversion_error.py +43 -0
  14. {polytext-0.2.3 → polytext-0.2.5}/tests/test_youtube_transcript.py +2 -2
  15. polytext-0.2.3/tests/test_base_loader_error_mapping.py +0 -79
  16. {polytext-0.2.3 → polytext-0.2.5}/LICENSE +0 -0
  17. {polytext-0.2.3 → polytext-0.2.5}/README.md +0 -0
  18. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/__init__.py +0 -0
  19. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/base.py +0 -0
  20. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/document_ocr_to_text.py +0 -0
  21. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
  22. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/gemini_quality_guards.py +0 -0
  23. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/html_to_md.py +0 -0
  24. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/md_to_text.py +0 -0
  25. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
  26. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/text_to_md.py +0 -0
  27. {polytext-0.2.3 → polytext-0.2.5}/polytext/converter/video_to_audio.py +0 -0
  28. {polytext-0.2.3 → polytext-0.2.5}/polytext/exceptions/__init__.py +0 -0
  29. {polytext-0.2.3 → polytext-0.2.5}/polytext/exceptions/base.py +0 -0
  30. {polytext-0.2.3 → polytext-0.2.5}/polytext/generator/__init__.py +0 -0
  31. {polytext-0.2.3 → polytext-0.2.5}/polytext/generator/pdf.py +0 -0
  32. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/__init__.py +0 -0
  33. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/audio.py +0 -0
  34. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/document.py +0 -0
  35. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/document_ocr.py +0 -0
  36. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/downloader/__init__.py +0 -0
  37. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/downloader/downloader.py +0 -0
  38. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/html.py +0 -0
  39. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/markdown.py +0 -0
  40. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/notebook.py +0 -0
  41. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/ocr.py +0 -0
  42. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/plain_text.py +0 -0
  43. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/video.py +0 -0
  44. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/xml_xbrl.py +0 -0
  45. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/youtube.py +0 -0
  46. {polytext-0.2.3 → polytext-0.2.5}/polytext/loader/youtube_llm.py +0 -0
  47. {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/__init__.py +0 -0
  48. {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/audio_chunker.py +0 -0
  49. {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/text_merger.py +0 -0
  50. {polytext-0.2.3 → polytext-0.2.5}/polytext/processor/transcript_chunker.py +0 -0
  51. {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/__init__.py +0 -0
  52. {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/ocr.py +0 -0
  53. {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/text_merging.py +0 -0
  54. {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/text_to_md.py +0 -0
  55. {polytext-0.2.3 → polytext-0.2.5}/polytext/prompts/transcription.py +0 -0
  56. {polytext-0.2.3 → polytext-0.2.5}/polytext/utils/__init__.py +0 -0
  57. {polytext-0.2.3 → polytext-0.2.5}/polytext/utils/utils.py +0 -0
  58. {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/dependency_links.txt +0 -0
  59. {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/not-zip-safe +0 -0
  60. {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/requires.txt +0 -0
  61. {polytext-0.2.3 → polytext-0.2.5}/polytext.egg-info/top_level.txt +0 -0
  62. {polytext-0.2.3 → polytext-0.2.5}/pyproject.toml +0 -0
  63. {polytext-0.2.3 → polytext-0.2.5}/setup.cfg +0 -0
  64. {polytext-0.2.3 → polytext-0.2.5}/tests/test_audio_chunker.py +0 -0
  65. {polytext-0.2.3 → polytext-0.2.5}/tests/test_audio_comparison_helpers.py +0 -0
  66. {polytext-0.2.3 → polytext-0.2.5}/tests/test_audio_transcription_model_migration.py +0 -0
  67. {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_audio_models.py +0 -0
  68. {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_document_ocr_to_text_models.py +0 -0
  69. {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_ocr_to_text_models.py +0 -0
  70. {polytext-0.2.3 → polytext-0.2.5}/tests/test_compare_youtube_models.py +0 -0
  71. {polytext-0.2.3 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube.py +0 -0
  72. {polytext-0.2.3 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  73. {polytext-0.2.3 → polytext-0.2.5}/tests/test_extracted_text_whitespace.py +0 -0
  74. {polytext-0.2.3 → polytext-0.2.5}/tests/test_gemini_quality_guards.py +0 -0
  75. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  76. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_ocr.py +0 -0
  77. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_ocr_azure_oai.py +0 -0
  78. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_text.py +0 -0
  79. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_document_text_from_gcs.py +0 -0
  80. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_text_from_markdown.py +0 -0
  81. {polytext-0.2.3 → polytext-0.2.5}/tests/test_get_video_transcript_from_gcs.py +0 -0
  82. {polytext-0.2.3 → polytext-0.2.5}/tests/test_library.py +0 -0
  83. {polytext-0.2.3 → polytext-0.2.5}/tests/test_markdown_loader_gzip.py +0 -0
  84. {polytext-0.2.3 → polytext-0.2.5}/tests/test_markitdown_html.py +0 -0
  85. {polytext-0.2.3 → polytext-0.2.5}/tests/test_notebook_loader.py +0 -0
  86. {polytext-0.2.3 → polytext-0.2.5}/tests/test_ocr_fallbacks.py +0 -0
  87. {polytext-0.2.3 → polytext-0.2.5}/tests/test_ocr_image_descriptions.py +0 -0
  88. {polytext-0.2.3 → polytext-0.2.5}/tests/test_pain_text.py +0 -0
  89. {polytext-0.2.3 → polytext-0.2.5}/tests/test_python_version_metadata.py +0 -0
  90. {polytext-0.2.3 → polytext-0.2.5}/tests/test_split_audio_with_llm.py +0 -0
  91. {polytext-0.2.3 → polytext-0.2.5}/tests/test_xml_xbrl_loader.py +0 -0
  92. {polytext-0.2.3 → polytext-0.2.5}/tests/test_youtube_gemini_minimal_check.py +0 -0
  93. {polytext-0.2.3 → polytext-0.2.5}/tests/test_youtube_llm_fallbacks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -3,11 +3,36 @@ import os
3
3
  import logging
4
4
  import dotenv
5
5
 
6
+ from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError, LoaderError
7
+
6
8
  logger = logging.getLogger(__name__)
7
9
 
8
10
  # Load environment variables
9
11
  dotenv.load_dotenv()
10
12
 
13
+
14
+ def _filter_expected_loader_errors(event, hint):
15
+ error = None
16
+ if hint:
17
+ exc_info = hint.get("exc_info")
18
+ if exc_info:
19
+ error = exc_info[1]
20
+ else:
21
+ error = hint.get("original_exception")
22
+
23
+ if isinstance(error, LoaderError) and error.code == "NO_TEXT_DETECTED":
24
+ return None
25
+
26
+ exception_values = (event or {}).get("exception", {}).get("values", [])
27
+ for exception_value in exception_values:
28
+ exception_type = exception_value.get("type") or ""
29
+ exception_message = exception_value.get("value")
30
+ if exception_type.endswith("LoaderError") and exception_message == "No text detected":
31
+ return None
32
+
33
+ return event
34
+
35
+
11
36
  # Initialize Sentry if DSN is configured
12
37
  sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
13
38
  if sentry_dsn:
@@ -18,6 +43,7 @@ if sentry_dsn:
18
43
  environment=os.getenv('ENV', 'prod'),
19
44
  traces_sample_rate=1.0,
20
45
  profiles_sample_rate=1.0,
46
+ before_send=_filter_expected_loader_errors,
21
47
  )
22
48
  logger.info("Sentry monitoring initialized")
23
49
  except ImportError:
@@ -26,7 +52,6 @@ if sentry_dsn:
26
52
 
27
53
  from .converter.pdf import convert_to_pdf, DocumentConverter
28
54
  from .loader.document import DocumentLoader
29
- from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
30
55
  from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
31
56
 
32
57
  __all__ = [
@@ -38,4 +63,4 @@ __all__ = [
38
63
  'ConversionError',
39
64
  'get_customized_pdf_from_markdown',
40
65
  'PDFGenerator'
41
- ]
66
+ ]
@@ -46,7 +46,7 @@ AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHO
46
46
  AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
47
47
  AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
48
48
  AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
49
- AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
49
+ AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
50
50
  AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
51
51
  NO_HUMAN_SPEECH_MARKER = "no human speech detected"
52
52
 
@@ -33,7 +33,7 @@ OCR_TAIL_REPETITION_THRESHOLD = float(os.getenv("OCR_TAIL_REPETITION_THRESHOLD",
33
33
  OCR_FALLBACK_SOURCE_PATTERN = os.getenv("OCR_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
34
34
  OCR_FALLBACK_MODEL = os.getenv("OCR_FALLBACK_MODEL", "gemini-3-flash-preview")
35
35
  OCR_FALLBACK_TEMPERATURE = float(os.getenv("OCR_FALLBACK_TEMPERATURE", "1.0"))
36
- OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
36
+ OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
37
37
 
38
38
 
39
39
  def compress_and_convert_image(input_path: str, target_size=1):
@@ -127,11 +127,24 @@ class DocumentConverter:
127
127
  ]
128
128
 
129
129
  try:
130
- # Suppress Java runtime warnings by redirecting stderr
131
- subprocess.check_call(command, stderr=subprocess.DEVNULL)
130
+ subprocess.run(
131
+ command,
132
+ stdout=subprocess.PIPE,
133
+ stderr=subprocess.PIPE,
134
+ text=True,
135
+ check=True,
136
+ )
132
137
  logger.info(f"Conversion successful: '{output_file}'")
133
138
  except subprocess.CalledProcessError as e:
139
+ output_parts = []
140
+ if e.stdout:
141
+ output_parts.append(f"stdout: {e.stdout.strip()}")
142
+ if e.stderr:
143
+ output_parts.append(f"stderr: {e.stderr.strip()}")
144
+ details = "\n".join(output_parts)
134
145
  error_msg = f"Error during conversion: {e}"
146
+ if details:
147
+ error_msg = f"{error_msg}\n{details}"
135
148
  logger.info(error_msg)
136
149
  raise ConversionError(error_msg, e)
137
150
 
@@ -253,4 +266,4 @@ class DocumentConverter:
253
266
  # except Exception as e:
254
267
  # error_msg = f"Error during PDF conversion: {str(e)}"
255
268
  # logger.error(error_msg)
256
- # raise ConversionError(error_msg)
269
+ # raise ConversionError(error_msg)
@@ -25,7 +25,7 @@ from ..loader import (
25
25
  XmlXbrlLoader,
26
26
  NotebookLoader
27
27
  )
28
- from ..exceptions import EmptyDocument, LoaderTimeoutError, LoaderError
28
+ from ..exceptions import ConversionError, EmptyDocument, LoaderTimeoutError, LoaderError
29
29
  from ..utils.utils import clean_extracted_text_whitespace, remove_markdown_strip
30
30
 
31
31
  # External imports
@@ -46,6 +46,10 @@ LLM_OUTPUT_ERROR_CODES = {
46
46
  997: "REPETITIVE_OUTPUT",
47
47
  999: "MAX_TOKENS",
48
48
  }
49
+ EMPTY_DOCUMENT_LOADER_ERROR_CODES = {
50
+ **LLM_OUTPUT_ERROR_CODES,
51
+ 998: "NO_TEXT_DETECTED",
52
+ }
49
53
 
50
54
 
51
55
  def _read_bool_env(name: str, default: bool = False) -> bool:
@@ -55,6 +59,32 @@ def _read_bool_env(name: str, default: bool = False) -> bool:
55
59
  return value.strip().lower() in {"1", "true", "yes", "y", "on"}
56
60
 
57
61
 
62
+ def _capture_exception_for_sentry(error: Exception) -> None:
63
+ try:
64
+ import sentry_sdk
65
+ except ImportError:
66
+ return
67
+
68
+ try:
69
+ sentry_sdk.capture_exception(error)
70
+ except Exception:
71
+ return
72
+
73
+
74
+ def _raise_empty_document_loader_error(error: EmptyDocument) -> None:
75
+ loader_error_code = EMPTY_DOCUMENT_LOADER_ERROR_CODES.get(error.code, "NO_TEXT_DETECTED")
76
+ message = error.message
77
+ if loader_error_code == "NO_TEXT_DETECTED":
78
+ message = "No text detected"
79
+ else:
80
+ _capture_exception_for_sentry(error)
81
+ raise LoaderError(
82
+ message=message,
83
+ status=422,
84
+ code=loader_error_code,
85
+ ) from error
86
+
87
+
58
88
  class BaseLoader:
59
89
  def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
60
90
  ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
@@ -153,28 +183,24 @@ class BaseLoader:
153
183
  try:
154
184
  response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
155
185
  except EmptyDocument as e:
156
- logger.info(f"Empty document encountered: {e.message}")
157
186
  if e.code in LLM_OUTPUT_ERROR_CODES:
158
- logger.exception(
159
- "Raising LoaderError: status=422 code=%s original_empty_document_code=%s message=%s",
160
- LLM_OUTPUT_ERROR_CODES[e.code],
161
- e.code,
162
- e.message,
163
- )
164
- raise LoaderError(
165
- message=e.message,
166
- status=422,
167
- code=LLM_OUTPUT_ERROR_CODES[e.code],
168
- ) from e
187
+ _raise_empty_document_loader_error(e)
169
188
  if self.fallback_ocr:
170
189
  loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
171
190
  llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
172
- response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
191
+ try:
192
+ response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
193
+ except EmptyDocument as fallback_error:
194
+ _raise_empty_document_loader_error(fallback_error)
173
195
  else:
174
- response = {"text": "", "completion_tokens": 0, "prompt_tokens": 0, "output_list": [
175
- {"text": "", "completion_tokens": 0, "prompt_tokens": 0, "completion_model": "not provided",
176
- "completion_model_provider": "not provided", "text_chunks": "not provided", "type": "document",
177
- "input": first_file_url}]}
196
+ _raise_empty_document_loader_error(e)
197
+ except ConversionError as e:
198
+ _capture_exception_for_sentry(e)
199
+ raise LoaderError(
200
+ message=e.message,
201
+ status=422,
202
+ code="CONVERSION_ERROR",
203
+ ) from e
178
204
  except LoaderTimeoutError:
179
205
  raise LoaderError(message="timeout gemini", status=504, code="TIMEOUT")
180
206
  except (httpx.ReadTimeout,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -81,6 +81,7 @@ tests/test_notebook_loader.py
81
81
  tests/test_ocr_fallbacks.py
82
82
  tests/test_ocr_image_descriptions.py
83
83
  tests/test_pain_text.py
84
+ tests/test_pdf_conversion_error.py
84
85
  tests/test_python_version_metadata.py
85
86
  tests/test_split_audio_with_llm.py
86
87
  tests/test_xml_xbrl_loader.py
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
51
51
 
52
52
  setup(
53
53
  name='polytext',
54
- version='0.2.3',
54
+ version='0.2.5',
55
55
  url='https://github.com/docsity/polytext',
56
56
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
57
57
  license='MIT',
@@ -0,0 +1,148 @@
1
+ import unittest
2
+ from unittest.mock import Mock, patch
3
+
4
+ from polytext.exceptions import ConversionError, EmptyDocument, LoaderError
5
+ from polytext.loader.base import BaseLoader
6
+
7
+
8
+ class _FailingLoader:
9
+ def __init__(self, error):
10
+ self.error = error
11
+
12
+ def load(self, input_path):
13
+ raise self.error
14
+
15
+
16
+ class _FakeBaseLoader(BaseLoader):
17
+ def __init__(self, error, **kwargs):
18
+ super().__init__(**kwargs)
19
+ self.error = error
20
+
21
+ def initiate_storage(self, input):
22
+ return {}
23
+
24
+ def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
25
+ return _FailingLoader(self.error)
26
+
27
+
28
+ class _FallbackFailingBaseLoader(BaseLoader):
29
+ def __init__(self, initial_error, fallback_error, **kwargs):
30
+ super().__init__(**kwargs)
31
+ self.initial_error = initial_error
32
+ self.fallback_error = fallback_error
33
+
34
+ def initiate_storage(self, input):
35
+ return {}
36
+
37
+ def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
38
+ if is_document_fallback:
39
+ return _FailingLoader(self.fallback_error)
40
+ return _FailingLoader(self.initial_error)
41
+
42
+
43
+ class TestBaseLoaderErrorMapping(unittest.TestCase):
44
+ def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
45
+ cases = [
46
+ (995, "INVALID_ARGUMENT"),
47
+ (996, "RECITATION"),
48
+ (997, "REPETITIVE_OUTPUT"),
49
+ (999, "MAX_TOKENS"),
50
+ ]
51
+
52
+ for empty_document_code, expected_loader_code in cases:
53
+ with self.subTest(empty_document_code=empty_document_code):
54
+ loader = _FakeBaseLoader(
55
+ EmptyDocument(
56
+ message=f"diagnostic failure {empty_document_code}",
57
+ code=empty_document_code,
58
+ )
59
+ )
60
+
61
+ sentry_sdk = Mock()
62
+ with patch("polytext.loader.base.logger.info") as mock_info:
63
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
64
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
65
+ with self.assertRaises(LoaderError) as error_context:
66
+ loader.get_text(["dummy.txt"])
67
+
68
+ error = error_context.exception
69
+ self.assertEqual(error.status, 422)
70
+ self.assertEqual(error.code, expected_loader_code)
71
+ self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
72
+ mock_info.assert_not_called()
73
+ mock_exception.assert_not_called()
74
+ sentry_sdk.capture_exception.assert_called_once()
75
+ self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
76
+
77
+ def test_empty_or_too_short_documents_are_raised_as_loader_errors(self):
78
+ loader = _FakeBaseLoader(
79
+ EmptyDocument(
80
+ message="Document text with less than 400 characters",
81
+ code=998,
82
+ )
83
+ )
84
+
85
+ sentry_sdk = Mock()
86
+ with patch("polytext.loader.base.logger.info") as mock_info:
87
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
88
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
89
+ with self.assertRaises(LoaderError) as error_context:
90
+ loader.get_text(["empty.txt"])
91
+
92
+ error = error_context.exception
93
+ self.assertEqual(error.status, 422)
94
+ self.assertEqual(error.code, "NO_TEXT_DETECTED")
95
+ self.assertEqual(error.message, "No text detected")
96
+ mock_info.assert_not_called()
97
+ mock_exception.assert_not_called()
98
+ sentry_sdk.capture_exception.assert_not_called()
99
+
100
+ def test_empty_document_after_fallback_ocr_is_raised_as_loader_error(self):
101
+ loader = _FallbackFailingBaseLoader(
102
+ initial_error=EmptyDocument(
103
+ message="No text detected",
104
+ code=998,
105
+ ),
106
+ fallback_error=EmptyDocument(
107
+ message="No text extracted from OCR fallback",
108
+ ),
109
+ fallback_ocr=True,
110
+ )
111
+
112
+ sentry_sdk = Mock()
113
+ with patch("polytext.loader.base.logger.info") as mock_info:
114
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
115
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
116
+ with self.assertRaises(LoaderError) as error_context:
117
+ loader.get_text(["empty.pdf"])
118
+
119
+ error = error_context.exception
120
+ self.assertEqual(error.status, 422)
121
+ self.assertEqual(error.code, "NO_TEXT_DETECTED")
122
+ self.assertEqual(error.message, "No text detected")
123
+ mock_info.assert_not_called()
124
+ mock_exception.assert_not_called()
125
+ sentry_sdk.capture_exception.assert_not_called()
126
+
127
+ def test_conversion_error_is_raised_as_loader_error(self):
128
+ conversion_error = ConversionError("LibreOffice failed")
129
+ loader = _FakeBaseLoader(conversion_error)
130
+
131
+ sentry_sdk = Mock()
132
+ with patch("polytext.loader.base.logger.info") as mock_info:
133
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
134
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
135
+ with self.assertRaises(LoaderError) as error_context:
136
+ loader.get_text(["document.docx"])
137
+
138
+ error = error_context.exception
139
+ self.assertEqual(error.status, 422)
140
+ self.assertEqual(error.code, "CONVERSION_ERROR")
141
+ self.assertEqual(error.message, "LibreOffice failed")
142
+ mock_info.assert_not_called()
143
+ mock_exception.assert_not_called()
144
+ sentry_sdk.capture_exception.assert_called_once_with(conversion_error)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ unittest.main()
@@ -33,7 +33,7 @@ def main():
33
33
  # Define document data
34
34
  file_path = "gcs://opit-da-test-ml-ai-store-bucket/learning_resources/course_id=406/module_id=2658/id=31427/8434.mp4"
35
35
 
36
- local_file_path = "/Users/marcodelgiudice/Projects/polytext/audio_8_barbero_0_5_ore.m4a"
36
+ local_file_path = "/Users/marcodelgiudice/Downloads/mq0264a5-5a073da227de0ee462bd6de8731d586a1dcc635f.pdf"
37
37
 
38
38
  # Call get_text method
39
39
  start = time.time()
@@ -38,7 +38,7 @@ def main():
38
38
 
39
39
  # local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9695.jpg"
40
40
  # local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9701.jpg"
41
- local_file_path = "/Users/marcodelgiudice/Projects/polytext/chimicaformula.png"
41
+ local_file_path = "/Users/marcodelgiudice/Projects/polytext/gm1.png"
42
42
 
43
43
  try:
44
44
  start = time.time()
@@ -0,0 +1,43 @@
1
+ import subprocess
2
+ import tempfile
3
+ import unittest
4
+ from unittest.mock import patch
5
+
6
+ from polytext.converter.pdf import DocumentConverter
7
+ from polytext.exceptions import ConversionError
8
+
9
+
10
+ class TestPdfConversionError(unittest.TestCase):
11
+ @patch.object(DocumentConverter, "check_libreoffice_installed", return_value=True)
12
+ @patch("polytext.converter.pdf.subprocess.run")
13
+ @patch("polytext.converter.pdf.subprocess.check_call")
14
+ def test_conversion_error_includes_libreoffice_output(
15
+ self,
16
+ mock_check_call,
17
+ mock_run,
18
+ _mock_check_libreoffice,
19
+ ):
20
+ libreoffice_error = subprocess.CalledProcessError(
21
+ returncode=1,
22
+ cmd=["libreoffice", "--convert-to", "pdf"],
23
+ output="convert input.docx -> output.pdf",
24
+ stderr="Unspecified Application Error",
25
+ )
26
+ mock_check_call.side_effect = libreoffice_error
27
+ mock_run.side_effect = libreoffice_error
28
+
29
+ with tempfile.NamedTemporaryFile(suffix=".docx") as input_file:
30
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as output_file:
31
+ with self.assertRaises(ConversionError) as error_context:
32
+ DocumentConverter().convert_to_pdf(
33
+ input_file=input_file.name,
34
+ original_file=input_file.name,
35
+ output_file=output_file.name,
36
+ )
37
+
38
+ self.assertIn("Unspecified Application Error", error_context.exception.message)
39
+ self.assertIn("convert input.docx -> output.pdf", error_context.exception.message)
40
+
41
+
42
+ if __name__ == "__main__":
43
+ unittest.main()
@@ -32,9 +32,9 @@ url = 'https://www.youtube.com/watch?v=L4as3tks4Js' # basement alberto angela
32
32
 
33
33
  # url = 'https://www.youtube.com/watch?v=UabBYexBD4k' # INM RAG 11 minuti, completato in 26 secondi con successo con gemini-3.1-flash-lite
34
34
 
35
- url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
35
+ #url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
36
36
 
37
- # url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
37
+ url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
38
38
 
39
39
  # url = 'https://www.youtube.com/watch?v=CM2CkNU9xR0' # google antigravity 27 minuti, completato in 39 secondi con successo con gemini-3.1-flash-lite (146k token in input, 6k token in output)
40
40
 
@@ -1,79 +0,0 @@
1
- import unittest
2
- from unittest.mock import patch
3
-
4
- from polytext.exceptions import EmptyDocument, LoaderError
5
- from polytext.loader.base import BaseLoader
6
-
7
-
8
- class _FailingLoader:
9
- def __init__(self, error):
10
- self.error = error
11
-
12
- def load(self, input_path):
13
- raise self.error
14
-
15
-
16
- class _FakeBaseLoader(BaseLoader):
17
- def __init__(self, error, **kwargs):
18
- super().__init__(**kwargs)
19
- self.error = error
20
-
21
- def initiate_storage(self, input):
22
- return {}
23
-
24
- def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
25
- return _FailingLoader(self.error)
26
-
27
-
28
- class TestBaseLoaderErrorMapping(unittest.TestCase):
29
- def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
30
- cases = [
31
- (995, "INVALID_ARGUMENT"),
32
- (996, "RECITATION"),
33
- (997, "REPETITIVE_OUTPUT"),
34
- (999, "MAX_TOKENS"),
35
- ]
36
-
37
- for empty_document_code, expected_loader_code in cases:
38
- with self.subTest(empty_document_code=empty_document_code):
39
- loader = _FakeBaseLoader(
40
- EmptyDocument(
41
- message=f"diagnostic failure {empty_document_code}",
42
- code=empty_document_code,
43
- )
44
- )
45
-
46
- with patch("polytext.loader.base.logger.exception") as mock_exception:
47
- with self.assertRaises(LoaderError) as error_context:
48
- loader.get_text(["dummy.txt"])
49
-
50
- error = error_context.exception
51
- self.assertEqual(error.status, 422)
52
- self.assertEqual(error.code, expected_loader_code)
53
- self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
54
- mock_exception.assert_called_once()
55
- self.assertIn("Raising LoaderError", mock_exception.call_args.args[0])
56
- self.assertEqual(mock_exception.call_args.args[1], expected_loader_code)
57
- self.assertEqual(mock_exception.call_args.args[2], empty_document_code)
58
- self.assertEqual(mock_exception.call_args.args[3], f"diagnostic failure {empty_document_code}")
59
-
60
- def test_empty_or_too_short_documents_still_return_empty_response(self):
61
- loader = _FakeBaseLoader(
62
- EmptyDocument(
63
- message="Document text with less than 400 characters",
64
- code=998,
65
- )
66
- )
67
-
68
- with patch("polytext.loader.base.logger.exception") as mock_exception:
69
- response = loader.get_text(["empty.txt"])
70
-
71
- self.assertEqual(response["text"], "")
72
- self.assertEqual(response["completion_tokens"], 0)
73
- self.assertEqual(response["prompt_tokens"], 0)
74
- self.assertEqual(response["output_list"][0]["input"], "empty.txt")
75
- mock_exception.assert_not_called()
76
-
77
-
78
- if __name__ == "__main__":
79
- unittest.main()
File without changes
File without changes
File without changes
File without changes
File without changes