polytext 0.2.4__tar.gz → 0.2.5__tar.gz

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (93)
  1. {polytext-0.2.4 → polytext-0.2.5}/PKG-INFO +1 -1
  2. {polytext-0.2.4 → polytext-0.2.5}/polytext/__init__.py +27 -2
  3. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/pdf.py +16 -3
  4. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/base.py +32 -13
  5. {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/PKG-INFO +1 -1
  6. {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/SOURCES.txt +1 -0
  7. {polytext-0.2.4 → polytext-0.2.5}/setup.py +1 -1
  8. polytext-0.2.5/tests/test_base_loader_error_mapping.py +148 -0
  9. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_audio_transcript_from_gcs.py +1 -1
  10. polytext-0.2.5/tests/test_pdf_conversion_error.py +43 -0
  11. polytext-0.2.4/tests/test_base_loader_error_mapping.py +0 -81
  12. {polytext-0.2.4 → polytext-0.2.5}/LICENSE +0 -0
  13. {polytext-0.2.4 → polytext-0.2.5}/README.md +0 -0
  14. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/__init__.py +0 -0
  15. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/audio_to_text.py +0 -0
  16. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/base.py +0 -0
  17. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/document_ocr_to_text.py +0 -0
  18. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
  19. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/gemini_quality_guards.py +0 -0
  20. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/html_to_md.py +0 -0
  21. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/md_to_text.py +0 -0
  22. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/ocr_to_text.py +0 -0
  23. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
  24. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/text_to_md.py +0 -0
  25. {polytext-0.2.4 → polytext-0.2.5}/polytext/converter/video_to_audio.py +0 -0
  26. {polytext-0.2.4 → polytext-0.2.5}/polytext/exceptions/__init__.py +0 -0
  27. {polytext-0.2.4 → polytext-0.2.5}/polytext/exceptions/base.py +0 -0
  28. {polytext-0.2.4 → polytext-0.2.5}/polytext/generator/__init__.py +0 -0
  29. {polytext-0.2.4 → polytext-0.2.5}/polytext/generator/pdf.py +0 -0
  30. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/__init__.py +0 -0
  31. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/audio.py +0 -0
  32. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/document.py +0 -0
  33. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/document_ocr.py +0 -0
  34. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/downloader/__init__.py +0 -0
  35. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/downloader/downloader.py +0 -0
  36. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/html.py +0 -0
  37. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/markdown.py +0 -0
  38. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/notebook.py +0 -0
  39. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/ocr.py +0 -0
  40. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/plain_text.py +0 -0
  41. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/video.py +0 -0
  42. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/xml_xbrl.py +0 -0
  43. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/youtube.py +0 -0
  44. {polytext-0.2.4 → polytext-0.2.5}/polytext/loader/youtube_llm.py +0 -0
  45. {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/__init__.py +0 -0
  46. {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/audio_chunker.py +0 -0
  47. {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/text_merger.py +0 -0
  48. {polytext-0.2.4 → polytext-0.2.5}/polytext/processor/transcript_chunker.py +0 -0
  49. {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/__init__.py +0 -0
  50. {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/ocr.py +0 -0
  51. {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/text_merging.py +0 -0
  52. {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/text_to_md.py +0 -0
  53. {polytext-0.2.4 → polytext-0.2.5}/polytext/prompts/transcription.py +0 -0
  54. {polytext-0.2.4 → polytext-0.2.5}/polytext/utils/__init__.py +0 -0
  55. {polytext-0.2.4 → polytext-0.2.5}/polytext/utils/utils.py +0 -0
  56. {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/dependency_links.txt +0 -0
  57. {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/not-zip-safe +0 -0
  58. {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/requires.txt +0 -0
  59. {polytext-0.2.4 → polytext-0.2.5}/polytext.egg-info/top_level.txt +0 -0
  60. {polytext-0.2.4 → polytext-0.2.5}/pyproject.toml +0 -0
  61. {polytext-0.2.4 → polytext-0.2.5}/setup.cfg +0 -0
  62. {polytext-0.2.4 → polytext-0.2.5}/tests/test_audio_chunker.py +0 -0
  63. {polytext-0.2.4 → polytext-0.2.5}/tests/test_audio_comparison_helpers.py +0 -0
  64. {polytext-0.2.4 → polytext-0.2.5}/tests/test_audio_transcription_model_migration.py +0 -0
  65. {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_audio_models.py +0 -0
  66. {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_document_ocr_to_text_models.py +0 -0
  67. {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_ocr_to_text_models.py +0 -0
  68. {polytext-0.2.4 → polytext-0.2.5}/tests/test_compare_youtube_models.py +0 -0
  69. {polytext-0.2.4 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube.py +0 -0
  70. {polytext-0.2.4 → polytext-0.2.5}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  71. {polytext-0.2.4 → polytext-0.2.5}/tests/test_extracted_text_whitespace.py +0 -0
  72. {polytext-0.2.4 → polytext-0.2.5}/tests/test_gemini_quality_guards.py +0 -0
  73. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  74. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_ocr.py +0 -0
  75. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_ocr_azure_oai.py +0 -0
  76. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_text.py +0 -0
  77. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_document_text_from_gcs.py +0 -0
  78. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_ocr_from_image.py +0 -0
  79. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_text_from_markdown.py +0 -0
  80. {polytext-0.2.4 → polytext-0.2.5}/tests/test_get_video_transcript_from_gcs.py +0 -0
  81. {polytext-0.2.4 → polytext-0.2.5}/tests/test_library.py +0 -0
  82. {polytext-0.2.4 → polytext-0.2.5}/tests/test_markdown_loader_gzip.py +0 -0
  83. {polytext-0.2.4 → polytext-0.2.5}/tests/test_markitdown_html.py +0 -0
  84. {polytext-0.2.4 → polytext-0.2.5}/tests/test_notebook_loader.py +0 -0
  85. {polytext-0.2.4 → polytext-0.2.5}/tests/test_ocr_fallbacks.py +0 -0
  86. {polytext-0.2.4 → polytext-0.2.5}/tests/test_ocr_image_descriptions.py +0 -0
  87. {polytext-0.2.4 → polytext-0.2.5}/tests/test_pain_text.py +0 -0
  88. {polytext-0.2.4 → polytext-0.2.5}/tests/test_python_version_metadata.py +0 -0
  89. {polytext-0.2.4 → polytext-0.2.5}/tests/test_split_audio_with_llm.py +0 -0
  90. {polytext-0.2.4 → polytext-0.2.5}/tests/test_xml_xbrl_loader.py +0 -0
  91. {polytext-0.2.4 → polytext-0.2.5}/tests/test_youtube_gemini_minimal_check.py +0 -0
  92. {polytext-0.2.4 → polytext-0.2.5}/tests/test_youtube_llm_fallbacks.py +0 -0
  93. {polytext-0.2.4 → polytext-0.2.5}/tests/test_youtube_transcript.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -3,11 +3,36 @@ import os
3
3
  import logging
4
4
  import dotenv
5
5
 
6
+ from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError, LoaderError
7
+
6
8
  logger = logging.getLogger(__name__)
7
9
 
8
10
  # Load environment variables
9
11
  dotenv.load_dotenv()
10
12
 
13
+
14
+ def _filter_expected_loader_errors(event, hint):
15
+ error = None
16
+ if hint:
17
+ exc_info = hint.get("exc_info")
18
+ if exc_info:
19
+ error = exc_info[1]
20
+ else:
21
+ error = hint.get("original_exception")
22
+
23
+ if isinstance(error, LoaderError) and error.code == "NO_TEXT_DETECTED":
24
+ return None
25
+
26
+ exception_values = (event or {}).get("exception", {}).get("values", [])
27
+ for exception_value in exception_values:
28
+ exception_type = exception_value.get("type") or ""
29
+ exception_message = exception_value.get("value")
30
+ if exception_type.endswith("LoaderError") and exception_message == "No text detected":
31
+ return None
32
+
33
+ return event
34
+
35
+
11
36
  # Initialize Sentry if DSN is configured
12
37
  sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
13
38
  if sentry_dsn:
@@ -18,6 +43,7 @@ if sentry_dsn:
18
43
  environment=os.getenv('ENV', 'prod'),
19
44
  traces_sample_rate=1.0,
20
45
  profiles_sample_rate=1.0,
46
+ before_send=_filter_expected_loader_errors,
21
47
  )
22
48
  logger.info("Sentry monitoring initialized")
23
49
  except ImportError:
@@ -26,7 +52,6 @@ if sentry_dsn:
26
52
 
27
53
  from .converter.pdf import convert_to_pdf, DocumentConverter
28
54
  from .loader.document import DocumentLoader
29
- from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
30
55
  from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
31
56
 
32
57
  __all__ = [
@@ -38,4 +63,4 @@ __all__ = [
38
63
  'ConversionError',
39
64
  'get_customized_pdf_from_markdown',
40
65
  'PDFGenerator'
41
- ]
66
+ ]
@@ -127,11 +127,24 @@ class DocumentConverter:
127
127
  ]
128
128
 
129
129
  try:
130
- # Suppress Java runtime warnings by redirecting stderr
131
- subprocess.check_call(command, stderr=subprocess.DEVNULL)
130
+ subprocess.run(
131
+ command,
132
+ stdout=subprocess.PIPE,
133
+ stderr=subprocess.PIPE,
134
+ text=True,
135
+ check=True,
136
+ )
132
137
  logger.info(f"Conversion successful: '{output_file}'")
133
138
  except subprocess.CalledProcessError as e:
139
+ output_parts = []
140
+ if e.stdout:
141
+ output_parts.append(f"stdout: {e.stdout.strip()}")
142
+ if e.stderr:
143
+ output_parts.append(f"stderr: {e.stderr.strip()}")
144
+ details = "\n".join(output_parts)
134
145
  error_msg = f"Error during conversion: {e}"
146
+ if details:
147
+ error_msg = f"{error_msg}\n{details}"
135
148
  logger.info(error_msg)
136
149
  raise ConversionError(error_msg, e)
137
150
 
@@ -253,4 +266,4 @@ class DocumentConverter:
253
266
  # except Exception as e:
254
267
  # error_msg = f"Error during PDF conversion: {str(e)}"
255
268
  # logger.error(error_msg)
256
- # raise ConversionError(error_msg)
269
+ # raise ConversionError(error_msg)
@@ -25,7 +25,7 @@ from ..loader import (
25
25
  XmlXbrlLoader,
26
26
  NotebookLoader
27
27
  )
28
- from ..exceptions import EmptyDocument, LoaderTimeoutError, LoaderError
28
+ from ..exceptions import ConversionError, EmptyDocument, LoaderTimeoutError, LoaderError
29
29
  from ..utils.utils import clean_extracted_text_whitespace, remove_markdown_strip
30
30
 
31
31
  # External imports
@@ -46,6 +46,10 @@ LLM_OUTPUT_ERROR_CODES = {
46
46
  997: "REPETITIVE_OUTPUT",
47
47
  999: "MAX_TOKENS",
48
48
  }
49
+ EMPTY_DOCUMENT_LOADER_ERROR_CODES = {
50
+ **LLM_OUTPUT_ERROR_CODES,
51
+ 998: "NO_TEXT_DETECTED",
52
+ }
49
53
 
50
54
 
51
55
  def _read_bool_env(name: str, default: bool = False) -> bool:
@@ -67,6 +71,20 @@ def _capture_exception_for_sentry(error: Exception) -> None:
67
71
  return
68
72
 
69
73
 
74
+ def _raise_empty_document_loader_error(error: EmptyDocument) -> None:
75
+ loader_error_code = EMPTY_DOCUMENT_LOADER_ERROR_CODES.get(error.code, "NO_TEXT_DETECTED")
76
+ message = error.message
77
+ if loader_error_code == "NO_TEXT_DETECTED":
78
+ message = "No text detected"
79
+ else:
80
+ _capture_exception_for_sentry(error)
81
+ raise LoaderError(
82
+ message=message,
83
+ status=422,
84
+ code=loader_error_code,
85
+ ) from error
86
+
87
+
70
88
  class BaseLoader:
71
89
  def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
72
90
  ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
@@ -166,22 +184,23 @@ class BaseLoader:
166
184
  response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
167
185
  except EmptyDocument as e:
168
186
  if e.code in LLM_OUTPUT_ERROR_CODES:
169
- _capture_exception_for_sentry(e)
170
- raise LoaderError(
171
- message=e.message,
172
- status=422,
173
- code=LLM_OUTPUT_ERROR_CODES[e.code],
174
- ) from e
175
- logger.info(f"Empty document encountered: {e.message}")
187
+ _raise_empty_document_loader_error(e)
176
188
  if self.fallback_ocr:
177
189
  loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
178
190
  llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
179
- response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
191
+ try:
192
+ response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
193
+ except EmptyDocument as fallback_error:
194
+ _raise_empty_document_loader_error(fallback_error)
180
195
  else:
181
- response = {"text": "", "completion_tokens": 0, "prompt_tokens": 0, "output_list": [
182
- {"text": "", "completion_tokens": 0, "prompt_tokens": 0, "completion_model": "not provided",
183
- "completion_model_provider": "not provided", "text_chunks": "not provided", "type": "document",
184
- "input": first_file_url}]}
196
+ _raise_empty_document_loader_error(e)
197
+ except ConversionError as e:
198
+ _capture_exception_for_sentry(e)
199
+ raise LoaderError(
200
+ message=e.message,
201
+ status=422,
202
+ code="CONVERSION_ERROR",
203
+ ) from e
185
204
  except LoaderTimeoutError:
186
205
  raise LoaderError(message="timeout gemini", status=504, code="TIMEOUT")
187
206
  except (httpx.ReadTimeout,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -81,6 +81,7 @@ tests/test_notebook_loader.py
81
81
  tests/test_ocr_fallbacks.py
82
82
  tests/test_ocr_image_descriptions.py
83
83
  tests/test_pain_text.py
84
+ tests/test_pdf_conversion_error.py
84
85
  tests/test_python_version_metadata.py
85
86
  tests/test_split_audio_with_llm.py
86
87
  tests/test_xml_xbrl_loader.py
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
51
51
 
52
52
  setup(
53
53
  name='polytext',
54
- version='0.2.4',
54
+ version='0.2.5',
55
55
  url='https://github.com/docsity/polytext',
56
56
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
57
57
  license='MIT',
@@ -0,0 +1,148 @@
1
+ import unittest
2
+ from unittest.mock import Mock, patch
3
+
4
+ from polytext.exceptions import ConversionError, EmptyDocument, LoaderError
5
+ from polytext.loader.base import BaseLoader
6
+
7
+
8
+ class _FailingLoader:
9
+ def __init__(self, error):
10
+ self.error = error
11
+
12
+ def load(self, input_path):
13
+ raise self.error
14
+
15
+
16
+ class _FakeBaseLoader(BaseLoader):
17
+ def __init__(self, error, **kwargs):
18
+ super().__init__(**kwargs)
19
+ self.error = error
20
+
21
+ def initiate_storage(self, input):
22
+ return {}
23
+
24
+ def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
25
+ return _FailingLoader(self.error)
26
+
27
+
28
+ class _FallbackFailingBaseLoader(BaseLoader):
29
+ def __init__(self, initial_error, fallback_error, **kwargs):
30
+ super().__init__(**kwargs)
31
+ self.initial_error = initial_error
32
+ self.fallback_error = fallback_error
33
+
34
+ def initiate_storage(self, input):
35
+ return {}
36
+
37
+ def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
38
+ if is_document_fallback:
39
+ return _FailingLoader(self.fallback_error)
40
+ return _FailingLoader(self.initial_error)
41
+
42
+
43
+ class TestBaseLoaderErrorMapping(unittest.TestCase):
44
+ def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
45
+ cases = [
46
+ (995, "INVALID_ARGUMENT"),
47
+ (996, "RECITATION"),
48
+ (997, "REPETITIVE_OUTPUT"),
49
+ (999, "MAX_TOKENS"),
50
+ ]
51
+
52
+ for empty_document_code, expected_loader_code in cases:
53
+ with self.subTest(empty_document_code=empty_document_code):
54
+ loader = _FakeBaseLoader(
55
+ EmptyDocument(
56
+ message=f"diagnostic failure {empty_document_code}",
57
+ code=empty_document_code,
58
+ )
59
+ )
60
+
61
+ sentry_sdk = Mock()
62
+ with patch("polytext.loader.base.logger.info") as mock_info:
63
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
64
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
65
+ with self.assertRaises(LoaderError) as error_context:
66
+ loader.get_text(["dummy.txt"])
67
+
68
+ error = error_context.exception
69
+ self.assertEqual(error.status, 422)
70
+ self.assertEqual(error.code, expected_loader_code)
71
+ self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
72
+ mock_info.assert_not_called()
73
+ mock_exception.assert_not_called()
74
+ sentry_sdk.capture_exception.assert_called_once()
75
+ self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
76
+
77
+ def test_empty_or_too_short_documents_are_raised_as_loader_errors(self):
78
+ loader = _FakeBaseLoader(
79
+ EmptyDocument(
80
+ message="Document text with less than 400 characters",
81
+ code=998,
82
+ )
83
+ )
84
+
85
+ sentry_sdk = Mock()
86
+ with patch("polytext.loader.base.logger.info") as mock_info:
87
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
88
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
89
+ with self.assertRaises(LoaderError) as error_context:
90
+ loader.get_text(["empty.txt"])
91
+
92
+ error = error_context.exception
93
+ self.assertEqual(error.status, 422)
94
+ self.assertEqual(error.code, "NO_TEXT_DETECTED")
95
+ self.assertEqual(error.message, "No text detected")
96
+ mock_info.assert_not_called()
97
+ mock_exception.assert_not_called()
98
+ sentry_sdk.capture_exception.assert_not_called()
99
+
100
+ def test_empty_document_after_fallback_ocr_is_raised_as_loader_error(self):
101
+ loader = _FallbackFailingBaseLoader(
102
+ initial_error=EmptyDocument(
103
+ message="No text detected",
104
+ code=998,
105
+ ),
106
+ fallback_error=EmptyDocument(
107
+ message="No text extracted from OCR fallback",
108
+ ),
109
+ fallback_ocr=True,
110
+ )
111
+
112
+ sentry_sdk = Mock()
113
+ with patch("polytext.loader.base.logger.info") as mock_info:
114
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
115
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
116
+ with self.assertRaises(LoaderError) as error_context:
117
+ loader.get_text(["empty.pdf"])
118
+
119
+ error = error_context.exception
120
+ self.assertEqual(error.status, 422)
121
+ self.assertEqual(error.code, "NO_TEXT_DETECTED")
122
+ self.assertEqual(error.message, "No text detected")
123
+ mock_info.assert_not_called()
124
+ mock_exception.assert_not_called()
125
+ sentry_sdk.capture_exception.assert_not_called()
126
+
127
+ def test_conversion_error_is_raised_as_loader_error(self):
128
+ conversion_error = ConversionError("LibreOffice failed")
129
+ loader = _FakeBaseLoader(conversion_error)
130
+
131
+ sentry_sdk = Mock()
132
+ with patch("polytext.loader.base.logger.info") as mock_info:
133
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
134
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
135
+ with self.assertRaises(LoaderError) as error_context:
136
+ loader.get_text(["document.docx"])
137
+
138
+ error = error_context.exception
139
+ self.assertEqual(error.status, 422)
140
+ self.assertEqual(error.code, "CONVERSION_ERROR")
141
+ self.assertEqual(error.message, "LibreOffice failed")
142
+ mock_info.assert_not_called()
143
+ mock_exception.assert_not_called()
144
+ sentry_sdk.capture_exception.assert_called_once_with(conversion_error)
145
+
146
+
147
+ if __name__ == "__main__":
148
+ unittest.main()
@@ -33,7 +33,7 @@ def main():
33
33
  # Define document data
34
34
  file_path = "gcs://opit-da-test-ml-ai-store-bucket/learning_resources/course_id=406/module_id=2658/id=31427/8434.mp4"
35
35
 
36
- local_file_path = "/Users/marcodelgiudice/Projects/polytext/audio_8_barbero_0_5_ore.m4a"
36
+ local_file_path = "/Users/marcodelgiudice/Downloads/mq0264a5-5a073da227de0ee462bd6de8731d586a1dcc635f.pdf"
37
37
 
38
38
  # Call get_text method
39
39
  start = time.time()
@@ -0,0 +1,43 @@
1
+ import subprocess
2
+ import tempfile
3
+ import unittest
4
+ from unittest.mock import patch
5
+
6
+ from polytext.converter.pdf import DocumentConverter
7
+ from polytext.exceptions import ConversionError
8
+
9
+
10
+ class TestPdfConversionError(unittest.TestCase):
11
+ @patch.object(DocumentConverter, "check_libreoffice_installed", return_value=True)
12
+ @patch("polytext.converter.pdf.subprocess.run")
13
+ @patch("polytext.converter.pdf.subprocess.check_call")
14
+ def test_conversion_error_includes_libreoffice_output(
15
+ self,
16
+ mock_check_call,
17
+ mock_run,
18
+ _mock_check_libreoffice,
19
+ ):
20
+ libreoffice_error = subprocess.CalledProcessError(
21
+ returncode=1,
22
+ cmd=["libreoffice", "--convert-to", "pdf"],
23
+ output="convert input.docx -> output.pdf",
24
+ stderr="Unspecified Application Error",
25
+ )
26
+ mock_check_call.side_effect = libreoffice_error
27
+ mock_run.side_effect = libreoffice_error
28
+
29
+ with tempfile.NamedTemporaryFile(suffix=".docx") as input_file:
30
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as output_file:
31
+ with self.assertRaises(ConversionError) as error_context:
32
+ DocumentConverter().convert_to_pdf(
33
+ input_file=input_file.name,
34
+ original_file=input_file.name,
35
+ output_file=output_file.name,
36
+ )
37
+
38
+ self.assertIn("Unspecified Application Error", error_context.exception.message)
39
+ self.assertIn("convert input.docx -> output.pdf", error_context.exception.message)
40
+
41
+
42
+ if __name__ == "__main__":
43
+ unittest.main()
@@ -1,81 +0,0 @@
1
- import unittest
2
- from unittest.mock import Mock, patch
3
-
4
- from polytext.exceptions import EmptyDocument, LoaderError
5
- from polytext.loader.base import BaseLoader
6
-
7
-
8
- class _FailingLoader:
9
- def __init__(self, error):
10
- self.error = error
11
-
12
- def load(self, input_path):
13
- raise self.error
14
-
15
-
16
- class _FakeBaseLoader(BaseLoader):
17
- def __init__(self, error, **kwargs):
18
- super().__init__(**kwargs)
19
- self.error = error
20
-
21
- def initiate_storage(self, input):
22
- return {}
23
-
24
- def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
25
- return _FailingLoader(self.error)
26
-
27
-
28
- class TestBaseLoaderErrorMapping(unittest.TestCase):
29
- def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
30
- cases = [
31
- (995, "INVALID_ARGUMENT"),
32
- (996, "RECITATION"),
33
- (997, "REPETITIVE_OUTPUT"),
34
- (999, "MAX_TOKENS"),
35
- ]
36
-
37
- for empty_document_code, expected_loader_code in cases:
38
- with self.subTest(empty_document_code=empty_document_code):
39
- loader = _FakeBaseLoader(
40
- EmptyDocument(
41
- message=f"diagnostic failure {empty_document_code}",
42
- code=empty_document_code,
43
- )
44
- )
45
-
46
- sentry_sdk = Mock()
47
- with patch("polytext.loader.base.logger.info") as mock_info:
48
- with patch("polytext.loader.base.logger.exception") as mock_exception:
49
- with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
50
- with self.assertRaises(LoaderError) as error_context:
51
- loader.get_text(["dummy.txt"])
52
-
53
- error = error_context.exception
54
- self.assertEqual(error.status, 422)
55
- self.assertEqual(error.code, expected_loader_code)
56
- self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
57
- mock_info.assert_not_called()
58
- mock_exception.assert_not_called()
59
- sentry_sdk.capture_exception.assert_called_once()
60
- self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
61
-
62
- def test_empty_or_too_short_documents_still_return_empty_response(self):
63
- loader = _FakeBaseLoader(
64
- EmptyDocument(
65
- message="Document text with less than 400 characters",
66
- code=998,
67
- )
68
- )
69
-
70
- with patch("polytext.loader.base.logger.exception") as mock_exception:
71
- response = loader.get_text(["empty.txt"])
72
-
73
- self.assertEqual(response["text"], "")
74
- self.assertEqual(response["completion_tokens"], 0)
75
- self.assertEqual(response["prompt_tokens"], 0)
76
- self.assertEqual(response["output_list"][0]["input"], "empty.txt")
77
- mock_exception.assert_not_called()
78
-
79
-
80
- if __name__ == "__main__":
81
- unittest.main()
File without changes
File without changes
File without changes
File without changes
File without changes