polytext 0.2.2b2__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {polytext-0.2.2b2 → polytext-0.2.3}/PKG-INFO +1 -1
  2. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/audio_to_text.py +21 -11
  3. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/document_ocr_to_text.py +6 -1
  4. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/document_ocr_to_text_azure_oai.py +7 -2
  5. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/ocr_to_text.py +6 -1
  6. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/ocr_to_text_azure_oai.py +6 -1
  7. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/video_to_audio.py +3 -3
  8. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/base.py +24 -1
  9. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/youtube_llm.py +1 -1
  10. {polytext-0.2.2b2 → polytext-0.2.3}/polytext.egg-info/PKG-INFO +1 -1
  11. {polytext-0.2.2b2 → polytext-0.2.3}/polytext.egg-info/SOURCES.txt +4 -0
  12. {polytext-0.2.2b2 → polytext-0.2.3}/setup.py +1 -1
  13. polytext-0.2.3/tests/test_base_loader_error_mapping.py +79 -0
  14. polytext-0.2.3/tests/test_gemini_quality_guards.py +31 -0
  15. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_ocr_fallbacks.py +28 -5
  16. polytext-0.2.3/tests/test_python_version_metadata.py +45 -0
  17. polytext-0.2.3/tests/test_youtube_llm_fallbacks.py +103 -0
  18. {polytext-0.2.2b2 → polytext-0.2.3}/LICENSE +0 -0
  19. {polytext-0.2.2b2 → polytext-0.2.3}/README.md +0 -0
  20. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/__init__.py +0 -0
  21. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/__init__.py +0 -0
  22. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/base.py +0 -0
  23. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/gemini_quality_guards.py +0 -0
  24. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/html_to_md.py +0 -0
  25. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/md_to_text.py +0 -0
  26. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/pdf.py +0 -0
  27. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/converter/text_to_md.py +0 -0
  28. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/exceptions/__init__.py +0 -0
  29. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/exceptions/base.py +0 -0
  30. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/generator/__init__.py +0 -0
  31. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/generator/pdf.py +0 -0
  32. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/__init__.py +0 -0
  33. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/audio.py +0 -0
  34. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/document.py +0 -0
  35. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/document_ocr.py +0 -0
  36. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/downloader/__init__.py +0 -0
  37. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/downloader/downloader.py +0 -0
  38. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/html.py +0 -0
  39. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/markdown.py +0 -0
  40. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/notebook.py +0 -0
  41. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/ocr.py +0 -0
  42. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/plain_text.py +0 -0
  43. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/video.py +0 -0
  44. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/xml_xbrl.py +0 -0
  45. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/loader/youtube.py +0 -0
  46. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/processor/__init__.py +0 -0
  47. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/processor/audio_chunker.py +0 -0
  48. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/processor/text_merger.py +0 -0
  49. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/processor/transcript_chunker.py +0 -0
  50. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/prompts/__init__.py +0 -0
  51. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/prompts/ocr.py +0 -0
  52. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/prompts/text_merging.py +0 -0
  53. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/prompts/text_to_md.py +0 -0
  54. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/prompts/transcription.py +0 -0
  55. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/utils/__init__.py +0 -0
  56. {polytext-0.2.2b2 → polytext-0.2.3}/polytext/utils/utils.py +0 -0
  57. {polytext-0.2.2b2 → polytext-0.2.3}/polytext.egg-info/dependency_links.txt +0 -0
  58. {polytext-0.2.2b2 → polytext-0.2.3}/polytext.egg-info/not-zip-safe +0 -0
  59. {polytext-0.2.2b2 → polytext-0.2.3}/polytext.egg-info/requires.txt +0 -0
  60. {polytext-0.2.2b2 → polytext-0.2.3}/polytext.egg-info/top_level.txt +0 -0
  61. {polytext-0.2.2b2 → polytext-0.2.3}/pyproject.toml +0 -0
  62. {polytext-0.2.2b2 → polytext-0.2.3}/setup.cfg +0 -0
  63. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_audio_chunker.py +0 -0
  64. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_audio_comparison_helpers.py +0 -0
  65. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_audio_transcription_model_migration.py +0 -0
  66. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_compare_audio_models.py +0 -0
  67. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_compare_document_ocr_to_text_models.py +0 -0
  68. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_compare_ocr_to_text_models.py +0 -0
  69. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_compare_youtube_models.py +0 -0
  70. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_dowload_audio_from_youtube.py +0 -0
  71. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  72. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_extracted_text_whitespace.py +0 -0
  73. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_audio_transcript_from_gcs.py +0 -0
  74. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  75. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_document_ocr.py +0 -0
  76. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_document_ocr_azure_oai.py +0 -0
  77. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_document_text.py +0 -0
  78. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_document_text_from_gcs.py +0 -0
  79. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_ocr_from_image.py +0 -0
  80. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_text_from_markdown.py +0 -0
  81. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_get_video_transcript_from_gcs.py +0 -0
  82. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_library.py +0 -0
  83. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_markdown_loader_gzip.py +0 -0
  84. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_markitdown_html.py +0 -0
  85. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_notebook_loader.py +0 -0
  86. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_ocr_image_descriptions.py +0 -0
  87. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_pain_text.py +0 -0
  88. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_split_audio_with_llm.py +0 -0
  89. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_xml_xbrl_loader.py +0 -0
  90. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_youtube_gemini_minimal_check.py +0 -0
  91. {polytext-0.2.2b2 → polytext-0.2.3}/tests/test_youtube_transcript.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.2b2
3
+ Version: 0.2.3
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -90,16 +90,22 @@ def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str
90
90
  os.close(fd)
91
91
 
92
92
  logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
93
- ffmpeg.input(input_path).output(
94
- temp_audio_path,
95
- q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
96
- acodec='libmp3lame',
97
- ac=1, # Convert to mono
98
- ar=16000, # Lower sample rate
99
- vn=None,
100
- threads=0, # Use maximum available threads
101
- loglevel='error', # Reduce logging overhead
102
- ).run(quiet=True, overwrite_output=True)
93
+ try:
94
+ ffmpeg.input(input_path).output(
95
+ temp_audio_path,
96
+ q=bitrate_quality, # Variable bitrate quality (0-9, 9 being lowest)
97
+ acodec='libmp3lame',
98
+ ac=1, # Convert to mono
99
+ ar=16000, # Lower sample rate
100
+ vn=None,
101
+ threads=0, # Use maximum available threads
102
+ loglevel='error', # Reduce logging overhead
103
+ ).run(quiet=True, overwrite_output=True)
104
+ except Exception:
105
+ logger.exception("FFmpeg error during audio processing for %s", input_path)
106
+ if os.path.exists(temp_audio_path):
107
+ os.unlink(temp_audio_path)
108
+ raise
103
109
 
104
110
  logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
105
111
  return temp_audio_path
@@ -313,7 +319,11 @@ class AudioToTextConverter:
313
319
 
314
320
  mime_type, _ = mimetypes.guess_type(audio_file)
315
321
  if mime_type is None:
316
- raise ValueError("Audio format not recognized")
322
+ try:
323
+ raise ValueError("Audio format not recognized")
324
+ except ValueError:
325
+ logger.exception("Unsupported audio format for %s", audio_file)
326
+ raise
317
327
 
318
328
  return client.models.generate_content(
319
329
  model=self.transcription_model,
@@ -96,6 +96,7 @@ def compress_and_convert_image(input_path: str, target_size=1):
96
96
  return temp_image_path
97
97
 
98
98
  except Exception as e:
99
+ logger.exception("FFmpeg error during image processing for %s", input_path)
99
100
  raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
100
101
 
101
102
  def get_document_ocr(
@@ -389,7 +390,11 @@ class DocumentOCRToTextConverter:
389
390
  # Determine mimetype
390
391
  mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
391
392
  if mime_type is None:
392
- raise ValueError("Image format not recognized")
393
+ try:
394
+ raise ValueError("Image format not recognized")
395
+ except ValueError:
396
+ logger.exception("Unsupported image format for %s", temp_file_for_ocr)
397
+ raise
393
398
 
394
399
  response = client.models.generate_content(
395
400
  model=self.ocr_model,
@@ -88,6 +88,7 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
88
88
  return temp_image_path
89
89
 
90
90
  except Exception as e:
91
+ logger.exception("FFmpeg error during image processing for %s", input_path)
91
92
  raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
92
93
 
93
94
 
@@ -233,7 +234,11 @@ class DocumentOCRToTextConverter:
233
234
 
234
235
  mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
235
236
  if mime_type is None:
236
- raise ValueError("Image format not recognized")
237
+ try:
238
+ raise ValueError("Image format not recognized")
239
+ except ValueError:
240
+ logger.exception("Unsupported image format for %s", temp_file_for_ocr)
241
+ raise
237
242
 
238
243
  with open(temp_file_for_ocr, "rb") as f:
239
244
  image_b64 = base64.b64encode(f.read()).decode("utf-8")
@@ -308,7 +313,7 @@ class DocumentOCRToTextConverter:
308
313
  pdf = fitz.open(document_for_ocr)
309
314
  total_pages = len(pdf)
310
315
  if total_pages == 0:
311
- raise EmptyDocument(message="The document has no pages.", code=997)
316
+ raise EmptyDocument(message="The document has no pages.", code=998)
312
317
 
313
318
  start_page, end_page = self.validate_page_range(total_pages)
314
319
 
@@ -96,6 +96,7 @@ def compress_and_convert_image(input_path: str, target_size=1):
96
96
  return temp_image_path
97
97
 
98
98
  except Exception as e:
99
+ logger.exception("FFmpeg error during image processing for %s", input_path)
99
100
  raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
100
101
 
101
102
  def get_ocr(
@@ -383,7 +384,11 @@ class OCRToTextConverter:
383
384
  # Determine mimetype
384
385
  mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
385
386
  if mime_type is None:
386
- raise ValueError("Image format not recognized")
387
+ try:
388
+ raise ValueError("Image format not recognized")
389
+ except ValueError:
390
+ logger.exception("Unsupported image format for %s", temp_file_for_ocr)
391
+ raise
387
392
 
388
393
  response = client.models.generate_content(
389
394
  model=self.ocr_model,
@@ -87,6 +87,7 @@ def compress_and_convert_image(input_path: str, target_size=1) -> str:
87
87
  return temp_image_path
88
88
 
89
89
  except Exception as e:
90
+ logger.exception("FFmpeg error during image processing for %s", input_path)
90
91
  raise RuntimeError(f"FFmpeg error during image processing: {e}") from e
91
92
 
92
93
 
@@ -224,7 +225,11 @@ class OCRToTextConverter:
224
225
  # We'll use base64 data-URL.
225
226
  mime_type, _ = mimetypes.guess_type(temp_file_for_ocr)
226
227
  if mime_type is None:
227
- raise ValueError("Image format not recognized")
228
+ try:
229
+ raise ValueError("Image format not recognized")
230
+ except ValueError:
231
+ logger.exception("Unsupported image format for %s", temp_file_for_ocr)
232
+ raise
228
233
 
229
234
  with open(temp_file_for_ocr, "rb") as f:
230
235
  image_b64 = base64.b64encode(f.read()).decode("utf-8")
@@ -52,12 +52,12 @@ def convert_video_to_audio(video_file: str , bitrate_quality: int =9) -> str:
52
52
  return temp_audio_path
53
53
 
54
54
  except ffmpeg.Error as e:
55
- logger.info(f"FFmpeg conversion failed: {e.stderr.decode()}")
55
+ logger.exception("FFmpeg conversion failed: %s", e.stderr.decode())
56
56
  if os.path.exists(temp_audio_path):
57
57
  os.unlink(temp_audio_path)
58
58
  raise
59
59
  except Exception as e:
60
- logger.info(f"Failed to convert video to audio: {str(e)}")
60
+ logger.exception("Failed to convert video to audio: %s", str(e))
61
61
  if os.path.exists(temp_audio_path):
62
62
  os.unlink(temp_audio_path)
63
- raise
63
+ raise
@@ -40,6 +40,12 @@ logger = logging.getLogger(__name__)
40
40
 
41
41
  MIN_DOC_TEXT_LENGTH_ACCEPTED = int(os.getenv("MIN_DOC_TEXT_LENGTH_ACCEPTED", "400"))
42
42
  OCR_INCLUDE_IMAGE_DESCRIPTIONS_ENV = "OCR_INCLUDE_IMAGE_DESCRIPTIONS"
43
+ LLM_OUTPUT_ERROR_CODES = {
44
+ 995: "INVALID_ARGUMENT",
45
+ 996: "RECITATION",
46
+ 997: "REPETITIVE_OUTPUT",
47
+ 999: "MAX_TOKENS",
48
+ }
43
49
 
44
50
 
45
51
  def _read_bool_env(name: str, default: bool = False) -> bool:
@@ -148,6 +154,18 @@ class BaseLoader:
148
154
  response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
149
155
  except EmptyDocument as e:
150
156
  logger.info(f"Empty document encountered: {e.message}")
157
+ if e.code in LLM_OUTPUT_ERROR_CODES:
158
+ logger.exception(
159
+ "Raising LoaderError: status=422 code=%s original_empty_document_code=%s message=%s",
160
+ LLM_OUTPUT_ERROR_CODES[e.code],
161
+ e.code,
162
+ e.message,
163
+ )
164
+ raise LoaderError(
165
+ message=e.message,
166
+ status=422,
167
+ code=LLM_OUTPUT_ERROR_CODES[e.code],
168
+ ) from e
151
169
  if self.fallback_ocr:
152
170
  loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
153
171
  llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
@@ -317,6 +335,7 @@ class BaseLoader:
317
335
  return YoutubeTranscriptLoaderWithLlm(llm_api_key=llm_api_key, markdown_output=self.markdown_output, temp_dir=self.temp_dir, timeout_minutes=self.timeout_minutes, **kwargs)
318
336
  else:
319
337
  return HtmlLoader(markdown_output=self.markdown_output)
338
+ # Handle markdown files based on extension or MIME type
320
339
  if file_extension in [".md", ".markdown"] or (
321
340
  mime_type and mime_type.startswith("text/markdown")
322
341
  ):
@@ -344,7 +363,11 @@ class BaseLoader:
344
363
  **kwargs,
345
364
  )
346
365
  else:
347
- raise ValueError(f"Unsupported MIME type: {mime_type}")
366
+ try:
367
+ raise ValueError(f"Unsupported MIME type: {mime_type}")
368
+ except ValueError:
369
+ logger.exception("Unsupported media type while initializing loader: %s", mime_type)
370
+ raise
348
371
 
349
372
  elif self.validate_user_text(text=input):
350
373
  return PlainTextLoader(
@@ -384,7 +384,7 @@ class YoutubeTranscriptLoaderWithLlm:
384
384
  fallback_model=self.final_fallback_model,
385
385
  fallback_temperature=self.final_fallback_temperature,
386
386
  )
387
- raise Exception(f"Invalid argument: {e.message}; details={getattr(e, 'details', None)}")
387
+ raise e_tmp from e
388
388
 
389
389
  except errors.ServerError as e:
390
390
  logger.info("ServerError occurred with status %s and message: %s", e.status, e.message)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.2b2
3
+ Version: 0.2.3
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -56,6 +56,7 @@ polytext/utils/utils.py
56
56
  tests/test_audio_chunker.py
57
57
  tests/test_audio_comparison_helpers.py
58
58
  tests/test_audio_transcription_model_migration.py
59
+ tests/test_base_loader_error_mapping.py
59
60
  tests/test_compare_audio_models.py
60
61
  tests/test_compare_document_ocr_to_text_models.py
61
62
  tests/test_compare_ocr_to_text_models.py
@@ -63,6 +64,7 @@ tests/test_compare_youtube_models.py
63
64
  tests/test_dowload_audio_from_youtube.py
64
65
  tests/test_dowload_audio_from_youtube_helpers.py
65
66
  tests/test_extracted_text_whitespace.py
67
+ tests/test_gemini_quality_guards.py
66
68
  tests/test_get_audio_transcript_from_gcs.py
67
69
  tests/test_get_customized_pdf_from_markdown.py
68
70
  tests/test_get_document_ocr.py
@@ -79,7 +81,9 @@ tests/test_notebook_loader.py
79
81
  tests/test_ocr_fallbacks.py
80
82
  tests/test_ocr_image_descriptions.py
81
83
  tests/test_pain_text.py
84
+ tests/test_python_version_metadata.py
82
85
  tests/test_split_audio_with_llm.py
83
86
  tests/test_xml_xbrl_loader.py
84
87
  tests/test_youtube_gemini_minimal_check.py
88
+ tests/test_youtube_llm_fallbacks.py
85
89
  tests/test_youtube_transcript.py
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
51
51
 
52
52
  setup(
53
53
  name='polytext',
54
- version='0.2.2b2',
54
+ version='0.2.3',
55
55
  url='https://github.com/docsity/polytext',
56
56
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
57
57
  license='MIT',
@@ -0,0 +1,79 @@
1
+ import unittest
2
+ from unittest.mock import patch
3
+
4
+ from polytext.exceptions import EmptyDocument, LoaderError
5
+ from polytext.loader.base import BaseLoader
6
+
7
+
8
+ class _FailingLoader:
9
+ def __init__(self, error):
10
+ self.error = error
11
+
12
+ def load(self, input_path):
13
+ raise self.error
14
+
15
+
16
+ class _FakeBaseLoader(BaseLoader):
17
+ def __init__(self, error, **kwargs):
18
+ super().__init__(**kwargs)
19
+ self.error = error
20
+
21
+ def initiate_storage(self, input):
22
+ return {}
23
+
24
+ def init_loader_class(self, input, storage_client, llm_api_key, is_document_fallback=False, **kwargs):
25
+ return _FailingLoader(self.error)
26
+
27
+
28
+ class TestBaseLoaderErrorMapping(unittest.TestCase):
29
+ def test_llm_output_empty_document_codes_are_raised_as_loader_errors(self):
30
+ cases = [
31
+ (995, "INVALID_ARGUMENT"),
32
+ (996, "RECITATION"),
33
+ (997, "REPETITIVE_OUTPUT"),
34
+ (999, "MAX_TOKENS"),
35
+ ]
36
+
37
+ for empty_document_code, expected_loader_code in cases:
38
+ with self.subTest(empty_document_code=empty_document_code):
39
+ loader = _FakeBaseLoader(
40
+ EmptyDocument(
41
+ message=f"diagnostic failure {empty_document_code}",
42
+ code=empty_document_code,
43
+ )
44
+ )
45
+
46
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
47
+ with self.assertRaises(LoaderError) as error_context:
48
+ loader.get_text(["dummy.txt"])
49
+
50
+ error = error_context.exception
51
+ self.assertEqual(error.status, 422)
52
+ self.assertEqual(error.code, expected_loader_code)
53
+ self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
54
+ mock_exception.assert_called_once()
55
+ self.assertIn("Raising LoaderError", mock_exception.call_args.args[0])
56
+ self.assertEqual(mock_exception.call_args.args[1], expected_loader_code)
57
+ self.assertEqual(mock_exception.call_args.args[2], empty_document_code)
58
+ self.assertEqual(mock_exception.call_args.args[3], f"diagnostic failure {empty_document_code}")
59
+
60
+ def test_empty_or_too_short_documents_still_return_empty_response(self):
61
+ loader = _FakeBaseLoader(
62
+ EmptyDocument(
63
+ message="Document text with less than 400 characters",
64
+ code=998,
65
+ )
66
+ )
67
+
68
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
69
+ response = loader.get_text(["empty.txt"])
70
+
71
+ self.assertEqual(response["text"], "")
72
+ self.assertEqual(response["completion_tokens"], 0)
73
+ self.assertEqual(response["prompt_tokens"], 0)
74
+ self.assertEqual(response["output_list"][0]["input"], "empty.txt")
75
+ mock_exception.assert_not_called()
76
+
77
+
78
+ if __name__ == "__main__":
79
+ unittest.main()
@@ -0,0 +1,31 @@
1
+ import unittest
2
+
3
+ from polytext.converter.gemini_quality_guards import tail_has_excessive_repetition
4
+
5
+
6
+ class TestGeminiQualityGuards(unittest.TestCase):
7
+ def test_detects_consecutive_repeated_sentences_below_ratio_threshold(self):
8
+ text = (
9
+ "gli davamo nomi veri e falsi. "
10
+ "_Elio_ all'anagrafe e io gli gli dicevo _Roberto Gustativi_. "
11
+ "E questi scrivevano Roberto Gustativi. "
12
+ "E la soddisfazione perversa era andare a comprare il giornale. "
13
+ "È successo. "
14
+ "Sono stato. "
15
+ "Che è successo? "
16
+ "Siamo passati dal basement. "
17
+ "Siamo passati dal basement. "
18
+ "Siamo passati dal basement. "
19
+ "Siamo passati dal basement. "
20
+ "Il miglior finale di sempre. "
21
+ "Grazie, grazie, grazie, grazie, grazie, grazie, grazie. "
22
+ "E vi grazie."
23
+ )
24
+
25
+ self.assertTrue(
26
+ tail_has_excessive_repetition(text, tail_lines=200, threshold=0.35)
27
+ )
28
+
29
+
30
+ if __name__ == "__main__":
31
+ unittest.main()
@@ -4,7 +4,11 @@ from types import SimpleNamespace
4
4
  from unittest.mock import patch
5
5
 
6
6
  from polytext.converter.document_ocr_to_text import DocumentOCRToTextConverter
7
+ from polytext.converter.document_ocr_to_text_azure_oai import (
8
+ DocumentOCRToTextConverter as AzureDocumentOCRToTextConverter,
9
+ )
7
10
  from polytext.converter.ocr_to_text import OCRToTextConverter
11
+ from polytext.exceptions import EmptyDocument
8
12
 
9
13
 
10
14
  def _make_response(
@@ -208,11 +212,12 @@ class TestOcrFallbacks(unittest.TestCase):
208
212
  )
209
213
  mock_client_cls.return_value = fake_client
210
214
 
211
- converter = OCRToTextConverter(ocr_model="gemini-3.1-flash-lite-preview")
212
- with tempfile.NamedTemporaryFile(suffix=".png") as temp_image:
213
- temp_image.write(b"fake-image")
214
- temp_image.flush()
215
- result = converter.get_ocr(temp_image.name)
215
+ with patch("polytext.converter.ocr_to_text.OCR_FINAL_FALLBACK_MODEL", "gemini-2.0-flash"):
216
+ converter = OCRToTextConverter(ocr_model="gemini-3.1-flash-lite-preview")
217
+ with tempfile.NamedTemporaryFile(suffix=".png") as temp_image:
218
+ temp_image.write(b"fake-image")
219
+ temp_image.flush()
220
+ result = converter.get_ocr(temp_image.name)
216
221
 
217
222
  self.assertEqual(result["text"], "final fallback text")
218
223
  self.assertEqual(
@@ -263,6 +268,24 @@ class TestOcrFallbacks(unittest.TestCase):
263
268
  )
264
269
  self.assertEqual(fake_client.models.generate_content_temperatures, [0.0, 0.0, 1.0])
265
270
 
271
+ @patch("fitz.open")
272
+ def test_azure_document_ocr_no_pages_is_empty_or_too_short(self, mock_fitz_open):
273
+ mock_fitz_open.return_value = _FakePdf([])
274
+
275
+ converter = AzureDocumentOCRToTextConverter(
276
+ azure_endpoint="https://example.openai.azure.com",
277
+ azure_api_version="2024-10-21",
278
+ )
279
+
280
+ with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_pdf:
281
+ temp_pdf.write(b"%PDF-1.4\n")
282
+ temp_pdf.flush()
283
+ with self.assertRaises(EmptyDocument) as error_context:
284
+ converter.get_document_ocr(temp_pdf.name)
285
+
286
+ self.assertEqual(error_context.exception.code, 998)
287
+ self.assertEqual(error_context.exception.message, "The document has no pages.")
288
+
266
289
 
267
290
  if __name__ == "__main__":
268
291
  unittest.main()
@@ -0,0 +1,45 @@
1
+ import ast
2
+ import tomllib
3
+ import unittest
4
+ from pathlib import Path
5
+
6
+
7
+ ROOT = Path(__file__).resolve().parents[1]
8
+
9
+
10
+ def _setup_keyword(name):
11
+ tree = ast.parse((ROOT / "setup.py").read_text())
12
+ setup_call = next(
13
+ node
14
+ for node in ast.walk(tree)
15
+ if isinstance(node, ast.Call)
16
+ and getattr(node.func, "id", None) == "setup"
17
+ )
18
+ return next(
19
+ keyword.value
20
+ for keyword in setup_call.keywords
21
+ if keyword.arg == name
22
+ )
23
+
24
+
25
+ class PythonVersionMetadataTest(unittest.TestCase):
26
+ def test_packaging_metadata_allows_python_311(self):
27
+ setup_python_requires = ast.literal_eval(_setup_keyword("python_requires"))
28
+ pyproject = tomllib.loads((ROOT / "pyproject.toml").read_text())
29
+
30
+ self.assertEqual(setup_python_requires, ">=3.11")
31
+ self.assertEqual(
32
+ pyproject["tool"]["poetry"]["dependencies"]["python"],
33
+ ">=3.11,<3.14",
34
+ )
35
+
36
+ def test_setup_classifiers_include_supported_python_versions(self):
37
+ classifiers = ast.literal_eval(_setup_keyword("classifiers"))
38
+
39
+ self.assertIn("Programming Language :: Python :: 3.11", classifiers)
40
+ self.assertIn("Programming Language :: Python :: 3.12", classifiers)
41
+ self.assertIn("Programming Language :: Python :: 3.13", classifiers)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ unittest.main()
@@ -0,0 +1,103 @@
1
+ import unittest
2
+ from types import SimpleNamespace
3
+ from unittest.mock import patch
4
+
5
+ from google.genai import errors as genai_errors
6
+ from google.genai import types
7
+
8
+ from polytext.loader.youtube_llm import YoutubeTranscriptLoaderWithLlm
9
+ from polytext.exceptions import EmptyDocument
10
+
11
+
12
+ def _make_response(text="full transcript"):
13
+ return SimpleNamespace(
14
+ text=text,
15
+ candidates=[SimpleNamespace(finish_reason="STOP")],
16
+ usage_metadata=SimpleNamespace(
17
+ candidates_token_count=3,
18
+ prompt_token_count=2,
19
+ total_token_count=5,
20
+ ),
21
+ )
22
+
23
+
24
+ class _FakeModels:
25
+ def __init__(self, response):
26
+ self.response = response
27
+ self.generate_content_config = None
28
+ self.generate_content_model = None
29
+
30
+ def generate_content(self, model, contents, config):
31
+ self.generate_content_model = model
32
+ self.generate_content_config = config
33
+ if isinstance(self.response, Exception):
34
+ raise self.response
35
+ return self.response
36
+
37
+
38
+ class _FakeClient:
39
+ def __init__(self, response):
40
+ self.models = _FakeModels(response)
41
+
42
+
43
+ def _invalid_argument_error():
44
+ return genai_errors.ClientError(
45
+ 400,
46
+ {
47
+ "error": {
48
+ "code": 400,
49
+ "message": "Request contains an invalid argument.",
50
+ "status": "INVALID_ARGUMENT",
51
+ }
52
+ },
53
+ None,
54
+ )
55
+
56
+
57
+ def _long_transcript():
58
+ return " ".join(
59
+ f"This is transcript sentence number {index} with unique content."
60
+ for index in range(20)
61
+ )
62
+
63
+
64
+ class TestYoutubeLlmFallbacks(unittest.TestCase):
65
+ @patch("polytext.loader.youtube_llm.genai.Client")
66
+ def test_invalid_argument_final_fallback_uses_original_temperature(self, mock_client_cls):
67
+ clients = [
68
+ _FakeClient(_invalid_argument_error()),
69
+ _FakeClient(_invalid_argument_error()),
70
+ _FakeClient(_make_response(_long_transcript())),
71
+ ]
72
+ mock_client_cls.side_effect = clients
73
+
74
+ loader = YoutubeTranscriptLoaderWithLlm()
75
+ result = loader.get_text_from_youtube("https://www.youtube.com/watch?v=example")
76
+
77
+ self.assertEqual(result["completion_model"], "models/gemini-2.5-flash")
78
+ self.assertEqual(clients[2].models.generate_content_config.temperature, 0.0)
79
+ self.assertEqual(
80
+ clients[2].models.generate_content_config.media_resolution,
81
+ types.MediaResolution.MEDIA_RESOLUTION_LOW,
82
+ )
83
+
84
+ @patch("polytext.loader.youtube_llm.genai.Client")
85
+ def test_invalid_argument_after_fallbacks_raises_empty_document_code_995(self, mock_client_cls):
86
+ clients = [
87
+ _FakeClient(_invalid_argument_error()),
88
+ _FakeClient(_invalid_argument_error()),
89
+ _FakeClient(_invalid_argument_error()),
90
+ ]
91
+ mock_client_cls.side_effect = clients
92
+
93
+ loader = YoutubeTranscriptLoaderWithLlm()
94
+
95
+ with self.assertRaises(EmptyDocument) as error_context:
96
+ loader.get_text_from_youtube("https://www.youtube.com/watch?v=example")
97
+
98
+ self.assertEqual(error_context.exception.code, 995)
99
+ self.assertIn("INVALID_ARGUMENT", error_context.exception.message)
100
+
101
+
102
+ if __name__ == "__main__":
103
+ unittest.main()
File without changes
File without changes
File without changes
File without changes