polytext 0.2.3__tar.gz → 0.2.4__tar.gz

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (91)
  1. {polytext-0.2.3 → polytext-0.2.4}/PKG-INFO +1 -1
  2. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/audio_to_text.py +1 -1
  3. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/ocr_to_text.py +1 -1
  4. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/base.py +14 -7
  5. {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/PKG-INFO +1 -1
  6. {polytext-0.2.3 → polytext-0.2.4}/setup.py +1 -1
  7. {polytext-0.2.3 → polytext-0.2.4}/tests/test_base_loader_error_mapping.py +11 -9
  8. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_ocr_from_image.py +1 -1
  9. {polytext-0.2.3 → polytext-0.2.4}/tests/test_youtube_transcript.py +2 -2
  10. {polytext-0.2.3 → polytext-0.2.4}/LICENSE +0 -0
  11. {polytext-0.2.3 → polytext-0.2.4}/README.md +0 -0
  12. {polytext-0.2.3 → polytext-0.2.4}/polytext/__init__.py +0 -0
  13. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/__init__.py +0 -0
  14. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/base.py +0 -0
  15. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/document_ocr_to_text.py +0 -0
  16. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
  17. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/gemini_quality_guards.py +0 -0
  18. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/html_to_md.py +0 -0
  19. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/md_to_text.py +0 -0
  20. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
  21. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/pdf.py +0 -0
  22. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/text_to_md.py +0 -0
  23. {polytext-0.2.3 → polytext-0.2.4}/polytext/converter/video_to_audio.py +0 -0
  24. {polytext-0.2.3 → polytext-0.2.4}/polytext/exceptions/__init__.py +0 -0
  25. {polytext-0.2.3 → polytext-0.2.4}/polytext/exceptions/base.py +0 -0
  26. {polytext-0.2.3 → polytext-0.2.4}/polytext/generator/__init__.py +0 -0
  27. {polytext-0.2.3 → polytext-0.2.4}/polytext/generator/pdf.py +0 -0
  28. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/__init__.py +0 -0
  29. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/audio.py +0 -0
  30. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/document.py +0 -0
  31. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/document_ocr.py +0 -0
  32. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/downloader/__init__.py +0 -0
  33. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/downloader/downloader.py +0 -0
  34. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/html.py +0 -0
  35. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/markdown.py +0 -0
  36. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/notebook.py +0 -0
  37. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/ocr.py +0 -0
  38. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/plain_text.py +0 -0
  39. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/video.py +0 -0
  40. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/xml_xbrl.py +0 -0
  41. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/youtube.py +0 -0
  42. {polytext-0.2.3 → polytext-0.2.4}/polytext/loader/youtube_llm.py +0 -0
  43. {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/__init__.py +0 -0
  44. {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/audio_chunker.py +0 -0
  45. {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/text_merger.py +0 -0
  46. {polytext-0.2.3 → polytext-0.2.4}/polytext/processor/transcript_chunker.py +0 -0
  47. {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/__init__.py +0 -0
  48. {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/ocr.py +0 -0
  49. {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/text_merging.py +0 -0
  50. {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/text_to_md.py +0 -0
  51. {polytext-0.2.3 → polytext-0.2.4}/polytext/prompts/transcription.py +0 -0
  52. {polytext-0.2.3 → polytext-0.2.4}/polytext/utils/__init__.py +0 -0
  53. {polytext-0.2.3 → polytext-0.2.4}/polytext/utils/utils.py +0 -0
  54. {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/SOURCES.txt +0 -0
  55. {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/dependency_links.txt +0 -0
  56. {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/not-zip-safe +0 -0
  57. {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/requires.txt +0 -0
  58. {polytext-0.2.3 → polytext-0.2.4}/polytext.egg-info/top_level.txt +0 -0
  59. {polytext-0.2.3 → polytext-0.2.4}/pyproject.toml +0 -0
  60. {polytext-0.2.3 → polytext-0.2.4}/setup.cfg +0 -0
  61. {polytext-0.2.3 → polytext-0.2.4}/tests/test_audio_chunker.py +0 -0
  62. {polytext-0.2.3 → polytext-0.2.4}/tests/test_audio_comparison_helpers.py +0 -0
  63. {polytext-0.2.3 → polytext-0.2.4}/tests/test_audio_transcription_model_migration.py +0 -0
  64. {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_audio_models.py +0 -0
  65. {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_document_ocr_to_text_models.py +0 -0
  66. {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_ocr_to_text_models.py +0 -0
  67. {polytext-0.2.3 → polytext-0.2.4}/tests/test_compare_youtube_models.py +0 -0
  68. {polytext-0.2.3 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube.py +0 -0
  69. {polytext-0.2.3 → polytext-0.2.4}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  70. {polytext-0.2.3 → polytext-0.2.4}/tests/test_extracted_text_whitespace.py +0 -0
  71. {polytext-0.2.3 → polytext-0.2.4}/tests/test_gemini_quality_guards.py +0 -0
  72. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_audio_transcript_from_gcs.py +0 -0
  73. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  74. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_ocr.py +0 -0
  75. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_ocr_azure_oai.py +0 -0
  76. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_text.py +0 -0
  77. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_document_text_from_gcs.py +0 -0
  78. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_text_from_markdown.py +0 -0
  79. {polytext-0.2.3 → polytext-0.2.4}/tests/test_get_video_transcript_from_gcs.py +0 -0
  80. {polytext-0.2.3 → polytext-0.2.4}/tests/test_library.py +0 -0
  81. {polytext-0.2.3 → polytext-0.2.4}/tests/test_markdown_loader_gzip.py +0 -0
  82. {polytext-0.2.3 → polytext-0.2.4}/tests/test_markitdown_html.py +0 -0
  83. {polytext-0.2.3 → polytext-0.2.4}/tests/test_notebook_loader.py +0 -0
  84. {polytext-0.2.3 → polytext-0.2.4}/tests/test_ocr_fallbacks.py +0 -0
  85. {polytext-0.2.3 → polytext-0.2.4}/tests/test_ocr_image_descriptions.py +0 -0
  86. {polytext-0.2.3 → polytext-0.2.4}/tests/test_pain_text.py +0 -0
  87. {polytext-0.2.3 → polytext-0.2.4}/tests/test_python_version_metadata.py +0 -0
  88. {polytext-0.2.3 → polytext-0.2.4}/tests/test_split_audio_with_llm.py +0 -0
  89. {polytext-0.2.3 → polytext-0.2.4}/tests/test_xml_xbrl_loader.py +0 -0
  90. {polytext-0.2.3 → polytext-0.2.4}/tests/test_youtube_gemini_minimal_check.py +0 -0
  91. {polytext-0.2.3 → polytext-0.2.4}/tests/test_youtube_llm_fallbacks.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -46,7 +46,7 @@ AUDIO_TAIL_REPETITION_THRESHOLD = float(os.getenv("AUDIO_TAIL_REPETITION_THRESHO
46
46
  AUDIO_FALLBACK_SOURCE_PATTERN = os.getenv("AUDIO_FALLBACK_SOURCE_PATTERN", "flash-lite")
47
47
  AUDIO_FALLBACK_MODEL = os.getenv("AUDIO_FALLBACK_MODEL", "gemini-3-flash-preview")
48
48
  AUDIO_FALLBACK_TEMPERATURE = float(os.getenv("AUDIO_FALLBACK_TEMPERATURE", "1.0"))
49
- AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
49
+ AUDIO_FINAL_FALLBACK_MODEL = os.getenv("AUDIO_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
50
50
  AUDIO_FILE_UPLOAD_THRESHOLD_BYTES = 20 * 1024 * 1024
51
51
  NO_HUMAN_SPEECH_MARKER = "no human speech detected"
52
52
 
@@ -33,7 +33,7 @@ OCR_TAIL_REPETITION_THRESHOLD = float(os.getenv("OCR_TAIL_REPETITION_THRESHOLD",
33
33
  OCR_FALLBACK_SOURCE_PATTERN = os.getenv("OCR_FALLBACK_SOURCE_PATTERN", "flash-lite-preview")
34
34
  OCR_FALLBACK_MODEL = os.getenv("OCR_FALLBACK_MODEL", "gemini-3-flash-preview")
35
35
  OCR_FALLBACK_TEMPERATURE = float(os.getenv("OCR_FALLBACK_TEMPERATURE", "1.0"))
36
- OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-2.0-flash")
36
+ OCR_FINAL_FALLBACK_MODEL = os.getenv("OCR_FINAL_FALLBACK_MODEL", "gemini-3.5-flash")
37
37
 
38
38
 
39
39
  def compress_and_convert_image(input_path: str, target_size=1):
@@ -55,6 +55,18 @@ def _read_bool_env(name: str, default: bool = False) -> bool:
55
55
  return value.strip().lower() in {"1", "true", "yes", "y", "on"}
56
56
 
57
57
 
58
+ def _capture_exception_for_sentry(error: Exception) -> None:
59
+ try:
60
+ import sentry_sdk
61
+ except ImportError:
62
+ return
63
+
64
+ try:
65
+ sentry_sdk.capture_exception(error)
66
+ except Exception:
67
+ return
68
+
69
+
58
70
  class BaseLoader:
59
71
  def __init__(self, markdown_output=True, llm_api_key=None, provider: str = "google", temp_dir: str = "temp",
60
72
  ocr_model: str = "gpt-5-mini", timeout_minutes: int | None = None,
@@ -153,19 +165,14 @@ class BaseLoader:
153
165
  try:
154
166
  response = self.run_loader_class(loader_class=loader_class, input_list=input_list)
155
167
  except EmptyDocument as e:
156
- logger.info(f"Empty document encountered: {e.message}")
157
168
  if e.code in LLM_OUTPUT_ERROR_CODES:
158
- logger.exception(
159
- "Raising LoaderError: status=422 code=%s original_empty_document_code=%s message=%s",
160
- LLM_OUTPUT_ERROR_CODES[e.code],
161
- e.code,
162
- e.message,
163
- )
169
+ _capture_exception_for_sentry(e)
164
170
  raise LoaderError(
165
171
  message=e.message,
166
172
  status=422,
167
173
  code=LLM_OUTPUT_ERROR_CODES[e.code],
168
174
  ) from e
175
+ logger.info(f"Empty document encountered: {e.message}")
169
176
  if self.fallback_ocr:
170
177
  loader_class = self.init_loader_class(input=first_file_url, storage_client=storage_client,
171
178
  llm_api_key=self.llm_api_key, is_document_fallback=True, **kwargs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.3
3
+ Version: 0.2.4
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
51
51
 
52
52
  setup(
53
53
  name='polytext',
54
- version='0.2.3',
54
+ version='0.2.4',
55
55
  url='https://github.com/docsity/polytext',
56
56
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
57
57
  license='MIT',
@@ -1,5 +1,5 @@
1
1
  import unittest
2
- from unittest.mock import patch
2
+ from unittest.mock import Mock, patch
3
3
 
4
4
  from polytext.exceptions import EmptyDocument, LoaderError
5
5
  from polytext.loader.base import BaseLoader
@@ -43,19 +43,21 @@ class TestBaseLoaderErrorMapping(unittest.TestCase):
43
43
  )
44
44
  )
45
45
 
46
- with patch("polytext.loader.base.logger.exception") as mock_exception:
47
- with self.assertRaises(LoaderError) as error_context:
48
- loader.get_text(["dummy.txt"])
46
+ sentry_sdk = Mock()
47
+ with patch("polytext.loader.base.logger.info") as mock_info:
48
+ with patch("polytext.loader.base.logger.exception") as mock_exception:
49
+ with patch.dict("sys.modules", {"sentry_sdk": sentry_sdk}):
50
+ with self.assertRaises(LoaderError) as error_context:
51
+ loader.get_text(["dummy.txt"])
49
52
 
50
53
  error = error_context.exception
51
54
  self.assertEqual(error.status, 422)
52
55
  self.assertEqual(error.code, expected_loader_code)
53
56
  self.assertEqual(error.message, f"diagnostic failure {empty_document_code}")
54
- mock_exception.assert_called_once()
55
- self.assertIn("Raising LoaderError", mock_exception.call_args.args[0])
56
- self.assertEqual(mock_exception.call_args.args[1], expected_loader_code)
57
- self.assertEqual(mock_exception.call_args.args[2], empty_document_code)
58
- self.assertEqual(mock_exception.call_args.args[3], f"diagnostic failure {empty_document_code}")
57
+ mock_info.assert_not_called()
58
+ mock_exception.assert_not_called()
59
+ sentry_sdk.capture_exception.assert_called_once()
60
+ self.assertIs(sentry_sdk.capture_exception.call_args.args[0], error.__cause__)
59
61
 
60
62
  def test_empty_or_too_short_documents_still_return_empty_response(self):
61
63
  loader = _FakeBaseLoader(
@@ -38,7 +38,7 @@ def main():
38
38
 
39
39
  # local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9695.jpg"
40
40
  # local_file_path = "/Users/marcodelgiudice/Projects/polytext/IMG_9701.jpg"
41
- local_file_path = "/Users/marcodelgiudice/Projects/polytext/chimicaformula.png"
41
+ local_file_path = "/Users/marcodelgiudice/Projects/polytext/gm1.png"
42
42
 
43
43
  try:
44
44
  start = time.time()
@@ -32,9 +32,9 @@ url = 'https://www.youtube.com/watch?v=L4as3tks4Js' # basement alberto angela
32
32
 
33
33
  # url = 'https://www.youtube.com/watch?v=UabBYexBD4k' # INM RAG 11 minuti, completato in 26 secondi con successo con gemini-3.1-flash-lite
34
34
 
35
- url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
35
+ #url = 'https://www.youtube.com/watch?v=96jN2OCOfLs' # Vibe coding 30 minuti, completato in 150 secondi con successo con gemini-3-flash-preview (160k token in input, 7k in output), 3.1-flash-lite ha raggiunto i max tokens in output (50k) probabile repetition
36
36
 
37
- # url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
37
+ url = 'https://www.youtube.com/watch?v=HGfsGvmRaaw' # barbero2 50 minuti, fallito, RECITATION in tutti e 3 i modelli (275k token in input)
38
38
 
39
39
  # url = 'https://www.youtube.com/watch?v=CM2CkNU9xR0' # google antigravity 27 minuti, completato in 39 secondi con successo con gemini-3.1-flash-lite (146k token in input, 6k token in output)
40
40
 
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes