polytext 0.2.6__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {polytext-0.2.6 → polytext-0.2.7}/PKG-INFO +1 -1
  2. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/audio_to_text.py +30 -2
  3. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/base.py +0 -4
  4. {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/PKG-INFO +1 -1
  5. {polytext-0.2.6 → polytext-0.2.7}/setup.py +1 -1
  6. {polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_transcription_model_migration.py +35 -0
  7. {polytext-0.2.6 → polytext-0.2.7}/LICENSE +0 -0
  8. {polytext-0.2.6 → polytext-0.2.7}/README.md +0 -0
  9. {polytext-0.2.6 → polytext-0.2.7}/polytext/__init__.py +0 -0
  10. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/__init__.py +0 -0
  11. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/base.py +0 -0
  12. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/beautiful_text.py +0 -0
  13. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/document_ocr_to_text.py +0 -0
  14. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
  15. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/gemini_quality_guards.py +0 -0
  16. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/html_to_md.py +0 -0
  17. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/md_to_text.py +0 -0
  18. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/ocr_to_text.py +0 -0
  19. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
  20. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/pdf.py +0 -0
  21. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/text_to_md.py +0 -0
  22. {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/video_to_audio.py +0 -0
  23. {polytext-0.2.6 → polytext-0.2.7}/polytext/exceptions/__init__.py +0 -0
  24. {polytext-0.2.6 → polytext-0.2.7}/polytext/exceptions/base.py +0 -0
  25. {polytext-0.2.6 → polytext-0.2.7}/polytext/generator/__init__.py +0 -0
  26. {polytext-0.2.6 → polytext-0.2.7}/polytext/generator/pdf.py +0 -0
  27. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/__init__.py +0 -0
  28. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/audio.py +0 -0
  29. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/document.py +0 -0
  30. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/document_ocr.py +0 -0
  31. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/downloader/__init__.py +0 -0
  32. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/downloader/downloader.py +0 -0
  33. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/html.py +0 -0
  34. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/markdown.py +0 -0
  35. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/notebook.py +0 -0
  36. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/ocr.py +0 -0
  37. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/plain_text.py +0 -0
  38. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/video.py +0 -0
  39. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/xml_xbrl.py +0 -0
  40. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/youtube.py +0 -0
  41. {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/youtube_llm.py +0 -0
  42. {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/__init__.py +0 -0
  43. {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/audio_chunker.py +0 -0
  44. {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/text_merger.py +0 -0
  45. {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/transcript_chunker.py +0 -0
  46. {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/__init__.py +0 -0
  47. {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/beautiful_text.py +0 -0
  48. {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/ocr.py +0 -0
  49. {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/text_merging.py +0 -0
  50. {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/text_to_md.py +0 -0
  51. {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/transcription.py +0 -0
  52. {polytext-0.2.6 → polytext-0.2.7}/polytext/utils/__init__.py +0 -0
  53. {polytext-0.2.6 → polytext-0.2.7}/polytext/utils/utils.py +0 -0
  54. {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/SOURCES.txt +0 -0
  55. {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/dependency_links.txt +0 -0
  56. {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/not-zip-safe +0 -0
  57. {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/requires.txt +0 -0
  58. {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/top_level.txt +0 -0
  59. {polytext-0.2.6 → polytext-0.2.7}/pyproject.toml +0 -0
  60. {polytext-0.2.6 → polytext-0.2.7}/setup.cfg +0 -0
  61. {polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_chunker.py +0 -0
  62. {polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_comparison_helpers.py +0 -0
  63. {polytext-0.2.6 → polytext-0.2.7}/tests/test_base_loader_error_mapping.py +0 -0
  64. {polytext-0.2.6 → polytext-0.2.7}/tests/test_beautiful_text_manual.py +0 -0
  65. {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_audio_models.py +0 -0
  66. {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_document_ocr_to_text_models.py +0 -0
  67. {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_ocr_to_text_models.py +0 -0
  68. {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_youtube_models.py +0 -0
  69. {polytext-0.2.6 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube.py +0 -0
  70. {polytext-0.2.6 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  71. {polytext-0.2.6 → polytext-0.2.7}/tests/test_extracted_text_whitespace.py +0 -0
  72. {polytext-0.2.6 → polytext-0.2.7}/tests/test_gemini_quality_guards.py +0 -0
  73. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_audio_transcript_from_gcs.py +0 -0
  74. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  75. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_ocr.py +0 -0
  76. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_ocr_azure_oai.py +0 -0
  77. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_text.py +0 -0
  78. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_text_from_gcs.py +0 -0
  79. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_ocr_from_image.py +0 -0
  80. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_text_from_markdown.py +0 -0
  81. {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_video_transcript_from_gcs.py +0 -0
  82. {polytext-0.2.6 → polytext-0.2.7}/tests/test_library.py +0 -0
  83. {polytext-0.2.6 → polytext-0.2.7}/tests/test_markdown_loader_gzip.py +0 -0
  84. {polytext-0.2.6 → polytext-0.2.7}/tests/test_markitdown_html.py +0 -0
  85. {polytext-0.2.6 → polytext-0.2.7}/tests/test_notebook_loader.py +0 -0
  86. {polytext-0.2.6 → polytext-0.2.7}/tests/test_ocr_fallbacks.py +0 -0
  87. {polytext-0.2.6 → polytext-0.2.7}/tests/test_ocr_image_descriptions.py +0 -0
  88. {polytext-0.2.6 → polytext-0.2.7}/tests/test_pain_text.py +0 -0
  89. {polytext-0.2.6 → polytext-0.2.7}/tests/test_pdf_conversion_error.py +0 -0
  90. {polytext-0.2.6 → polytext-0.2.7}/tests/test_python_version_metadata.py +0 -0
  91. {polytext-0.2.6 → polytext-0.2.7}/tests/test_split_audio_with_llm.py +0 -0
  92. {polytext-0.2.6 → polytext-0.2.7}/tests/test_xml_xbrl_loader.py +0 -0
  93. {polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_gemini_minimal_check.py +0 -0
  94. {polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_llm_fallbacks.py +0 -0
  95. {polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_transcript.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -66,6 +66,29 @@ def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
66
66
  return cleaned_text, False
67
67
 
68
68
 
69
def add_line_break_after_each_sentence(text: str) -> str:
    """Put every sentence of *text* on its own line.

    Each input line is processed independently:

    * blank (or whitespace-only) lines are kept as empty lines;
    * Markdown ATX headings (``#`` to ``######`` followed by whitespace) are
      kept as-is, apart from stripping surrounding whitespace;
    * on every other line, runs of whitespace are collapsed to a single
      space and the whitespace following ``.``, ``!`` or ``?`` is replaced
      by a newline.

    A numeric ordered-list marker at the start of a line (``"1. "``) is not
    treated as the end of a sentence, so the marker stays attached to the
    text of its list item.

    Args:
        text: Text to format. Falsy values are returned unchanged.

    Returns:
        The formatted text with leading and trailing whitespace removed.
    """
    if not text:
        return text

    # Compile once instead of once per line.
    heading_pattern = re.compile(r"^#{1,6}\s+")
    list_marker_pattern = re.compile(r"^(\d+\.)\s+")
    whitespace_pattern = re.compile(r"\s+")
    sentence_end_pattern = re.compile(r"([.!?])\s+")

    formatted_lines = []
    for line in text.splitlines():
        stripped_line = line.strip()
        if not stripped_line:
            formatted_lines.append("")
            continue
        if heading_pattern.match(stripped_line):
            formatted_lines.append(stripped_line)
            continue

        normalized_line = whitespace_pattern.sub(" ", stripped_line)

        # The period of an ordered-list marker would otherwise match the
        # sentence pattern and separate "1." from the text of its item.
        prefix = ""
        marker = list_marker_pattern.match(normalized_line)
        if marker:
            prefix = marker.group(1) + " "
            normalized_line = normalized_line[marker.end():]

        formatted_lines.append(
            prefix + sentence_end_pattern.sub(r"\1\n", normalized_line)
        )

    return "\n".join(formatted_lines).strip()
90
+
91
+
69
92
  def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
70
93
  """
71
94
  Compress and convert an audio file to MP3 using ffmpeg.
@@ -434,6 +457,8 @@ class AudioToTextConverter:
434
457
  )
435
458
 
436
459
  response_text, marker_only = normalize_no_human_speech_marker(response_text)
460
+ if not marker_only:
461
+ response_text = self.format_audio_output_text(response_text)
437
462
 
438
463
  response_dict = {
439
464
  "transcript": "" if marker_only else response_text,
@@ -473,6 +498,9 @@ class AudioToTextConverter:
473
498
  transcript_dict = self.transcribe_audio(chunk["file_path"])
474
499
  return index, transcript_dict
475
500
 
501
def format_audio_output_text(self, text: str) -> str:
    """Return *text* as formatted by ``add_line_break_after_each_sentence``.

    Args:
        text: Transcript text to format.

    Returns:
        The value returned by ``add_line_break_after_each_sentence(text)``.
    """
    formatted_text = add_line_break_after_each_sentence(text)
    return formatted_text
503
+
476
504
  def transcribe_full_audio(self,
477
505
  audio_path: str, save_transcript_chunks: bool = False) -> dict:
478
506
  """
@@ -565,7 +593,7 @@ class AudioToTextConverter:
565
593
  full_text_merged_dict = text_merger.merge_chunks_with_llm_sequential(chunks=transcript_chunks)
566
594
 
567
595
  result_dict = {
568
- "text": full_text_merged_dict["full_text_merged"],
596
+ "text": self.format_audio_output_text(full_text_merged_dict["full_text_merged"]),
569
597
  "completion_tokens": completion_tokens + full_text_merged_dict["completion_tokens"],
570
598
  "prompt_tokens": prompt_tokens + full_text_merged_dict["prompt_tokens"],
571
599
  "completion_model": self.transcription_model,
@@ -586,7 +614,7 @@ class AudioToTextConverter:
586
614
  if key in chunk_results[0]:
587
615
  result_dict[key] = chunk_results[0][key]
588
616
  if save_transcript_chunks:
589
- result_dict["text_chunks"] = transcript_chunks
617
+ result_dict["text_chunks"] = [self.format_audio_output_text(chunk) for chunk in transcript_chunks]
590
618
  result_dict["chunk_results"] = chunk_results
591
619
 
592
620
  # Clean up temporary files
@@ -268,8 +268,6 @@ class BaseLoader:
268
268
  "type": raw_result.get("type", "text"),
269
269
  "input": input_list[0],
270
270
  }
271
- if "markdown_json" in cleanup_result:
272
- result_item["markdown_json"] = cleanup_result["markdown_json"]
273
271
  if "chapters" in cleanup_result:
274
272
  result_item["chapters"] = cleanup_result["chapters"]
275
273
 
@@ -284,8 +282,6 @@ class BaseLoader:
284
282
  "input": result_item["input"],
285
283
  "output_list": [result_item],
286
284
  }
287
- if "markdown_json" in result_item:
288
- response["markdown_json"] = result_item["markdown_json"]
289
285
  if "chapters" in result_item:
290
286
  response["chapters"] = result_item["chapters"]
291
287
  return response
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.6
3
+ Version: 0.2.7
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
51
51
 
52
52
  setup(
53
53
  name='polytext',
54
- version='0.2.6',
54
+ version='0.2.7',
55
55
  url='https://github.com/docsity/polytext',
56
56
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
57
57
  license='MIT',
@@ -1,5 +1,6 @@
1
1
  import unittest
2
2
  import tempfile
3
+ import os
3
4
  from types import SimpleNamespace
4
5
  from unittest.mock import MagicMock, patch
5
6
 
@@ -31,7 +32,11 @@ def _make_response(
31
32
 
32
33
 
33
34
  class _FakeFiles:
35
+ def __init__(self):
36
+ self.uploaded_files = []
37
+
34
38
  def upload(self, file):
39
+ self.uploaded_files.append(file)
35
40
  return SimpleNamespace(name="uploaded-audio")
36
41
 
37
42
  def delete(self, name):
@@ -115,6 +120,18 @@ class _ImmediateExecutor:
115
120
 
116
121
 
117
122
  class TestAudioTranscriptionModelMigration(unittest.TestCase):
123
def test_formats_audio_output_with_single_line_break_after_each_sentence(self):
    """Sentences end up on separate lines while the heading line is kept."""
    raw_text = "Prima frase. Seconda frase? Terza frase!\n## Titolo\nQuarta frase. Quinta frase."
    expected_text = "Prima frase.\nSeconda frase?\nTerza frase!\n## Titolo\nQuarta frase.\nQuinta frase."

    actual_text = AudioToTextConverter().format_audio_output_text(raw_text)

    self.assertEqual(actual_text, expected_text)
134
+
118
135
  def test_normalize_no_human_speech_marker_returns_empty_for_marker_only(self):
119
136
  cleaned_text, marker_only = normalize_no_human_speech_marker("no human speech detected")
120
137
 
@@ -217,6 +234,24 @@ class TestAudioTranscriptionModelMigration(unittest.TestCase):
217
234
  fake_client.models.generate_content_config.system_instruction,
218
235
  )
219
236
 
237
@patch("polytext.converter.audio_to_text.os.path.getsize", return_value=21 * 1024 * 1024)
@patch("polytext.converter.audio_to_text.os.path.isfile", return_value=True)
def test_large_audio_with_non_ascii_filename_uploads_ascii_safe_temp_copy(
    self,
    _mock_isfile,
    _mock_getsize,
):
    """Check the path handed to the upload API for a non-ASCII input name.

    With the reported file size mocked to 21 MiB, the first uploaded path
    must have an ASCII-only basename and keep the ``.aac`` extension.
    """
    client = _FakeClient()

    with patch("polytext.converter.audio_to_text.genai.Client", return_value=client):
        transcription = AudioToTextConverter().transcribe_audio("/tmp/mercoledi_\u00ec.aac")

    first_upload = client.files.uploaded_files[0]
    self.assertEqual(transcription["transcript"], "transcript")

    uploaded_name = os.path.basename(first_upload)
    self.assertTrue(uploaded_name.isascii())
    self.assertTrue(first_upload.endswith(".aac"))
254
+
220
255
  @patch("polytext.converter.audio_to_text.genai.Client")
221
256
  def test_custom_max_output_tokens_only_changes_generation_budget(self, mock_client_cls):
222
257
  fake_client = _FakeClient()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes