PyPI - polytext - Versions diffs - 0.2.6__tar.gz → 0.2.7__tar.gz - Mend

polytext 0.2.6.tar.gz → 0.2.7.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (95)

{polytext-0.2.6 → polytext-0.2.7}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: polytext
-Version: 0.2.6
+Version: 0.2.7
 Summary: Python utilities to simplify document files management
 Home-page: https://github.com/docsity/polytext
 Author: Matteo Senardi

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/audio_to_text.py RENAMED Viewed

@@ -66,6 +66,29 @@ def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
     return cleaned_text, False
+def add_line_break_after_each_sentence(text: str) -> str:
+    if not text:
+        return text
+    lines = text.splitlines()
+    formatted_lines = []
+    for line in lines:
+        stripped_line = line.strip()
+        if not stripped_line:
+            formatted_lines.append("")
+            continue
+        if re.match(r"^#{1,6}\s+", stripped_line):
+            formatted_lines.append(stripped_line)
+            continue
+        normalized_line = re.sub(r"\s+", " ", stripped_line)
+        normalized_line = re.sub(r"([.!?])\s+", r"\1\n", normalized_line)
+        formatted_lines.append(normalized_line)
+    return "\n".join(formatted_lines).strip()
 def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
     """
     Compress and convert an audio file to MP3 using ffmpeg.
@@ -434,6 +457,8 @@ class AudioToTextConverter:
                 )
             response_text, marker_only = normalize_no_human_speech_marker(response_text)
+            if not marker_only:
+                response_text = self.format_audio_output_text(response_text)
             response_dict = {
                 "transcript": "" if marker_only else response_text,
@@ -473,6 +498,9 @@ class AudioToTextConverter:
         transcript_dict = self.transcribe_audio(chunk["file_path"])
         return index, transcript_dict
+    def format_audio_output_text(self, text: str) -> str:
+        return add_line_break_after_each_sentence(text)
     def transcribe_full_audio(self,
             audio_path: str, save_transcript_chunks: bool = False) -> dict:
         """
@@ -565,7 +593,7 @@ class AudioToTextConverter:
         full_text_merged_dict = text_merger.merge_chunks_with_llm_sequential(chunks=transcript_chunks)
         result_dict = {
-            "text": full_text_merged_dict["full_text_merged"],
+            "text": self.format_audio_output_text(full_text_merged_dict["full_text_merged"]),
             "completion_tokens": completion_tokens + full_text_merged_dict["completion_tokens"],
             "prompt_tokens": prompt_tokens + full_text_merged_dict["prompt_tokens"],
             "completion_model": self.transcription_model,
@@ -586,7 +614,7 @@ class AudioToTextConverter:
                 if key in chunk_results[0]:
                     result_dict[key] = chunk_results[0][key]
         if save_transcript_chunks:
-            result_dict["text_chunks"] = transcript_chunks
+            result_dict["text_chunks"] = [self.format_audio_output_text(chunk) for chunk in transcript_chunks]
             result_dict["chunk_results"] = chunk_results
         # Clean up temporary files

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/base.py RENAMED Viewed

@@ -268,8 +268,6 @@ class BaseLoader:
             "type": raw_result.get("type", "text"),
             "input": input_list[0],
         }
-        if "markdown_json" in cleanup_result:
-            result_item["markdown_json"] = cleanup_result["markdown_json"]
         if "chapters" in cleanup_result:
             result_item["chapters"] = cleanup_result["chapters"]
@@ -284,8 +282,6 @@ class BaseLoader:
             "input": result_item["input"],
             "output_list": [result_item],
         }
-        if "markdown_json" in result_item:
-            response["markdown_json"] = result_item["markdown_json"]
         if "chapters" in result_item:
             response["chapters"] = result_item["chapters"]
         return response

{polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: polytext
-Version: 0.2.6
+Version: 0.2.7
 Summary: Python utilities to simplify document files management
 Home-page: https://github.com/docsity/polytext
 Author: Matteo Senardi

{polytext-0.2.6 → polytext-0.2.7}/setup.py RENAMED Viewed

@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
 setup(
     name='polytext',
-    version='0.2.6',
+    version='0.2.7',
     url='https://github.com/docsity/polytext',
     # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
     license='MIT',

{polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_transcription_model_migration.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import unittest
 import tempfile
+import os
 from types import SimpleNamespace
 from unittest.mock import MagicMock, patch
@@ -31,7 +32,11 @@ def _make_response(
 class _FakeFiles:
+    def __init__(self):
+        self.uploaded_files = []
     def upload(self, file):
+        self.uploaded_files.append(file)
         return SimpleNamespace(name="uploaded-audio")
     def delete(self, name):
@@ -115,6 +120,18 @@ class _ImmediateExecutor:
 class TestAudioTranscriptionModelMigration(unittest.TestCase):
+    def test_formats_audio_output_with_single_line_break_after_each_sentence(self):
+        converter = AudioToTextConverter()
+        formatted = converter.format_audio_output_text(
+            "Prima frase. Seconda frase? Terza frase!\n## Titolo\nQuarta frase. Quinta frase."
+        )
+        self.assertEqual(
+            formatted,
+            "Prima frase.\nSeconda frase?\nTerza frase!\n## Titolo\nQuarta frase.\nQuinta frase.",
+        )
     def test_normalize_no_human_speech_marker_returns_empty_for_marker_only(self):
         cleaned_text, marker_only = normalize_no_human_speech_marker("no human speech detected")
@@ -217,6 +234,24 @@ class TestAudioTranscriptionModelMigration(unittest.TestCase):
             fake_client.models.generate_content_config.system_instruction,
         )
+    @patch("polytext.converter.audio_to_text.os.path.getsize", return_value=21 * 1024 * 1024)
+    @patch("polytext.converter.audio_to_text.os.path.isfile", return_value=True)
+    def test_large_audio_with_non_ascii_filename_uploads_ascii_safe_temp_copy(
+        self,
+        _mock_isfile,
+        _mock_getsize,
+    ):
+        fake_client = _FakeClient()
+        with patch("polytext.converter.audio_to_text.genai.Client", return_value=fake_client):
+            converter = AudioToTextConverter()
+            result = converter.transcribe_audio("/tmp/mercoledi_\u00ec.aac")
+        uploaded_path = fake_client.files.uploaded_files[0]
+        self.assertEqual(result["transcript"], "transcript")
+        self.assertTrue(os.path.basename(uploaded_path).isascii())
+        self.assertTrue(uploaded_path.endswith(".aac"))
     @patch("polytext.converter.audio_to_text.genai.Client")
     def test_custom_max_output_tokens_only_changes_generation_budget(self, mock_client_cls):
         fake_client = _FakeClient()

{polytext-0.2.6 → polytext-0.2.7}/LICENSE RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/README.md RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/base.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/beautiful_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/document_ocr_to_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/document_ocr_to_text_azure_oai.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/gemini_quality_guards.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/html_to_md.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/md_to_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/ocr_to_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/ocr_to_text_azure_oai.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/pdf.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/text_to_md.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/converter/video_to_audio.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/exceptions/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/exceptions/base.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/generator/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/generator/pdf.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/audio.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/document.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/document_ocr.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/downloader/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/downloader/downloader.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/html.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/markdown.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/notebook.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/ocr.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/plain_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/video.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/xml_xbrl.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/youtube.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/loader/youtube_llm.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/processor/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/processor/audio_chunker.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/processor/text_merger.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/processor/transcript_chunker.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/beautiful_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/ocr.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/text_merging.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/text_to_md.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/transcription.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/utils/__init__.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext/utils/utils.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/not-zip-safe RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/requires.txt RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/top_level.txt RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/pyproject.toml RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/setup.cfg RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_chunker.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_comparison_helpers.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_base_loader_error_mapping.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_beautiful_text_manual.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_audio_models.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_document_ocr_to_text_models.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_ocr_to_text_models.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_youtube_models.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube_helpers.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_extracted_text_whitespace.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_gemini_quality_guards.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_audio_transcript_from_gcs.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_customized_pdf_from_markdown.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_ocr.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_ocr_azure_oai.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_text_from_gcs.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_ocr_from_image.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_text_from_markdown.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_get_video_transcript_from_gcs.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_library.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_markdown_loader_gzip.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_markitdown_html.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_notebook_loader.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_ocr_fallbacks.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_ocr_image_descriptions.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_pain_text.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_pdf_conversion_error.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_python_version_metadata.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_split_audio_with_llm.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_xml_xbrl_loader.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_gemini_minimal_check.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_llm_fallbacks.py RENAMED Viewed

File without changes

{polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_transcript.py RENAMED Viewed

File without changes

polytext 0.2.6__tar.gz → 0.2.7__tar.gz

polytext 0.2.6.tar.gz → 0.2.7.tar.gz