polytext 0.2.6__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {polytext-0.2.6 → polytext-0.2.7}/PKG-INFO +1 -1
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/audio_to_text.py +30 -2
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/base.py +0 -4
- {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/PKG-INFO +1 -1
- {polytext-0.2.6 → polytext-0.2.7}/setup.py +1 -1
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_transcription_model_migration.py +35 -0
- {polytext-0.2.6 → polytext-0.2.7}/LICENSE +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/README.md +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/base.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/beautiful_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/document_ocr_to_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/gemini_quality_guards.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/html_to_md.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/md_to_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/ocr_to_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/pdf.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/text_to_md.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/converter/video_to_audio.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/exceptions/base.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/generator/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/generator/pdf.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/audio.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/document.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/document_ocr.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/downloader/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/downloader/downloader.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/html.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/markdown.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/notebook.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/ocr.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/plain_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/video.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/xml_xbrl.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/youtube.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/loader/youtube_llm.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/audio_chunker.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/text_merger.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/processor/transcript_chunker.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/beautiful_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/ocr.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/text_merging.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/text_to_md.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/prompts/transcription.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/utils/__init__.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext/utils/utils.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/SOURCES.txt +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/requires.txt +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/pyproject.toml +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/setup.cfg +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_chunker.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_audio_comparison_helpers.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_base_loader_error_mapping.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_beautiful_text_manual.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_audio_models.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_document_ocr_to_text_models.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_ocr_to_text_models.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_compare_youtube_models.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_extracted_text_whitespace.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_gemini_quality_guards.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_audio_transcript_from_gcs.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_customized_pdf_from_markdown.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_ocr.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_ocr_azure_oai.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_document_text_from_gcs.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_ocr_from_image.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_text_from_markdown.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_get_video_transcript_from_gcs.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_library.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_markdown_loader_gzip.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_markitdown_html.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_notebook_loader.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_ocr_fallbacks.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_ocr_image_descriptions.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_pain_text.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_pdf_conversion_error.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_python_version_metadata.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_split_audio_with_llm.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_xml_xbrl_loader.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_gemini_minimal_check.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_llm_fallbacks.py +0 -0
- {polytext-0.2.6 → polytext-0.2.7}/tests/test_youtube_transcript.py +0 -0
|
@@ -66,6 +66,29 @@ def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
|
|
|
66
66
|
return cleaned_text, False
|
|
67
67
|
|
|
68
68
|
|
|
69
|
+
def add_line_break_after_each_sentence(text: str) -> str:
    """
    Put every sentence of a transcript on its own line.

    The text is processed line by line:
      * blank (or whitespace-only) lines are kept as empty lines;
      * Markdown ATX headings (1-6 ``#`` followed by whitespace) are kept
        as-is, apart from stripping surrounding whitespace;
      * on any other line, runs of whitespace are collapsed to one space and
        the whitespace following ``.``, ``!`` or ``?`` is replaced with a
        single newline.

    A leading ordered-list marker such as ``"1. "`` is not treated as the end
    of a sentence, so ``"1. First. Second."`` becomes ``"1. First.\\nSecond."``
    rather than ``"1.\\nFirst.\\nSecond."``.

    Args:
        text: Transcript text, possibly containing Markdown headings.

    Returns:
        The reformatted text with leading/trailing whitespace removed. Falsy
        input (empty string or None) is returned unchanged.
    """
    if not text:
        return text

    formatted_lines = []

    for line in text.splitlines():
        stripped_line = line.strip()
        if not stripped_line:
            formatted_lines.append("")
            continue
        if re.match(r"^#{1,6}\s+", stripped_line):
            formatted_lines.append(stripped_line)
            continue

        normalized_line = re.sub(r"\s+", " ", stripped_line)

        # Keep an ordered-list marker ("1. ", "12. ") glued to its item text;
        # splitting after the marker's period would break the Markdown list.
        marker_match = re.match(r"^\d{1,3}\. ", normalized_line)
        list_marker = marker_match.group(0) if marker_match else ""
        sentence_text = normalized_line[len(list_marker):]

        sentence_text = re.sub(r"([.!?])\s+", r"\1\n", sentence_text)
        formatted_lines.append(list_marker + sentence_text)

    return "\n".join(formatted_lines).strip()
|
|
90
|
+
|
|
91
|
+
|
|
69
92
|
def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
|
|
70
93
|
"""
|
|
71
94
|
Compress and convert an audio file to MP3 using ffmpeg.
|
|
@@ -434,6 +457,8 @@ class AudioToTextConverter:
|
|
|
434
457
|
)
|
|
435
458
|
|
|
436
459
|
response_text, marker_only = normalize_no_human_speech_marker(response_text)
|
|
460
|
+
if not marker_only:
|
|
461
|
+
response_text = self.format_audio_output_text(response_text)
|
|
437
462
|
|
|
438
463
|
response_dict = {
|
|
439
464
|
"transcript": "" if marker_only else response_text,
|
|
@@ -473,6 +498,9 @@ class AudioToTextConverter:
|
|
|
473
498
|
transcript_dict = self.transcribe_audio(chunk["file_path"])
|
|
474
499
|
return index, transcript_dict
|
|
475
500
|
|
|
501
|
+
def format_audio_output_text(self, text: str) -> str:
    """
    Format transcript text for output.

    Passes ``text`` to ``add_line_break_after_each_sentence`` and returns
    that function's result unchanged.
    """
    formatted_text = add_line_break_after_each_sentence(text)
    return formatted_text
|
|
503
|
+
|
|
476
504
|
def transcribe_full_audio(self,
|
|
477
505
|
audio_path: str, save_transcript_chunks: bool = False) -> dict:
|
|
478
506
|
"""
|
|
@@ -565,7 +593,7 @@ class AudioToTextConverter:
|
|
|
565
593
|
full_text_merged_dict = text_merger.merge_chunks_with_llm_sequential(chunks=transcript_chunks)
|
|
566
594
|
|
|
567
595
|
result_dict = {
|
|
568
|
-
"text": full_text_merged_dict["full_text_merged"],
|
|
596
|
+
"text": self.format_audio_output_text(full_text_merged_dict["full_text_merged"]),
|
|
569
597
|
"completion_tokens": completion_tokens + full_text_merged_dict["completion_tokens"],
|
|
570
598
|
"prompt_tokens": prompt_tokens + full_text_merged_dict["prompt_tokens"],
|
|
571
599
|
"completion_model": self.transcription_model,
|
|
@@ -586,7 +614,7 @@ class AudioToTextConverter:
|
|
|
586
614
|
if key in chunk_results[0]:
|
|
587
615
|
result_dict[key] = chunk_results[0][key]
|
|
588
616
|
if save_transcript_chunks:
|
|
589
|
-
result_dict["text_chunks"] = transcript_chunks
|
|
617
|
+
result_dict["text_chunks"] = [self.format_audio_output_text(chunk) for chunk in transcript_chunks]
|
|
590
618
|
result_dict["chunk_results"] = chunk_results
|
|
591
619
|
|
|
592
620
|
# Clean up temporary files
|
|
@@ -268,8 +268,6 @@ class BaseLoader:
|
|
|
268
268
|
"type": raw_result.get("type", "text"),
|
|
269
269
|
"input": input_list[0],
|
|
270
270
|
}
|
|
271
|
-
if "markdown_json" in cleanup_result:
|
|
272
|
-
result_item["markdown_json"] = cleanup_result["markdown_json"]
|
|
273
271
|
if "chapters" in cleanup_result:
|
|
274
272
|
result_item["chapters"] = cleanup_result["chapters"]
|
|
275
273
|
|
|
@@ -284,8 +282,6 @@ class BaseLoader:
|
|
|
284
282
|
"input": result_item["input"],
|
|
285
283
|
"output_list": [result_item],
|
|
286
284
|
}
|
|
287
|
-
if "markdown_json" in result_item:
|
|
288
|
-
response["markdown_json"] = result_item["markdown_json"]
|
|
289
285
|
if "chapters" in result_item:
|
|
290
286
|
response["chapters"] = result_item["chapters"]
|
|
291
287
|
return response
|
|
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
|
|
|
51
51
|
|
|
52
52
|
setup(
|
|
53
53
|
name='polytext',
|
|
54
|
-
version='0.2.6',
|
|
54
|
+
version='0.2.7',
|
|
55
55
|
url='https://github.com/docsity/polytext',
|
|
56
56
|
# download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
|
|
57
57
|
license='MIT',
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import unittest
|
|
2
2
|
import tempfile
|
|
3
|
+
import os
|
|
3
4
|
from types import SimpleNamespace
|
|
4
5
|
from unittest.mock import MagicMock, patch
|
|
5
6
|
|
|
@@ -31,7 +32,11 @@ def _make_response(
|
|
|
31
32
|
|
|
32
33
|
|
|
33
34
|
class _FakeFiles:
|
|
35
|
+
def __init__(self):
|
|
36
|
+
self.uploaded_files = []
|
|
37
|
+
|
|
34
38
|
def upload(self, file):
|
|
39
|
+
self.uploaded_files.append(file)
|
|
35
40
|
return SimpleNamespace(name="uploaded-audio")
|
|
36
41
|
|
|
37
42
|
def delete(self, name):
|
|
@@ -115,6 +120,18 @@ class _ImmediateExecutor:
|
|
|
115
120
|
|
|
116
121
|
|
|
117
122
|
class TestAudioTranscriptionModelMigration(unittest.TestCase):
|
|
123
|
+
def test_formats_audio_output_with_single_line_break_after_each_sentence(self):
    # Two prose lines around a Markdown heading: each sentence should end up
    # on its own line while the heading line is left untouched.
    raw_text = "Prima frase. Seconda frase? Terza frase!\n## Titolo\nQuarta frase. Quinta frase."
    expected_text = "Prima frase.\nSeconda frase?\nTerza frase!\n## Titolo\nQuarta frase.\nQuinta frase."

    actual_text = AudioToTextConverter().format_audio_output_text(raw_text)

    self.assertEqual(actual_text, expected_text)
|
|
134
|
+
|
|
118
135
|
def test_normalize_no_human_speech_marker_returns_empty_for_marker_only(self):
|
|
119
136
|
cleaned_text, marker_only = normalize_no_human_speech_marker("no human speech detected")
|
|
120
137
|
|
|
@@ -217,6 +234,24 @@ class TestAudioTranscriptionModelMigration(unittest.TestCase):
|
|
|
217
234
|
fake_client.models.generate_content_config.system_instruction,
|
|
218
235
|
)
|
|
219
236
|
|
|
237
|
+
@patch("polytext.converter.audio_to_text.os.path.getsize", return_value=21 * 1024 * 1024)
@patch("polytext.converter.audio_to_text.os.path.isfile", return_value=True)
def test_large_audio_with_non_ascii_filename_uploads_ascii_safe_temp_copy(
    self,
    _mock_isfile,
    _mock_getsize,
):
    # The file is reported as existing with a size of 21 * 1024 * 1024 bytes,
    # and its name contains a non-ASCII character (U+00EC).
    client_stub = _FakeClient()
    source_path = "/tmp/mercoledi_\u00ec.aac"

    with patch("polytext.converter.audio_to_text.genai.Client", return_value=client_stub):
        result = AudioToTextConverter().transcribe_audio(source_path)

    # Only the first recorded upload is inspected.
    first_upload = client_stub.files.uploaded_files[0]
    self.assertEqual(result["transcript"], "transcript")
    self.assertTrue(os.path.basename(first_upload).isascii())
    self.assertTrue(first_upload.endswith(".aac"))
|
|
254
|
+
|
|
220
255
|
@patch("polytext.converter.audio_to_text.genai.Client")
|
|
221
256
|
def test_custom_max_output_tokens_only_changes_generation_budget(self, mock_client_cls):
|
|
222
257
|
fake_client = _FakeClient()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|