polytext 0.2.5__tar.gz → 0.2.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. {polytext-0.2.5 → polytext-0.2.7}/PKG-INFO +2 -1
  2. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/audio_to_text.py +30 -2
  3. polytext-0.2.7/polytext/converter/beautiful_text.py +209 -0
  4. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/base.py +159 -0
  5. polytext-0.2.7/polytext/prompts/beautiful_text.py +43 -0
  6. {polytext-0.2.5 → polytext-0.2.7}/polytext.egg-info/PKG-INFO +2 -1
  7. {polytext-0.2.5 → polytext-0.2.7}/polytext.egg-info/SOURCES.txt +3 -0
  8. {polytext-0.2.5 → polytext-0.2.7}/polytext.egg-info/requires.txt +1 -0
  9. {polytext-0.2.5 → polytext-0.2.7}/setup.py +1 -1
  10. {polytext-0.2.5 → polytext-0.2.7}/tests/test_audio_transcription_model_migration.py +35 -0
  11. polytext-0.2.7/tests/test_beautiful_text_manual.py +68 -0
  12. {polytext-0.2.5 → polytext-0.2.7}/LICENSE +0 -0
  13. {polytext-0.2.5 → polytext-0.2.7}/README.md +0 -0
  14. {polytext-0.2.5 → polytext-0.2.7}/polytext/__init__.py +0 -0
  15. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/__init__.py +0 -0
  16. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/base.py +0 -0
  17. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/document_ocr_to_text.py +0 -0
  18. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/document_ocr_to_text_azure_oai.py +0 -0
  19. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/gemini_quality_guards.py +0 -0
  20. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/html_to_md.py +0 -0
  21. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/md_to_text.py +0 -0
  22. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/ocr_to_text.py +0 -0
  23. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/ocr_to_text_azure_oai.py +0 -0
  24. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/pdf.py +0 -0
  25. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/text_to_md.py +0 -0
  26. {polytext-0.2.5 → polytext-0.2.7}/polytext/converter/video_to_audio.py +0 -0
  27. {polytext-0.2.5 → polytext-0.2.7}/polytext/exceptions/__init__.py +0 -0
  28. {polytext-0.2.5 → polytext-0.2.7}/polytext/exceptions/base.py +0 -0
  29. {polytext-0.2.5 → polytext-0.2.7}/polytext/generator/__init__.py +0 -0
  30. {polytext-0.2.5 → polytext-0.2.7}/polytext/generator/pdf.py +0 -0
  31. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/__init__.py +0 -0
  32. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/audio.py +0 -0
  33. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/document.py +0 -0
  34. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/document_ocr.py +0 -0
  35. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/downloader/__init__.py +0 -0
  36. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/downloader/downloader.py +0 -0
  37. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/html.py +0 -0
  38. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/markdown.py +0 -0
  39. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/notebook.py +0 -0
  40. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/ocr.py +0 -0
  41. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/plain_text.py +0 -0
  42. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/video.py +0 -0
  43. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/xml_xbrl.py +0 -0
  44. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/youtube.py +0 -0
  45. {polytext-0.2.5 → polytext-0.2.7}/polytext/loader/youtube_llm.py +0 -0
  46. {polytext-0.2.5 → polytext-0.2.7}/polytext/processor/__init__.py +0 -0
  47. {polytext-0.2.5 → polytext-0.2.7}/polytext/processor/audio_chunker.py +0 -0
  48. {polytext-0.2.5 → polytext-0.2.7}/polytext/processor/text_merger.py +0 -0
  49. {polytext-0.2.5 → polytext-0.2.7}/polytext/processor/transcript_chunker.py +0 -0
  50. {polytext-0.2.5 → polytext-0.2.7}/polytext/prompts/__init__.py +0 -0
  51. {polytext-0.2.5 → polytext-0.2.7}/polytext/prompts/ocr.py +0 -0
  52. {polytext-0.2.5 → polytext-0.2.7}/polytext/prompts/text_merging.py +0 -0
  53. {polytext-0.2.5 → polytext-0.2.7}/polytext/prompts/text_to_md.py +0 -0
  54. {polytext-0.2.5 → polytext-0.2.7}/polytext/prompts/transcription.py +0 -0
  55. {polytext-0.2.5 → polytext-0.2.7}/polytext/utils/__init__.py +0 -0
  56. {polytext-0.2.5 → polytext-0.2.7}/polytext/utils/utils.py +0 -0
  57. {polytext-0.2.5 → polytext-0.2.7}/polytext.egg-info/dependency_links.txt +0 -0
  58. {polytext-0.2.5 → polytext-0.2.7}/polytext.egg-info/not-zip-safe +0 -0
  59. {polytext-0.2.5 → polytext-0.2.7}/polytext.egg-info/top_level.txt +0 -0
  60. {polytext-0.2.5 → polytext-0.2.7}/pyproject.toml +0 -0
  61. {polytext-0.2.5 → polytext-0.2.7}/setup.cfg +0 -0
  62. {polytext-0.2.5 → polytext-0.2.7}/tests/test_audio_chunker.py +0 -0
  63. {polytext-0.2.5 → polytext-0.2.7}/tests/test_audio_comparison_helpers.py +0 -0
  64. {polytext-0.2.5 → polytext-0.2.7}/tests/test_base_loader_error_mapping.py +0 -0
  65. {polytext-0.2.5 → polytext-0.2.7}/tests/test_compare_audio_models.py +0 -0
  66. {polytext-0.2.5 → polytext-0.2.7}/tests/test_compare_document_ocr_to_text_models.py +0 -0
  67. {polytext-0.2.5 → polytext-0.2.7}/tests/test_compare_ocr_to_text_models.py +0 -0
  68. {polytext-0.2.5 → polytext-0.2.7}/tests/test_compare_youtube_models.py +0 -0
  69. {polytext-0.2.5 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube.py +0 -0
  70. {polytext-0.2.5 → polytext-0.2.7}/tests/test_dowload_audio_from_youtube_helpers.py +0 -0
  71. {polytext-0.2.5 → polytext-0.2.7}/tests/test_extracted_text_whitespace.py +0 -0
  72. {polytext-0.2.5 → polytext-0.2.7}/tests/test_gemini_quality_guards.py +0 -0
  73. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_audio_transcript_from_gcs.py +0 -0
  74. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_customized_pdf_from_markdown.py +0 -0
  75. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_document_ocr.py +0 -0
  76. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_document_ocr_azure_oai.py +0 -0
  77. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_document_text.py +0 -0
  78. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_document_text_from_gcs.py +0 -0
  79. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_ocr_from_image.py +0 -0
  80. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_text_from_markdown.py +0 -0
  81. {polytext-0.2.5 → polytext-0.2.7}/tests/test_get_video_transcript_from_gcs.py +0 -0
  82. {polytext-0.2.5 → polytext-0.2.7}/tests/test_library.py +0 -0
  83. {polytext-0.2.5 → polytext-0.2.7}/tests/test_markdown_loader_gzip.py +0 -0
  84. {polytext-0.2.5 → polytext-0.2.7}/tests/test_markitdown_html.py +0 -0
  85. {polytext-0.2.5 → polytext-0.2.7}/tests/test_notebook_loader.py +0 -0
  86. {polytext-0.2.5 → polytext-0.2.7}/tests/test_ocr_fallbacks.py +0 -0
  87. {polytext-0.2.5 → polytext-0.2.7}/tests/test_ocr_image_descriptions.py +0 -0
  88. {polytext-0.2.5 → polytext-0.2.7}/tests/test_pain_text.py +0 -0
  89. {polytext-0.2.5 → polytext-0.2.7}/tests/test_pdf_conversion_error.py +0 -0
  90. {polytext-0.2.5 → polytext-0.2.7}/tests/test_python_version_metadata.py +0 -0
  91. {polytext-0.2.5 → polytext-0.2.7}/tests/test_split_audio_with_llm.py +0 -0
  92. {polytext-0.2.5 → polytext-0.2.7}/tests/test_xml_xbrl_loader.py +0 -0
  93. {polytext-0.2.5 → polytext-0.2.7}/tests/test_youtube_gemini_minimal_check.py +0 -0
  94. {polytext-0.2.5 → polytext-0.2.7}/tests/test_youtube_llm_fallbacks.py +0 -0
  95. {polytext-0.2.5 → polytext-0.2.7}/tests/test_youtube_transcript.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -21,6 +21,7 @@ Requires-Dist: PyMuPDF>=1.25.5
21
21
  Requires-Dist: pycryptodome==3.23.0
22
22
  Requires-Dist: weasyprint==65.1
23
23
  Requires-Dist: markdown==3.8
24
+ Requires-Dist: markdown-to-json==2.1.2
24
25
  Requires-Dist: python-docx==1.1.2
25
26
  Requires-Dist: google-api-core>=2.24.2
26
27
  Requires-Dist: google-cloud-storage<3.0.0,>=2.17
@@ -66,6 +66,29 @@ def normalize_no_human_speech_marker(text: str) -> tuple[str, bool]:
66
66
  return cleaned_text, False
67
67
 
68
68
 
69
def add_line_break_after_each_sentence(text: str) -> str:
    """Put every sentence of *text* on its own line.

    Each input line is handled independently:

    * blank lines are kept as empty lines;
    * Markdown headings (``#`` to ``######`` followed by whitespace) are
      kept on a single line, only stripped of surrounding whitespace;
    * any other line has its internal whitespace collapsed to single
      spaces and a newline inserted after every ``.``, ``!`` or ``?`` that
      is followed by whitespace.

    A leading ordered-list marker such as ``"1. "`` is not treated as a
    sentence end, so numbered Markdown lists stay intact.

    Args:
        text: The text to reformat. Falsy values (``""``, ``None``) are
            returned unchanged.

    Returns:
        The reformatted text, stripped of leading and trailing whitespace.
    """
    if not text:
        return text

    formatted_lines = []

    for line in text.splitlines():
        stripped_line = line.strip()
        if not stripped_line:
            formatted_lines.append("")
            continue
        if re.match(r"^#{1,6}\s+", stripped_line):
            formatted_lines.append(stripped_line)
            continue

        normalized_line = re.sub(r"\s+", " ", stripped_line)

        # Keep "1. " style list markers attached to their item: the period
        # of the marker is not the end of a sentence.
        marker_match = re.match(r"^\d+\.\s", normalized_line)
        list_marker = marker_match.group(0) if marker_match else ""
        sentence_part = normalized_line[len(list_marker):]

        formatted_lines.append(
            list_marker + re.sub(r"([.!?])\s+", r"\1\n", sentence_part)
        )

    return "\n".join(formatted_lines).strip()
90
+
91
+
69
92
  def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
70
93
  """
71
94
  Compress and convert an audio file to MP3 using ffmpeg.
@@ -434,6 +457,8 @@ class AudioToTextConverter:
434
457
  )
435
458
 
436
459
  response_text, marker_only = normalize_no_human_speech_marker(response_text)
460
+ if not marker_only:
461
+ response_text = self.format_audio_output_text(response_text)
437
462
 
438
463
  response_dict = {
439
464
  "transcript": "" if marker_only else response_text,
@@ -473,6 +498,9 @@ class AudioToTextConverter:
473
498
  transcript_dict = self.transcribe_audio(chunk["file_path"])
474
499
  return index, transcript_dict
475
500
 
501
def format_audio_output_text(self, text: str) -> str:
    """Return *text* reformatted by ``add_line_break_after_each_sentence``.

    Kept as a method so the output formatting can be called (and
    overridden) on the converter instance.
    """
    formatted_text = add_line_break_after_each_sentence(text)
    return formatted_text
503
+
476
504
  def transcribe_full_audio(self,
477
505
  audio_path: str, save_transcript_chunks: bool = False) -> dict:
478
506
  """
@@ -565,7 +593,7 @@ class AudioToTextConverter:
565
593
  full_text_merged_dict = text_merger.merge_chunks_with_llm_sequential(chunks=transcript_chunks)
566
594
 
567
595
  result_dict = {
568
- "text": full_text_merged_dict["full_text_merged"],
596
+ "text": self.format_audio_output_text(full_text_merged_dict["full_text_merged"]),
569
597
  "completion_tokens": completion_tokens + full_text_merged_dict["completion_tokens"],
570
598
  "prompt_tokens": prompt_tokens + full_text_merged_dict["prompt_tokens"],
571
599
  "completion_model": self.transcription_model,
@@ -586,7 +614,7 @@ class AudioToTextConverter:
586
614
  if key in chunk_results[0]:
587
615
  result_dict[key] = chunk_results[0][key]
588
616
  if save_transcript_chunks:
589
- result_dict["text_chunks"] = transcript_chunks
617
+ result_dict["text_chunks"] = [self.format_audio_output_text(chunk) for chunk in transcript_chunks]
590
618
  result_dict["chunk_results"] = chunk_results
591
619
 
592
620
  # Clean up temporary files
@@ -0,0 +1,209 @@
1
+ import logging
2
+ import re
3
+ import time
4
+ from importlib import import_module
5
+
6
+ from google import genai
7
+ from google.genai import types
8
+ from google.api_core import exceptions as google_exceptions
9
+ from retry import retry
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+
12
+ from polytext.processor.transcript_chunker import TranscriptChunker
13
+ from polytext.processor.text_merger import TextMerger
14
+ from polytext.prompts.beautiful_text import BEAUTIFUL_TEXT_PROMPT
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class BeautifulTextConverter:
    """Clean raw text into Markdown with an LLM, one chunk at a time.

    The input is split with ``TranscriptChunker``, every chunk is sent to the
    model together with ``BEAUTIFUL_TEXT_PROMPT``, and the cleaned chunks are
    merged back with ``TextMerger``. Optionally the merged Markdown is also
    returned as a JSON-like dict and as a tree of chapters.
    """

    def __init__(
        self,
        llm_api_key: str = None,
        model: str = "gemini-3.1-flash-lite",
        model_provider: str = "google",
        max_llm_tokens: int = 8000,
        prompt_overhead: int = 1800,
        tokens_per_char: float = 0.25,
        overlap_chars: int = 800,
    ) -> None:
        """Store the model and chunking configuration.

        Args:
            llm_api_key: API key passed to ``genai.Client``; when falsy the
                client is created without an explicit key.
            model: Model name sent with every generation request.
            model_provider: Provider label reported in the result dict.
            max_llm_tokens: Forwarded to ``TranscriptChunker``.
            prompt_overhead: Forwarded to ``TranscriptChunker``.
            tokens_per_char: Forwarded to ``TranscriptChunker``.
            overlap_chars: Forwarded to ``TranscriptChunker``.
        """
        self.llm_api_key = llm_api_key
        self.model = model
        self.model_provider = model_provider
        self.max_llm_tokens = max_llm_tokens
        self.prompt_overhead = prompt_overhead
        self.tokens_per_char = tokens_per_char
        self.overlap_chars = overlap_chars

    def get_client(self):
        """Return a ``genai.Client``, using the stored API key when set."""
        return genai.Client(api_key=self.llm_api_key) if self.llm_api_key else genai.Client()

    def chunk_raw_text(self, raw_text: str) -> list[dict]:
        """Split *raw_text* into chunks using the configured limits.

        Returns:
            Whatever ``TranscriptChunker.chunk_transcript`` returns;
            ``convert`` reads the ``"text"`` and ``"index"`` keys of each item.
        """
        chunker = TranscriptChunker(
            transcript=raw_text,
            max_llm_tokens=self.max_llm_tokens,
            prompt_overhead=self.prompt_overhead,
            tokens_per_char=self.tokens_per_char,
            overlap_chars=self.overlap_chars,
        )
        return chunker.chunk_transcript()

    # NOTE(review): these are google.api_core exception types; confirm that the
    # google-genai client actually raises them (it may raise google.genai.errors
    # types instead), otherwise the retry never triggers.
    @retry(
        (
            google_exceptions.DeadlineExceeded,
            google_exceptions.ResourceExhausted,
            google_exceptions.ServiceUnavailable,
            google_exceptions.InternalServerError,
        ),
        tries=5,
        delay=2,
        backoff=2,
        logger=logger,
    )
    def process_chunk(self, client, chunk_text: str, index: int) -> dict:
        """Send one chunk to the model and return its cleaned text and usage.

        Args:
            client: Client exposing ``models.generate_content``.
            chunk_text: Raw text of the chunk.
            index: Zero-based chunk index, used only for logging.

        Returns:
            Dict with ``"transcript"`` (str), ``"completion_tokens"`` (int)
            and ``"prompt_tokens"`` (int). Missing text or token counts in
            the response are reported as ``""`` and ``0`` so the caller can
            always sum and merge the results.
        """
        logger.info("Processing beautiful text chunk %s", index + 1)
        start_time = time.time()

        # All four harm categories are set to BLOCK_NONE.
        config = types.GenerateContentConfig(
            safety_settings=[
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
                    threshold=types.HarmBlockThreshold.BLOCK_NONE,
                ),
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
                    threshold=types.HarmBlockThreshold.BLOCK_NONE,
                ),
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
                    threshold=types.HarmBlockThreshold.BLOCK_NONE,
                ),
                types.SafetySetting(
                    category=types.HarmCategory.HARM_CATEGORY_HARASSMENT,
                    threshold=types.HarmBlockThreshold.BLOCK_NONE,
                ),
            ]
        )

        response = client.models.generate_content(
            model=self.model,
            contents=[BEAUTIFUL_TEXT_PROMPT, chunk_text],
            config=config,
        )

        logger.info("Beautiful text chunk %s processed in %.2fs", index + 1, time.time() - start_time)

        cleaned_text = response.text or ""
        if not cleaned_text:
            # Make an empty answer visible instead of failing later with a
            # TypeError while merging or summing token counts.
            logger.warning("Beautiful text chunk %s returned no text", index + 1)

        usage = getattr(response, "usage_metadata", None)
        return {
            "transcript": cleaned_text,
            "completion_tokens": getattr(usage, "candidates_token_count", None) or 0,
            "prompt_tokens": getattr(usage, "prompt_token_count", None) or 0,
        }

    def merge_cleaned_chunks(self, chunks: list[str]) -> str:
        """Merge the cleaned chunks with ``TextMerger.merge_chunks``."""
        return TextMerger(llm_api_key=self.llm_api_key).merge_chunks(chunks=chunks)

    def _convert_markdown_to_json(self, markdown_text: str) -> dict:
        """Return ``markdown_to_json.dictify(markdown_text)``.

        Returns an empty dict for blank input.

        Raises:
            ImportError: If the ``markdown_to_json`` module cannot be imported.
        """
        if not markdown_text.strip():
            return {}

        try:
            # Imported lazily so the package is only needed when chapters are requested.
            markdown_to_json = import_module("markdown_to_json")
        except ImportError as exc:
            raise ImportError(
                "markdown-to-json is required when active_chapters=True. "
                "Install it with: pip install markdown-to-json"
            ) from exc

        return markdown_to_json.dictify(markdown_text)

    def _build_chapters(self, markdown_text: str) -> list[dict]:
        """Build a nested chapter tree from the Markdown headings.

        Every heading (``#`` to ``######``) becomes a node with ``"title"``,
        ``"level"``, ``"content"`` and ``"children"``. A heading is nested
        under the closest preceding heading with a strictly smaller level.
        Lines before the first heading are not part of any chapter.

        Returns:
            The list of top-level chapter nodes.
        """
        heading_pattern = re.compile(r"^(#{1,6})\s+(.*?)\s*$")
        chapters = []
        # Open chapters, outermost first; each entry pairs a node with the
        # content lines collected for it so far.
        stack: list[dict] = []

        def finalize_nodes(target_depth: int = 0) -> None:
            """Close open chapters until only *target_depth* remain."""
            while len(stack) > target_depth:
                node_state = stack.pop()
                node_state["node"]["content"] = "\n".join(node_state["content_lines"]).strip()

        for line in markdown_text.splitlines():
            heading_match = heading_pattern.match(line)
            if heading_match:
                level = len(heading_match.group(1))
                title = heading_match.group(2).strip()

                # A new heading closes every open chapter of the same or a deeper level.
                while stack and stack[-1]["node"]["level"] >= level:
                    finalize_nodes(len(stack) - 1)

                chapter_node = {
                    "title": title,
                    "level": level,
                    "content": "",
                    "children": [],
                }

                if stack:
                    stack[-1]["node"]["children"].append(chapter_node)
                else:
                    chapters.append(chapter_node)

                stack.append({"node": chapter_node, "content_lines": []})
                continue

            if stack:
                stack[-1]["content_lines"].append(line)

        finalize_nodes()
        return chapters

    def convert(self, raw_text: str, save_transcript_chunks: bool = False, active_chapters: bool = False) -> dict:
        """Clean *raw_text* and return the merged Markdown with usage data.

        Args:
            raw_text: Text to clean; ``None`` or blank text short-circuits
                to an empty result without calling the model.
            save_transcript_chunks: When true, ``"text_chunks"`` holds the
                cleaned chunks; otherwise the string ``"not provided"``.
            active_chapters: When true, the result also contains
                ``"markdown_json"`` and ``"chapters"``.

        Returns:
            Dict with ``"text"``, ``"completion_tokens"``,
            ``"prompt_tokens"``, ``"completion_model"``,
            ``"completion_model_provider"`` and ``"text_chunks"``, plus the
            chapter keys when requested.
        """
        cleaned_input = (raw_text or "").strip()
        if not cleaned_input:
            result = {
                "text": "",
                "completion_tokens": 0,
                "prompt_tokens": 0,
                "completion_model": self.model,
                "completion_model_provider": self.model_provider,
                "text_chunks": [] if save_transcript_chunks else "not provided",
            }
            if active_chapters:
                result["markdown_json"] = {}
                result["chapters"] = []
            return result

        chunks = self.chunk_raw_text(cleaned_input)
        client = self.get_client()

        results = []
        total_completion_tokens = 0
        total_prompt_tokens = 0

        with ThreadPoolExecutor() as executor:
            future_to_index = {
                executor.submit(self.process_chunk, client, chunk["text"], chunk["index"]): chunk["index"]
                for chunk in chunks
            }

            for future in as_completed(future_to_index):
                index = future_to_index[future]
                result = future.result()
                results.append((index, result["transcript"]))
                total_completion_tokens += result["completion_tokens"]
                total_prompt_tokens += result["prompt_tokens"]

        # Futures complete in arbitrary order; restore the original chunk order.
        cleaned_chunks = [text for index, text in sorted(results, key=lambda item: item[0])]
        final_text = self.merge_cleaned_chunks(cleaned_chunks)

        result = {
            "text": final_text,
            "completion_tokens": total_completion_tokens,
            "prompt_tokens": total_prompt_tokens,
            "completion_model": self.model,
            "completion_model_provider": self.model_provider,
            "text_chunks": cleaned_chunks if save_transcript_chunks else "not provided",
        }
        if active_chapters:
            result["markdown_json"] = self._convert_markdown_to_json(final_text)
            result["chapters"] = self._build_chapters(final_text)
        return result
@@ -33,6 +33,8 @@ import boto3
33
33
  from google.cloud import storage
34
34
  from google.genai import errors as genai_errors
35
35
 
36
+ from ..converter.beautiful_text import BeautifulTextConverter
37
+
36
38
 
37
39
  dotenv.load_dotenv()
38
40
 
@@ -235,6 +237,55 @@ class BaseLoader:
235
237
 
236
238
  return response
237
239
 
240
def get_beautiful_text(self, input_list: list[str], **kwargs):
    """Extract the text of a single input and clean it into Markdown.

    Args:
        input_list: A list holding exactly one string (raw text or a
            text/document location).
        **kwargs: Overrides merged on top of ``self.kwargs``;
            ``save_transcript_chunks`` and ``active_chapters`` are read
            here and everything is forwarded to the raw-text extraction.

    Returns:
        Dict with the cleaned ``"text"``, summed token counts of the
        extraction and cleanup steps, model information, ``"type"``,
        ``"input"`` and an ``"output_list"`` holding the single result
        item; ``"chapters"`` is added when the converter returned it.

    Raises:
        TypeError: If *input_list* is not a list of strings.
        ValueError: If *input_list* is empty or has more than one item.
    """
    is_string_list = isinstance(input_list, list) and all(isinstance(entry, str) for entry in input_list)
    if not is_string_list:
        raise TypeError("Parameter 'input' must be a list of strings.")
    if not input_list:
        raise ValueError("Input list is empty.")
    if len(input_list) != 1:
        raise ValueError("get_beautiful_text expects exactly one input.")

    source_input = input_list[0]
    kwargs = {**self.kwargs, **kwargs}
    raw_result = self.extract_raw_text_for_beautiful_text(input_value=source_input, **kwargs)

    cleanup_result = BeautifulTextConverter(llm_api_key=self.llm_api_key).convert(
        raw_text=raw_result["text"],
        save_transcript_chunks=kwargs.get("save_transcript_chunks", self.save_transcript_chunks),
        active_chapters=kwargs.get("active_chapters", False),
    )

    # Token usage covers both the extraction step and the cleanup step.
    completion_total = raw_result.get("completion_tokens", 0) + cleanup_result.get("completion_tokens", 0)
    prompt_total = raw_result.get("prompt_tokens", 0) + cleanup_result.get("prompt_tokens", 0)

    result_item = {
        "text": cleanup_result["text"],
        "completion_tokens": completion_total,
        "prompt_tokens": prompt_total,
        "completion_model": cleanup_result.get("completion_model", "not provided"),
        "completion_model_provider": cleanup_result.get("completion_model_provider", "not provided"),
        "text_chunks": cleanup_result.get("text_chunks", "not provided"),
        "type": raw_result.get("type", "text"),
        "input": source_input,
    }
    has_chapters = "chapters" in cleanup_result
    if has_chapters:
        result_item["chapters"] = cleanup_result["chapters"]

    # The top-level response mirrors the single item and also wraps it in "output_list".
    mirrored_keys = (
        "text",
        "completion_tokens",
        "prompt_tokens",
        "completion_model",
        "completion_model_provider",
        "text_chunks",
        "type",
        "input",
    )
    response = {key: result_item[key] for key in mirrored_keys}
    response["output_list"] = [result_item]
    if has_chapters:
        response["chapters"] = result_item["chapters"]
    return response
288
+
238
289
  def initiate_storage(self, input: str) -> dict:
239
290
  """
240
291
  Initializes and returns a client and relevant details for various cloud storage services or web URLs.
@@ -518,6 +569,114 @@ class BaseLoader:
518
569
  return True
519
570
  return False
520
571
 
572
+ @staticmethod
573
+ def is_remote_input(s: str) -> bool:
574
+ return s.startswith(("s3://", "gcs://", "http://", "https://", "www.", "www.youtube"))
575
+
576
+ @staticmethod
577
+ def is_text_file_extension(path_value: str) -> bool:
578
+ return Path(path_value).suffix.lower() in {".txt", ".text", ".md", ".markdown"}
579
+
580
+ @staticmethod
581
+ def is_beautiful_text_supported_file_extension(path_value: str) -> bool:
582
+ return Path(path_value).suffix.lower() in {
583
+ ".txt",
584
+ ".text",
585
+ ".md",
586
+ ".markdown",
587
+ ".pdf",
588
+ ".xlsx",
589
+ ".docx",
590
+ ".csv",
591
+ ".odt",
592
+ ".pptx",
593
+ ".xls",
594
+ ".doc",
595
+ ".ppt",
596
+ ".rtf",
597
+ ".ipynb",
598
+ ".xml",
599
+ ".xbrl",
600
+ }
601
+
602
def extract_raw_text_for_beautiful_text(self, input_value: str, **kwargs) -> dict:
    """Resolve *input_value* to the raw text that will be cleaned.

    Resolution order:

    1. Multi-line input, or input that is neither a local path nor a
       remote location, is returned as literal text.
    2. An existing local file with a text/Markdown suffix is read as UTF-8.
    3. Any other local or remote document is validated and extracted
       through the matching loader class.

    Args:
        input_value: Raw text, a local path, or a storage location.
        **kwargs: Forwarded to ``init_loader_class``.

    Returns:
        Dict with ``"text"``, token counts, model information,
        ``"text_chunks"``, ``"type"`` and ``"input"``.

    Raises:
        FileNotFoundError: If a local path does not exist.
        ValueError: If the input is a web URL, has an unsupported suffix,
            or resolves to an audio, video, OCR, HTML or YouTube loader.
    """

    def passthrough_payload(text: str) -> dict:
        # Result shape for text obtained without any model call.
        return {
            "text": text,
            "completion_tokens": 0,
            "prompt_tokens": 0,
            "completion_model": "not provided",
            "completion_model_provider": "not provided",
            "text_chunks": "not provided",
            "type": "text",
            "input": input_value,
        }

    candidate = input_value.strip()

    if "\n" in candidate:
        return passthrough_payload(candidate)
    if not self.is_local_path(candidate) and not self.is_remote_input(candidate):
        return passthrough_payload(candidate)

    candidate_path = Path(candidate)

    if candidate_path.exists() and candidate_path.is_file() and self.is_text_file_extension(candidate):
        return passthrough_payload(candidate_path.read_text(encoding="utf-8"))

    if self.is_local_path(candidate) and not self.is_remote_input(candidate):
        if not candidate_path.exists():
            raise FileNotFoundError(f"Input not found or format not recognized: {input_value}")

        if not self.is_beautiful_text_supported_file_extension(candidate):
            raise ValueError(
                "get_beautiful_text supports only text or document inputs such as txt, md, pdf, docx, xlsx, csv, ipynb, xml, or xbrl."
            )

    if self.is_remote_input(candidate):
        if candidate.startswith(("http://", "https://", "www.", "www.youtube")):
            raise ValueError(
                "get_beautiful_text does not support web pages, YouTube, audio, video, or image URLs. Pass text directly or a text/document file path."
            )

        if not self.is_beautiful_text_supported_file_extension(candidate):
            raise ValueError(
                "get_beautiful_text supports only text or document file inputs such as txt, md, pdf, docx, xlsx, csv, ipynb, xml, or xbrl."
            )

    # From here on the original (unstripped) input is handed to the loader machinery.
    storage_client = self.initiate_storage(input=input_value)
    loader_class = self.init_loader_class(
        input=input_value,
        storage_client=storage_client,
        llm_api_key=self.llm_api_key,
        **kwargs,
    )

    rejected_loader_types = (AudioLoader, VideoLoader, OCRLoader, HtmlLoader, YoutubeTranscriptLoaderWithLlm)
    if isinstance(loader_class, rejected_loader_types):
        raise ValueError(
            "get_beautiful_text supports only text or document inputs, not audio, video, image, HTML, or YouTube sources."
        )

    extracted = self.run_loader_class(loader_class=loader_class, input_list=[input_value])
    first_item = extracted.get("output_list", [{}])[0]

    return {
        "text": extracted.get("text", ""),
        "completion_tokens": extracted.get("completion_tokens", 0),
        "prompt_tokens": extracted.get("prompt_tokens", 0),
        "completion_model": first_item.get("completion_model", "not provided"),
        "completion_model_provider": first_item.get("completion_model_provider", "not provided"),
        "text_chunks": first_item.get("text_chunks", "not provided"),
        "type": first_item.get("type", "not provided"),
        "input": first_item.get("input", input_value),
    }
679
+
521
680
  def validate_user_text(self, text: str) -> bool:
522
681
  """
523
682
  Validate a text string. Raises EmptyDocument if the text is too short.
@@ -0,0 +1,43 @@
1
# Instruction prompt sent to the model together with each text chunk by
# BeautifulTextConverter. The text below is runtime behavior: edit with care.
# NOTE(review): under FORMATTING, "use emphasis sparingly" and "use **bold** ...
# in every chapter and paragraph" appear to pull in opposite directions —
# confirm which rule is intended.
BEAUTIFUL_TEXT_PROMPT = """
You are an editor specialized in cleaning spoken transcripts and raw text into faithful Markdown.
This is not summarization. This is not rewriting. This is a cleaned transcript or cleaned source text.

Your task is to remove only accidental noise while preserving the speaker's or author's original words,
phrasing, reasoning, tone, and sequence of ideas as faithfully as possible.

REMOVE ONLY:
- non-meaningful fillers such as "eh", "uhm", "diciamo", "eccetera eccetera", "no?" when used only as filler
- redundant "quindi", "appunto", "comunque" when they are only conversational padding
- accidental repeated words such as "di di", "da da", "che che"
- false starts and self-corrections only when they do not carry meaning
- irrelevant overlap fragments between speakers

PRESERVE COMPLETELY:
- the original wording and sentence structure, even if colloquial
- technical terms and proper nouns exactly
- the original tone and register
- reasoning, opinions, nuances, and meaningful uncertainty
- the logical order of the discussion

DO NOT:
- rewrite sentences in a more elegant style
- replace words with synonyms
- summarize, compress, or simplify concepts
- add explanations, transitions, or missing content
- correct the speaker's opinions or inaccuracies
- make the language more formal than the original

FORMATTING:
- output Markdown only
- use paragraphs to separate thematic blocks
- add headings only when the speaker explicitly introduces a new topic
- use bullet lists or numbered lists only when the source explicitly enumerates items or when the sequence is clearly list-shaped
- use emphasis sparingly and only when grounded in the original text
- use **bold** for key information and important concepts, and *italics* for subtle emphasis or contextual terms in every chapter and paragraph whenever they improve readability and understanding
- do not add code fences
- do not add introductions or commentary

FINAL CHECK:
- every sentence in the output must be traceable to an equivalent sentence in the input
- if a sentence cannot be grounded in the input, remove it
"""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: polytext
3
- Version: 0.2.5
3
+ Version: 0.2.7
4
4
  Summary: Python utilities to simplify document files management
5
5
  Home-page: https://github.com/docsity/polytext
6
6
  Author: Matteo Senardi
@@ -21,6 +21,7 @@ Requires-Dist: PyMuPDF>=1.25.5
21
21
  Requires-Dist: pycryptodome==3.23.0
22
22
  Requires-Dist: weasyprint==65.1
23
23
  Requires-Dist: markdown==3.8
24
+ Requires-Dist: markdown-to-json==2.1.2
24
25
  Requires-Dist: python-docx==1.1.2
25
26
  Requires-Dist: google-api-core>=2.24.2
26
27
  Requires-Dist: google-cloud-storage<3.0.0,>=2.17
@@ -12,6 +12,7 @@ polytext.egg-info/top_level.txt
12
12
  polytext/converter/__init__.py
13
13
  polytext/converter/audio_to_text.py
14
14
  polytext/converter/base.py
15
+ polytext/converter/beautiful_text.py
15
16
  polytext/converter/document_ocr_to_text.py
16
17
  polytext/converter/document_ocr_to_text_azure_oai.py
17
18
  polytext/converter/gemini_quality_guards.py
@@ -47,6 +48,7 @@ polytext/processor/audio_chunker.py
47
48
  polytext/processor/text_merger.py
48
49
  polytext/processor/transcript_chunker.py
49
50
  polytext/prompts/__init__.py
51
+ polytext/prompts/beautiful_text.py
50
52
  polytext/prompts/ocr.py
51
53
  polytext/prompts/text_merging.py
52
54
  polytext/prompts/text_to_md.py
@@ -57,6 +59,7 @@ tests/test_audio_chunker.py
57
59
  tests/test_audio_comparison_helpers.py
58
60
  tests/test_audio_transcription_model_migration.py
59
61
  tests/test_base_loader_error_mapping.py
62
+ tests/test_beautiful_text_manual.py
60
63
  tests/test_compare_audio_models.py
61
64
  tests/test_compare_document_ocr_to_text_models.py
62
65
  tests/test_compare_ocr_to_text_models.py
@@ -3,6 +3,7 @@ PyMuPDF>=1.25.5
3
3
  pycryptodome==3.23.0
4
4
  weasyprint==65.1
5
5
  markdown==3.8
6
+ markdown-to-json==2.1.2
6
7
  python-docx==1.1.2
7
8
  google-api-core>=2.24.2
8
9
  google-cloud-storage<3.0.0,>=2.17
@@ -51,7 +51,7 @@ def get_requirements(*requirements_file):
51
51
 
52
52
  setup(
53
53
  name='polytext',
54
- version='0.2.5',
54
+ version='0.2.7',
55
55
  url='https://github.com/docsity/polytext',
56
56
  # download_url='https://github.com/pualien/py-polytext/archive/0.1.23.tar.gz',
57
57
  license='MIT',
@@ -1,5 +1,6 @@
1
1
  import unittest
2
2
  import tempfile
3
+ import os
3
4
  from types import SimpleNamespace
4
5
  from unittest.mock import MagicMock, patch
5
6
 
@@ -31,7 +32,11 @@ def _make_response(
31
32
 
32
33
 
33
34
  class _FakeFiles:
35
+ def __init__(self):
36
+ self.uploaded_files = []
37
+
34
38
  def upload(self, file):
39
+ self.uploaded_files.append(file)
35
40
  return SimpleNamespace(name="uploaded-audio")
36
41
 
37
42
  def delete(self, name):
@@ -115,6 +120,18 @@ class _ImmediateExecutor:
115
120
 
116
121
 
117
122
  class TestAudioTranscriptionModelMigration(unittest.TestCase):
123
def test_formats_audio_output_with_single_line_break_after_each_sentence(self):
    """Sentences end up on separate lines while the Markdown heading stays intact."""
    raw_text = "Prima frase. Seconda frase? Terza frase!\n## Titolo\nQuarta frase. Quinta frase."
    expected_text = "Prima frase.\nSeconda frase?\nTerza frase!\n## Titolo\nQuarta frase.\nQuinta frase."

    actual_text = AudioToTextConverter().format_audio_output_text(raw_text)

    self.assertEqual(actual_text, expected_text)
+ )
134
+
118
135
  def test_normalize_no_human_speech_marker_returns_empty_for_marker_only(self):
119
136
  cleaned_text, marker_only = normalize_no_human_speech_marker("no human speech detected")
120
137
 
@@ -217,6 +234,24 @@ class TestAudioTranscriptionModelMigration(unittest.TestCase):
217
234
  fake_client.models.generate_content_config.system_instruction,
218
235
  )
219
236
 
237
@patch("polytext.converter.audio_to_text.os.path.getsize", return_value=21 * 1024 * 1024)
@patch("polytext.converter.audio_to_text.os.path.isfile", return_value=True)
def test_large_audio_with_non_ascii_filename_uploads_ascii_safe_temp_copy(
    self,
    _mock_isfile,
    _mock_getsize,
):
    """A reported 21 MiB file with a non-ASCII name is uploaded under an ASCII-only name that keeps the .aac suffix."""
    stub_client = _FakeClient()
    audio_path = "/tmp/mercoledi_\u00ec.aac"

    with patch("polytext.converter.audio_to_text.genai.Client", return_value=stub_client):
        outcome = AudioToTextConverter().transcribe_audio(audio_path)

    first_upload = stub_client.files.uploaded_files[0]
    self.assertEqual(outcome["transcript"], "transcript")
    self.assertTrue(os.path.basename(first_upload).isascii())
    self.assertTrue(first_upload.endswith(".aac"))
254
+
220
255
  @patch("polytext.converter.audio_to_text.genai.Client")
221
256
  def test_custom_max_output_tokens_only_changes_generation_budget(self, mock_client_cls):
222
257
  fake_client = _FakeClient()
@@ -0,0 +1,68 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+
5
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
6
+
7
+ from dotenv import load_dotenv
8
+
9
+ from polytext.loader.base import BaseLoader
10
+
11
+ load_dotenv("..env")
12
+
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger("polytext")
15
+
16
+
17
+ PLAIN_TEXT_SAMPLE = """"""
18
+
19
+
20
def run_beautiful_text(input_value: str, source: str = "local", activate_full_process: bool = False) -> dict:
    """Run ``BaseLoader.get_beautiful_text`` manually and log a preview.

    Args:
        input_value: Text or file location to clean; ignored as input when
            *activate_full_process* is true (it is still logged).
        source: Passed to ``BaseLoader`` as ``source``.
        activate_full_process: When true, first extract text from a
            hard-coded local file with ``get_text`` and clean that text.

    Returns:
        The dict returned by ``get_beautiful_text``.
    """
    loader = BaseLoader(
        markdown_output=True,
        save_transcript_chunks=True,
        source=source,
    )

    text_to_clean = input_value
    if activate_full_process:
        # Developer-machine path: adjust before running the full process.
        local_file_path = "/Users/username/Projects/polytext/mk5konma-f4c711ba3480d81e35d60ddee5cea1cf15c690d6.mp4"

        logger.info("FULL PROCESS ACTIVE - using local file path: %s", local_file_path)
        extraction = loader.get_text(input_list=[local_file_path])
        logger.info("***** END FULL PROCESS ACTIVE ******")
        print(extraction)
        text_to_clean = extraction["text"]

    result_dict = loader.get_beautiful_text(input_list=[text_to_clean], active_chapters=True)

    logger.info("Input: %s", input_value)
    logger.info("Type: %s", result_dict.get("type"))
    logger.info("Completion model: %s", result_dict.get("completion_model"))
    logger.info("Completion tokens: %s", result_dict.get("completion_tokens"))
    logger.info("Prompt tokens: %s", result_dict.get("prompt_tokens"))
    preview = result_dict.get("text", "")[:2000]
    logger.info("Preview:\n%s", preview)

    return result_dict
45
+
46
+
47
def main(mode: str = "full_process"):
    """Run one of the manual beautiful-text scenarios.

    Args:
        mode: Scenario to run: ``"plain_text"``, ``"local_file"``,
            ``"s3_file"`` or ``"full_process"`` (the default, matching the
            previously hard-coded value).

    Returns:
        The result dict of ``run_beautiful_text`` for the chosen scenario.

    Raises:
        ValueError: If *mode* is not one of the supported scenarios.
    """
    if mode == "plain_text":
        return run_beautiful_text(PLAIN_TEXT_SAMPLE, source="local")

    if mode == "local_file":
        # Developer-machine path: adjust before running this scenario.
        local_file_path = "/Users/username/Projects/polytext/summary_note_64_84_level_1_develop_type_G20Y40.pdf"
        return run_beautiful_text(local_file_path, source="local")

    if mode == "s3_file":
        s3_file_path = "s3://docsity-data/documents/original/2011/06/05/La_disciplina_degli_enti_locali_negli_statuti_regionali.pdf"
        return run_beautiful_text(s3_file_path, source="cloud")

    if mode == "full_process":
        return run_beautiful_text("", source="local", activate_full_process=True)

    raise ValueError(f"Unsupported mode: {mode}")
65
+
66
+
67
+ if __name__ == "__main__":
68
+ main()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes