polytext 0.1.2__tar.gz → 0.1.3b2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. polytext-0.1.3b2/PKG-INFO +113 -0
  2. polytext-0.1.3b2/README.md +60 -0
  3. polytext-0.1.3b2/polytext/__init__.py +41 -0
  4. polytext-0.1.3b2/polytext/converter/__init__.py +7 -0
  5. polytext-0.1.3b2/polytext/converter/audio_to_text.py +368 -0
  6. polytext-0.1.3b2/polytext/converter/base.py +35 -0
  7. polytext-0.1.3b2/polytext/converter/document_ocr_to_text.py +404 -0
  8. polytext-0.1.3b2/polytext/converter/html_to_md.py +25 -0
  9. polytext-0.1.3b2/polytext/converter/md_to_text.py +49 -0
  10. polytext-0.1.3b2/polytext/converter/ocr_to_text.py +288 -0
  11. {polytext-0.1.2 → polytext-0.1.3b2}/polytext/converter/pdf.py +5 -5
  12. polytext-0.1.3b2/polytext/converter/text_to_md.py +247 -0
  13. polytext-0.1.3b2/polytext/converter/video_to_audio.py +63 -0
  14. {polytext-0.1.2 → polytext-0.1.3b2}/polytext/exceptions/base.py +3 -3
  15. {polytext-0.1.2 → polytext-0.1.3b2}/polytext/generator/pdf.py +59 -27
  16. polytext-0.1.3b2/polytext/loader/__init__.py +13 -0
  17. polytext-0.1.3b2/polytext/loader/audio.py +159 -0
  18. polytext-0.1.3b2/polytext/loader/base.py +414 -0
  19. polytext-0.1.3b2/polytext/loader/document.py +596 -0
  20. polytext-0.1.3b2/polytext/loader/document_ocr.py +196 -0
  21. polytext-0.1.3b2/polytext/loader/downloader/__init__.py +3 -0
  22. polytext-0.1.3b2/polytext/loader/downloader/downloader.py +76 -0
  23. polytext-0.1.3b2/polytext/loader/html.py +65 -0
  24. polytext-0.1.3b2/polytext/loader/markdown.py +185 -0
  25. polytext-0.1.3b2/polytext/loader/ocr.py +155 -0
  26. polytext-0.1.3b2/polytext/loader/plain_text.py +97 -0
  27. polytext-0.1.3b2/polytext/loader/video.py +234 -0
  28. polytext-0.1.3b2/polytext/loader/youtube.py +235 -0
  29. polytext-0.1.3b2/polytext/processor/__init__.py +0 -0
  30. polytext-0.1.3b2/polytext/processor/audio_chunker.py +145 -0
  31. polytext-0.1.3b2/polytext/processor/text_merger.py +386 -0
  32. polytext-0.1.3b2/polytext/processor/transcript_chunker.py +79 -0
  33. polytext-0.1.3b2/polytext/prompts/__init__.py +0 -0
  34. polytext-0.1.3b2/polytext/prompts/ocr.py +16 -0
  35. polytext-0.1.3b2/polytext/prompts/text_merging.py +17 -0
  36. polytext-0.1.3b2/polytext/prompts/text_to_md.py +21 -0
  37. polytext-0.1.3b2/polytext/prompts/transcription.py +36 -0
  38. polytext-0.1.3b2/polytext.egg-info/PKG-INFO +113 -0
  39. polytext-0.1.3b2/polytext.egg-info/SOURCES.txt +60 -0
  40. polytext-0.1.3b2/polytext.egg-info/requires.txt +24 -0
  41. {polytext-0.1.2 → polytext-0.1.3b2}/pyproject.toml +3 -3
  42. {polytext-0.1.2 → polytext-0.1.3b2}/setup.py +27 -28
  43. polytext-0.1.3b2/tests/test_get_audio_transcript_from_gcs.py +61 -0
  44. polytext-0.1.3b2/tests/test_get_customized_pdf_from_markdown.py +89 -0
  45. polytext-0.1.3b2/tests/test_get_document_ocr.py +72 -0
  46. {polytext-0.1.2 → polytext-0.1.3b2}/tests/test_get_document_text.py +6 -6
  47. polytext-0.1.3b2/tests/test_get_document_text_from_gcs.py +73 -0
  48. polytext-0.1.3b2/tests/test_get_ocr_from_image.py +61 -0
  49. polytext-0.1.3b2/tests/test_get_text_from_markdown.py +71 -0
  50. polytext-0.1.3b2/tests/test_get_video_transcript_from_gcs.py +62 -0
  51. polytext-0.1.3b2/tests/test_library.py +29 -0
  52. polytext-0.1.3b2/tests/test_markitdown_html.py +32 -0
  53. polytext-0.1.3b2/tests/test_pain_text.py +29 -0
  54. polytext-0.1.3b2/tests/test_split_audio_with_llm.py +137 -0
  55. polytext-0.1.3b2/tests/test_youtube_transcript.py +35 -0
  56. polytext-0.1.2/PKG-INFO +0 -93
  57. polytext-0.1.2/README.md +0 -53
  58. polytext-0.1.2/polytext/__init__.py +0 -18
  59. polytext-0.1.2/polytext/converter/__init__.py +0 -4
  60. polytext-0.1.2/polytext/loader/__init__.py +0 -4
  61. polytext-0.1.2/polytext/loader/text.py +0 -606
  62. polytext-0.1.2/polytext.egg-info/PKG-INFO +0 -93
  63. polytext-0.1.2/polytext.egg-info/SOURCES.txt +0 -22
  64. polytext-0.1.2/polytext.egg-info/requires.txt +0 -6
  65. polytext-0.1.2/tests/test_extract_text_from_file.py +0 -44
  66. polytext-0.1.2/tests/test_get_customized_pdf_from_markdown.py +0 -43
  67. {polytext-0.1.2 → polytext-0.1.3b2}/LICENSE +0 -0
  68. {polytext-0.1.2 → polytext-0.1.3b2}/polytext/exceptions/__init__.py +0 -0
  69. {polytext-0.1.2 → polytext-0.1.3b2}/polytext/generator/__init__.py +0 -0
  70. {polytext-0.1.2 → polytext-0.1.3b2}/polytext.egg-info/dependency_links.txt +0 -0
  71. {polytext-0.1.2 → polytext-0.1.3b2}/polytext.egg-info/not-zip-safe +0 -0
  72. {polytext-0.1.2 → polytext-0.1.3b2}/polytext.egg-info/top_level.txt +0 -0
  73. {polytext-0.1.2 → polytext-0.1.3b2}/setup.cfg +0 -0
@@ -0,0 +1,113 @@
1
+ Metadata-Version: 2.4
2
+ Name: polytext
3
+ Version: 0.1.3b2
4
+ Summary: Python utilities to simplify document files management
5
+ Home-page: https://github.com/docsity/polytext
6
+ Author: Matteo Senardi
7
+ Author-email: matteo.s@docsity.com
8
+ License: MIT
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Classifier: Programming Language :: Python :: 3.12
12
+ Classifier: Programming Language :: Python :: 3.13
13
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
14
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
15
+ Requires-Python: >=3.12
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Requires-Dist: pypdf==5.5.0
19
+ Requires-Dist: PyMuPDF>=1.25.5
20
+ Requires-Dist: pycryptodome==3.23.0
21
+ Requires-Dist: weasyprint==65.1
22
+ Requires-Dist: markdown==3.8
23
+ Requires-Dist: python-docx==1.1.2
24
+ Requires-Dist: google-api-core>=2.24.2
25
+ Requires-Dist: google-cloud-storage>=3.1.0
26
+ Requires-Dist: google-genai>=1.16.1
27
+ Requires-Dist: boto3>=1.38.19
28
+ Requires-Dist: botocore>=1.18.19
29
+ Requires-Dist: ffmpeg-python==0.2.0
30
+ Requires-Dist: pydub==0.25.1
31
+ Requires-Dist: youtube-transcript-api==1.0.3
32
+ Requires-Dist: yt-dlp==2025.5.22
33
+ Requires-Dist: charset-normalizer==3.4.2
34
+ Requires-Dist: requests==2.32.3
35
+ Requires-Dist: markitdown==0.1.1
36
+ Requires-Dist: pymupdf4llm==0.0.24
37
+ Requires-Dist: pathlib==1.0.1
38
+ Requires-Dist: retry==0.9.2
39
+ Provides-Extra: sentry
40
+ Requires-Dist: sentry-sdk==2.29.1; extra == "sentry"
41
+ Dynamic: author
42
+ Dynamic: author-email
43
+ Dynamic: classifier
44
+ Dynamic: description
45
+ Dynamic: description-content-type
46
+ Dynamic: home-page
47
+ Dynamic: license
48
+ Dynamic: license-file
49
+ Dynamic: provides-extra
50
+ Dynamic: requires-dist
51
+ Dynamic: requires-python
52
+ Dynamic: summary
53
+
54
+ # polytext
55
+
56
+ # Doc Utils
57
+
58
+ A Python package for document conversion and text extraction.
59
+
60
+ ## Features
61
+
62
+ - Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
63
+ - Extract text from PDF documents
64
+ - Support for both local files and S3 storage
65
+ - Multiple PDF parsing backends (PyPDF, PyMuPDF)
66
+
67
+ ## Installation
68
+
69
+ ```bash
70
+ # Library only – assumes system requirements are already present
71
+ pip install polytext
72
+ ```
73
+
74
+ > **Heads-up:** Polytext’s PDF generator relies on [WeasyPrint](https://weasyprint.org/) under the hood.
75
+ > The PyPI wheel contains *only* Python code; you still need WeasyPrint’s **native libraries** (Pango, Cairo, GDK-PixBuf, HarfBuzz, Fontconfig) installed at the OS level.
76
+
77
+ ### System requirements
78
+
79
+ | Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
80
+ |-------------|---------------------------------------------------------------------------------|------------------|-----------------|
81
+ | **Python** | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
82
+ | **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
83
+ | **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
84
+
85
+
86
+ ## Usage
87
+
88
+ Converting Documents to PDF
89
+
90
+ ```python
91
+ from polytext import convert_to_pdf, ConversionError
92
+
93
+ try:
94
+ # Convert a document to PDF
95
+ pdf_path = convert_to_pdf('input.docx', 'output.pdf')
96
+ print(f"PDF saved to: {pdf_path}")
97
+ except ConversionError as e:
98
+ print(f"Conversion failed: {e}")
99
+ ```
100
+
101
+ Text Extraction
102
+
103
+ ```python
104
+ from polytext import extract_text_from_file
105
+
106
+ # Extract text from any supported file
107
+ text = extract_text_from_file('document.docx')
108
+ print(f"Extracted text: {text}")
109
+ ```
110
+
111
+ ## License
112
+
113
+ MIT License
@@ -0,0 +1,60 @@
1
+ # polytext
2
+
3
+ # Doc Utils
4
+
5
+ A Python package for document conversion and text extraction.
6
+
7
+ ## Features
8
+
9
+ - Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
10
+ - Extract text from PDF documents
11
+ - Support for both local files and S3 storage
12
+ - Multiple PDF parsing backends (PyPDF, PyMuPDF)
13
+
14
+ ## Installation
15
+
16
+ ```bash
17
+ # Library only – assumes system requirements are already present
18
+ pip install polytext
19
+ ```
20
+
21
+ > **Heads-up:** Polytext’s PDF generator relies on [WeasyPrint](https://weasyprint.org/) under the hood.
22
+ > The PyPI wheel contains *only* Python code; you still need WeasyPrint’s **native libraries** (Pango, Cairo, GDK-PixBuf, HarfBuzz, Fontconfig) installed at the OS level.
23
+
24
+ ### System requirements
25
+
26
+ | Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
27
+ |-------------|---------------------------------------------------------------------------------|------------------|-----------------|
28
+ | **Python** | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
29
+ | **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
30
+ | **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
31
+
32
+
33
+ ## Usage
34
+
35
+ Converting Documents to PDF
36
+
37
+ ```python
38
+ from polytext import convert_to_pdf, ConversionError
39
+
40
+ try:
41
+ # Convert a document to PDF
42
+ pdf_path = convert_to_pdf('input.docx', 'output.pdf')
43
+ print(f"PDF saved to: {pdf_path}")
44
+ except ConversionError as e:
45
+ print(f"Conversion failed: {e}")
46
+ ```
47
+
48
+ Text Extraction
49
+
50
+ ```python
51
+ from polytext import extract_text_from_file
52
+
53
+ # Extract text from any supported file
54
+ text = extract_text_from_file('document.docx')
55
+ print(f"Extracted text: {text}")
56
+ ```
57
+
58
+ ## License
59
+
60
+ MIT License
@@ -0,0 +1,41 @@
1
# polytext/__init__.py
"""Package entry point: loads environment configuration, optionally starts
Sentry monitoring, and re-exports the public API listed in ``__all__``."""
import os
import logging

try:
    import dotenv
except ImportError:
    # python-dotenv is not among the package's declared install requirements,
    # so a missing module must not make ``import polytext`` fail.
    dotenv = None

logger = logging.getLogger(__name__)

# Load environment variables from a .env file when python-dotenv is available
if dotenv is not None:
    dotenv.load_dotenv()

# Initialize Sentry only if a DSN is configured for this package
sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
if sentry_dsn:
    try:
        # Imported lazily: sentry-sdk is shipped as the optional "sentry" extra
        import sentry_sdk
        sentry_sdk.init(
            dsn=sentry_dsn,
            environment=os.getenv('ENVIRONMENT', 'prod'),
            traces_sample_rate=1.0,
            profiles_sample_rate=1.0,
        )
        logger.info("Sentry monitoring initialized")
    except ImportError:
        logger.warning("Sentry DSN is configured but sentry-sdk is not installed. "
                       "Install with: pip install polytext[sentry]")

from .converter.pdf import convert_to_pdf, DocumentConverter
from .loader.document import DocumentLoader
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator

# Names exported by ``from polytext import *``
__all__ = [
    'convert_to_pdf',
    'DocumentConverter',
    'DocumentLoader',
    'EmptyDocument',
    'ExceededMaxPages',
    'ConversionError',
    'get_customized_pdf_from_markdown',
    'PDFGenerator'
]
@@ -0,0 +1,7 @@
1
# polytext/converter/__init__.py
"""Converter sub-package: re-exports the converter helpers listed in ``__all__``."""
from .pdf import convert_to_pdf, DocumentConverter
from .md_to_text import md_to_text
from .html_to_md import html_to_md
from .base import BaseConverter

# Names exported by ``from polytext.converter import *``
__all__ = [
    'convert_to_pdf',
    'DocumentConverter',
    'html_to_md',
    'md_to_text',
    'BaseConverter',
]
@@ -0,0 +1,368 @@
1
+ # converter/audio_to_text.py
2
+ import os
3
+ import logging
4
+ import tempfile
5
+ import time
6
+ import mimetypes
7
+ import ffmpeg
8
+ from retry import retry
9
+ from google import genai
10
+ from google.genai import types
11
+ from concurrent.futures import ThreadPoolExecutor, as_completed
12
+ from google.api_core import exceptions as google_exceptions
13
+
14
+ from ..prompts.transcription import AUDIO_TO_MARKDOWN_PROMPT, AUDIO_TO_PLAIN_TEXT_PROMPT
15
+ from ..processor.audio_chunker import AudioChunker
16
+ from ..processor.text_merger import TextMerger
17
+ from ..converter import BaseConverter
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ SUPPORTED_MIME_TYPES = {
22
+ 'audio/x-aac', 'audio/flac', 'audio/mp3', 'audio/m4a', 'audio/mpeg',
23
+ 'audio/mpga', 'audio/mp4', 'audio/opus', 'audio/pcm', 'audio/wav', 'audio/webm'
24
+ }
25
+
26
def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
    """
    Compress and convert an audio file to MP3 using ffmpeg.

    Args:
        input_path (str): Path to the original audio file
        bitrate_quality (int, optional): Variable bitrate quality from 0-9 (9 being lowest). Defaults to 9

    Returns:
        str: Path to the temporary compressed/converted MP3 file

    Raises:
        Exception: Whatever the ffmpeg invocation raises is propagated unchanged;
            the temporary output file is removed before re-raising.

    Notes:
        - Creates a temporary MP3 file that the caller should delete after use
        - Converts audio to mono and 16kHz sample rate for smaller file size
        - Passes threads=0 to ffmpeg
    """
    # Reserve a temporary output path; only the name is needed, so close the descriptor
    fd, temp_audio_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)

    logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
    try:
        ffmpeg.input(input_path).output(
            temp_audio_path,
            q=bitrate_quality,  # Variable bitrate quality (0-9, 9 being lowest)
            acodec='libmp3lame',
            ac=1,  # Convert to mono
            ar=16000,  # Lower sample rate
            vn=None,
            threads=0,
            loglevel='error',  # Reduce logging overhead
        ).run(quiet=True, overwrite_output=True)
    except Exception:
        # Do not leave the placeholder file behind when the conversion fails
        if os.path.exists(temp_audio_path):
            os.remove(temp_audio_path)
        raise

    logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
    return temp_audio_path
63
+
64
def transcribe_full_audio(audio_file, markdown_output: bool = False,
                          llm_api_key: str = None,
                          save_transcript_chunks: bool = False, bitrate_quality=9) -> dict:
    """
    Transcribe an audio file in one call, without building a converter by hand.

    Builds an `AudioToTextConverter` with the given options (all other
    converter settings keep their defaults) and delegates to its
    `transcribe_full_audio` method.

    Args:
        audio_file (str): Path to the audio file to be transcribed.
        markdown_output (bool, optional): If True, the Markdown transcription
            prompt is used instead of the plain-text one. Defaults to False.
        llm_api_key (str, optional): API key for the LLM service. When omitted,
            the client is created without an explicit key.
        save_transcript_chunks (bool, optional): Whether to include the per-chunk
            transcripts in the result. Defaults to False.
        bitrate_quality (int, optional): Variable bitrate quality from 0-9
            (9 being lowest) used if the audio must be converted. Defaults to 9

    Returns:
        dict: The result dictionary returned by
            `AudioToTextConverter.transcribe_full_audio`.
    """
    transcriber = AudioToTextConverter(
        bitrate_quality=bitrate_quality,
        llm_api_key=llm_api_key,
        markdown_output=markdown_output,
    )
    result = transcriber.transcribe_full_audio(audio_file, save_transcript_chunks)
    return result
88
+
89
class AudioToTextConverter:
    """Transcribe audio files with a generative model, chunking long recordings.

    The audio is converted to MP3 when its MIME type is not in
    ``SUPPORTED_MIME_TYPES``, split with ``AudioChunker``, transcribed chunk by
    chunk in a thread pool, and the chunk transcripts are merged by ``TextMerger``.
    """

    # Files above this size are uploaded through the Files API instead of being sent inline
    _INLINE_SIZE_LIMIT_BYTES = 20 * 1024 * 1024

    def __init__(self, transcription_model: str ="gemini-2.0-flash", transcription_model_provider: str ="google",
                 k: int =5, min_matches: int =3, markdown_output: bool =True, llm_api_key: str =None, max_llm_tokens: int =8000, temp_dir: str ="temp",
                 bitrate_quality: int =9):
        """
        Initialize the AudioToTextConverter class with a specified transcription model and provider.

        Args:
            transcription_model (str): Model name for transcription. Defaults to "gemini-2.0-flash".
            transcription_model_provider (str): Provider of transcription service. Defaults to "google".
            k (int): Number of words to use when searching for overlap between chunks. Defaults to 5.
                Stored on the instance; not read by the methods of this class.
            min_matches (int): Minimum matching words for chunk merging. Defaults to 3.
                Stored on the instance; not read by the methods of this class.
            markdown_output (bool): Use the Markdown transcription prompt. Defaults to True.
            llm_api_key (str, optional): Override API key for language model. Defaults to None.
            max_llm_tokens (int): Token budget passed to the audio chunker. Defaults to 8000.
            temp_dir (str): Directory for temporary files. Defaults to "temp".
            bitrate_quality (int, optional): Variable bitrate quality from 0-9 (9 being lowest). Defaults to 9

        Raises:
            OSError: If temp directory creation fails
        """
        self.transcription_model = transcription_model
        self.transcription_model_provider = transcription_model_provider
        self.k = k
        self.min_matches = min_matches
        self.markdown_output = markdown_output
        self.llm_api_key = llm_api_key
        self.max_llm_tokens = max_llm_tokens
        self.chunked_audio = False
        self.bitrate_quality = bitrate_quality

        # Set up custom temp directory
        self.temp_dir = os.path.abspath(temp_dir)
        os.makedirs(self.temp_dir, exist_ok=True)
        # NOTE: this changes the process-wide default directory used by the tempfile module
        tempfile.tempdir = self.temp_dir

    @staticmethod
    def _build_generation_config():
        """Return a generation config that disables blocking for every listed harm category."""
        categories = (
            types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            types.HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
        return types.GenerateContentConfig(
            safety_settings=[
                types.SafetySetting(
                    category=category,
                    threshold=types.HarmBlockThreshold.BLOCK_NONE,
                )
                for category in categories
            ]
        )

    @retry(
        (
            google_exceptions.DeadlineExceeded,
            google_exceptions.ResourceExhausted,
            google_exceptions.ServiceUnavailable,
            google_exceptions.InternalServerError
        ),
        tries=8,
        delay=1,
        backoff=2,
        logger=logger,
    )
    def transcribe_audio(self, audio_file: str) -> dict:
        """
        Transcribe audio using a specified model and prompt template.

        Files larger than 20MB are uploaded first and referenced in the request;
        smaller files are sent inline as bytes. The call is retried (up to 8 tries
        with exponential backoff) on the Google API errors listed in the decorator.

        Args:
            audio_file (str): Path to the audio file to be transcribed.

        Returns:
            dict: Dictionary containing:
                - transcript (str): The transcribed text
                - completion_tokens (int): Number of tokens in completion
                - prompt_tokens (int): Number of tokens in prompt

        Raises:
            ValueError: If the audio file format is not recognized (inline path only).
            Exception: For any other errors during the transcription process.
        """
        start_time = time.time()

        if self.markdown_output:
            logger.info("Using prompt for markdown format")
            prompt_template = AUDIO_TO_MARKDOWN_PROMPT
        else:
            logger.info("Using prompt for plain text format")
            prompt_template = AUDIO_TO_PLAIN_TEXT_PROMPT

        if self.llm_api_key:
            logger.info("Using provided Google API key")
            client = genai.Client(api_key=self.llm_api_key)
        else:
            logger.info("Using Google API key from ENV")
            client = genai.Client()

        config = self._build_generation_config()

        file_size = os.path.getsize(audio_file)
        logger.info(f"Audio file size: {file_size / (1024 * 1024):.2f} MB")
        if file_size > self._INLINE_SIZE_LIMIT_BYTES:
            logger.info("Audio file size exceeds 20MB, uploading file before transcription")

            my_file = client.files.upload(file=audio_file)
            try:
                # Token count is informational only; use the configured model
                token_info = client.models.count_tokens(
                    model=self.transcription_model,
                    contents=[my_file]
                )
                logger.info(f"File size in tokens: {token_info}")

                logger.info(f"Uploaded file: {my_file.name} - Starting transcription...")

                response = client.models.generate_content(
                    model=self.transcription_model,
                    contents=[prompt_template, my_file],
                    config=config
                )
            finally:
                # Always remove the uploaded file, otherwise every failed
                # (and retried) attempt would leave a remote copy behind
                client.files.delete(name=my_file.name)

        else:
            logger.info("Audio file size does not exceed 20MB")

            # Determine mimetype before reading the file so unknown formats fail fast
            mime_type, _ = mimetypes.guess_type(audio_file)
            if mime_type is None:
                raise ValueError("Audio format not recognized")

            with open(audio_file, "rb") as f:
                audio_data = f.read()

            response = client.models.generate_content(
                model=self.transcription_model,
                contents=[
                    prompt_template,
                    types.Part.from_bytes(
                        data=audio_data,
                        mime_type=mime_type,
                    )
                ],
                config=config
            )

        time_elapsed = time.time() - start_time

        usage = response.usage_metadata
        logger.info(f"Completion tokens: {usage.candidates_token_count}")
        logger.info(f"Prompt tokens: {usage.prompt_token_count}")

        response_dict = {"transcript": response.text,
                         "completion_tokens": usage.candidates_token_count,
                         "prompt_tokens": usage.prompt_token_count}

        logger.info(f"Transcribed text from {audio_file} using {self.transcription_model} in {time_elapsed:.2f} seconds")
        return response_dict

    def process_chunk(self, chunk: dict, index: int) -> tuple[int, dict]:
        """Transcribe one audio chunk.

        Args:
            chunk (dict): Chunk description; its "file_path" entry is transcribed.
            index (int): Position of the chunk, returned unchanged so results
                completed out of order can be put back in sequence.

        Returns:
            tuple[int, dict]: The index and the dictionary returned by `transcribe_audio`.
        """
        logger.info(f"Transcribing chunk {index + 1}...")
        return index, self.transcribe_audio(chunk["file_path"])

    def transcribe_full_audio(self,
                              audio_path: str, save_transcript_chunks: bool = False) -> dict:
        """
        Process and transcribe a long audio file by chunking, parallel transcription, and merging.

        Args:
            audio_path (str): Path to the audio file to be transcribed
            save_transcript_chunks (bool, optional): Whether to save chunk transcripts in final output. Defaults to False.

        Returns:
            dict: Dictionary containing:
                - text (str): The final merged transcript
                - completion_tokens (int): Total number of completion tokens used
                - prompt_tokens (int): Total number of prompt tokens used
                - completion_model (str): Name of the transcription model used
                - completion_model_provider (str): Provider of the transcription model
                - text_chunks (list): Per-chunk transcripts, only when
                  `save_transcript_chunks` is True

        Raises:
            ValueError: If the audio file format is not recognized
            Exception: Errors from conversion, chunking, transcription or merging
                are propagated after temporary files have been cleaned up.
        """
        processed_audio_path = None
        chunker = None
        chunks = None

        logger.info(f"Processing audio file {audio_path}...")
        file_size = os.path.getsize(audio_path)
        logger.info(f"Audio file size: {file_size / (1024 * 1024):.2f} MB")

        mime_type, _ = mimetypes.guess_type(audio_path)
        logger.info(f"Original MIME type: {mime_type}")

        try:
            # Only the format decides whether to re-encode; size-based compression
            # is intentionally not applied here. Convert exactly once.
            if mime_type not in SUPPORTED_MIME_TYPES:
                logger.info("Audio file needs conversion, processing file...")
                processed_audio_path = compress_and_convert_audio(input_path=audio_path,
                                                                  bitrate_quality=self.bitrate_quality)
                used_file = processed_audio_path
                logger.info(f"Audio file processed (conversion): {used_file}")
            else:
                used_file = audio_path
                logger.info("Audio file is already in supported format")

            # Create chunker and extract chunks
            logger.info("Creating AudioChunker instance...")
            chunker = AudioChunker(used_file, max_llm_tokens=self.max_llm_tokens)
            chunks = chunker.extract_chunks()

            logger.info(f"chunks: {chunks}")

            logger.info(f"Split audio into {len(chunks)} chunks")
            if len(chunks) > 1 and self.markdown_output:
                logger.info("Audio chunking is needed, returning minimal markdown output")
                self.chunked_audio = True

            # Transcribe each chunk; results are stored by index to keep the original order
            transcript_chunks = [""] * len(chunks)
            completion_tokens = 0
            prompt_tokens = 0
            with ThreadPoolExecutor() as executor:
                future_to_chunk = {
                    executor.submit(self.process_chunk, chunk, i): i
                    for i, chunk in enumerate(chunks)
                }

                for future in as_completed(future_to_chunk):
                    index, transcript_dict = future.result()
                    chunks[index]["transcript"] = transcript_dict["transcript"]
                    transcript_chunks[index] = transcript_dict["transcript"]
                    # Treat a missing token count as zero instead of failing on None
                    completion_tokens += transcript_dict["completion_tokens"] or 0
                    prompt_tokens += transcript_dict["prompt_tokens"] or 0

            # Merge all transcripts
            text_merger = TextMerger()
            full_text_merged_dict = text_merger.merge_chunks_with_llm_sequential(chunks=transcript_chunks)

            result_dict = {
                "text": full_text_merged_dict["full_text_merged"],
                "completion_tokens": completion_tokens + full_text_merged_dict["completion_tokens"],
                "prompt_tokens": prompt_tokens + full_text_merged_dict["prompt_tokens"],
                "completion_model": self.transcription_model,
                "completion_model_provider": self.transcription_model_provider
            }
            if save_transcript_chunks:
                result_dict["text_chunks"] = transcript_chunks

            return result_dict
        finally:
            # Clean up temporary chunk files, also when transcription failed
            if chunker is not None and chunks is not None and len(chunks) > 1:
                chunker.cleanup_temp_files(chunks)

            # Clean up the temporary converted file
            if processed_audio_path and os.path.exists(processed_audio_path):
                os.remove(processed_audio_path)
@@ -0,0 +1,35 @@
1
+ import re
2
+
3
+
4
class BaseConverter:
    """
    Base class for all converters
    """

    def __init__(self):
        pass

    @staticmethod
    def format_subtitles(text: str) -> str:
        """
        Format a text by adjusting spacing around Markdown-style subtitles (## or ###).

        Every line that starts with exactly two or three ``#`` characters followed
        by a space is surrounded by blank lines; runs of three or more newlines
        anywhere in the text are then collapsed to a single blank line.

        Args:
            text: Text to be formatted.

        Returns:
            Formatted text with consistent spacing around subtitles and no excessive blank lines.
        """
        # Anchor on whole lines (MULTILINE) so the complete subtitle is captured.
        # A lazy, unanchored match would stop after the first title character
        # and split the heading in two.
        pattern = r'^(#{2,3} .+)$'

        def replacer(match):
            subtitle = match.group(1).strip()
            return f"\n\n{subtitle}\n\n"

        formatted_text = re.sub(pattern, replacer, text, flags=re.MULTILINE)

        # Remove any excessive blank lines (including those just introduced)
        formatted_text = re.sub(r'\n{3,}', '\n\n', formatted_text)

        return formatted_text