polytext 0.1.2__tar.gz → 0.1.3b2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- polytext-0.1.3b2/PKG-INFO +113 -0
- polytext-0.1.3b2/README.md +60 -0
- polytext-0.1.3b2/polytext/__init__.py +41 -0
- polytext-0.1.3b2/polytext/converter/__init__.py +7 -0
- polytext-0.1.3b2/polytext/converter/audio_to_text.py +368 -0
- polytext-0.1.3b2/polytext/converter/base.py +35 -0
- polytext-0.1.3b2/polytext/converter/document_ocr_to_text.py +404 -0
- polytext-0.1.3b2/polytext/converter/html_to_md.py +25 -0
- polytext-0.1.3b2/polytext/converter/md_to_text.py +49 -0
- polytext-0.1.3b2/polytext/converter/ocr_to_text.py +288 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext/converter/pdf.py +5 -5
- polytext-0.1.3b2/polytext/converter/text_to_md.py +247 -0
- polytext-0.1.3b2/polytext/converter/video_to_audio.py +63 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext/exceptions/base.py +3 -3
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext/generator/pdf.py +59 -27
- polytext-0.1.3b2/polytext/loader/__init__.py +13 -0
- polytext-0.1.3b2/polytext/loader/audio.py +159 -0
- polytext-0.1.3b2/polytext/loader/base.py +414 -0
- polytext-0.1.3b2/polytext/loader/document.py +596 -0
- polytext-0.1.3b2/polytext/loader/document_ocr.py +196 -0
- polytext-0.1.3b2/polytext/loader/downloader/__init__.py +3 -0
- polytext-0.1.3b2/polytext/loader/downloader/downloader.py +76 -0
- polytext-0.1.3b2/polytext/loader/html.py +65 -0
- polytext-0.1.3b2/polytext/loader/markdown.py +185 -0
- polytext-0.1.3b2/polytext/loader/ocr.py +155 -0
- polytext-0.1.3b2/polytext/loader/plain_text.py +97 -0
- polytext-0.1.3b2/polytext/loader/video.py +234 -0
- polytext-0.1.3b2/polytext/loader/youtube.py +235 -0
- polytext-0.1.3b2/polytext/processor/__init__.py +0 -0
- polytext-0.1.3b2/polytext/processor/audio_chunker.py +145 -0
- polytext-0.1.3b2/polytext/processor/text_merger.py +386 -0
- polytext-0.1.3b2/polytext/processor/transcript_chunker.py +79 -0
- polytext-0.1.3b2/polytext/prompts/__init__.py +0 -0
- polytext-0.1.3b2/polytext/prompts/ocr.py +16 -0
- polytext-0.1.3b2/polytext/prompts/text_merging.py +17 -0
- polytext-0.1.3b2/polytext/prompts/text_to_md.py +21 -0
- polytext-0.1.3b2/polytext/prompts/transcription.py +36 -0
- polytext-0.1.3b2/polytext.egg-info/PKG-INFO +113 -0
- polytext-0.1.3b2/polytext.egg-info/SOURCES.txt +60 -0
- polytext-0.1.3b2/polytext.egg-info/requires.txt +24 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/pyproject.toml +3 -3
- {polytext-0.1.2 → polytext-0.1.3b2}/setup.py +27 -28
- polytext-0.1.3b2/tests/test_get_audio_transcript_from_gcs.py +61 -0
- polytext-0.1.3b2/tests/test_get_customized_pdf_from_markdown.py +89 -0
- polytext-0.1.3b2/tests/test_get_document_ocr.py +72 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/tests/test_get_document_text.py +6 -6
- polytext-0.1.3b2/tests/test_get_document_text_from_gcs.py +73 -0
- polytext-0.1.3b2/tests/test_get_ocr_from_image.py +61 -0
- polytext-0.1.3b2/tests/test_get_text_from_markdown.py +71 -0
- polytext-0.1.3b2/tests/test_get_video_transcript_from_gcs.py +62 -0
- polytext-0.1.3b2/tests/test_library.py +29 -0
- polytext-0.1.3b2/tests/test_markitdown_html.py +32 -0
- polytext-0.1.3b2/tests/test_pain_text.py +29 -0
- polytext-0.1.3b2/tests/test_split_audio_with_llm.py +137 -0
- polytext-0.1.3b2/tests/test_youtube_transcript.py +35 -0
- polytext-0.1.2/PKG-INFO +0 -93
- polytext-0.1.2/README.md +0 -53
- polytext-0.1.2/polytext/__init__.py +0 -18
- polytext-0.1.2/polytext/converter/__init__.py +0 -4
- polytext-0.1.2/polytext/loader/__init__.py +0 -4
- polytext-0.1.2/polytext/loader/text.py +0 -606
- polytext-0.1.2/polytext.egg-info/PKG-INFO +0 -93
- polytext-0.1.2/polytext.egg-info/SOURCES.txt +0 -22
- polytext-0.1.2/polytext.egg-info/requires.txt +0 -6
- polytext-0.1.2/tests/test_extract_text_from_file.py +0 -44
- polytext-0.1.2/tests/test_get_customized_pdf_from_markdown.py +0 -43
- {polytext-0.1.2 → polytext-0.1.3b2}/LICENSE +0 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext/exceptions/__init__.py +0 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext/generator/__init__.py +0 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext.egg-info/dependency_links.txt +0 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext.egg-info/not-zip-safe +0 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/polytext.egg-info/top_level.txt +0 -0
- {polytext-0.1.2 → polytext-0.1.3b2}/setup.cfg +0 -0
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: polytext
|
|
3
|
+
Version: 0.1.3b2
|
|
4
|
+
Summary: Python utilities to simplify document files management
|
|
5
|
+
Home-page: https://github.com/docsity/polytext
|
|
6
|
+
Author: Matteo Senardi
|
|
7
|
+
Author-email: matteo.s@docsity.com
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
13
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
14
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: pypdf==5.5.0
|
|
19
|
+
Requires-Dist: PyMuPDF>=1.25.5
|
|
20
|
+
Requires-Dist: pycryptodome==3.23.0
|
|
21
|
+
Requires-Dist: weasyprint==65.1
|
|
22
|
+
Requires-Dist: markdown==3.8
|
|
23
|
+
Requires-Dist: python-docx==1.1.2
|
|
24
|
+
Requires-Dist: google-api-core>=2.24.2
|
|
25
|
+
Requires-Dist: google-cloud-storage>=3.1.0
|
|
26
|
+
Requires-Dist: google-genai>=1.16.1
|
|
27
|
+
Requires-Dist: boto3>=1.38.19
|
|
28
|
+
Requires-Dist: botocore>=1.18.19
|
|
29
|
+
Requires-Dist: ffmpeg-python==0.2.0
|
|
30
|
+
Requires-Dist: pydub==0.25.1
|
|
31
|
+
Requires-Dist: youtube-transcript-api==1.0.3
|
|
32
|
+
Requires-Dist: yt-dlp==2025.5.22
|
|
33
|
+
Requires-Dist: charset-normalizer==3.4.2
|
|
34
|
+
Requires-Dist: requests==2.32.3
|
|
35
|
+
Requires-Dist: markitdown==0.1.1
|
|
36
|
+
Requires-Dist: pymupdf4llm==0.0.24
|
|
37
|
+
Requires-Dist: pathlib==1.0.1
|
|
38
|
+
Requires-Dist: retry==0.9.2
|
|
39
|
+
Provides-Extra: sentry
|
|
40
|
+
Requires-Dist: sentry-sdk==2.29.1; extra == "sentry"
|
|
41
|
+
Dynamic: author
|
|
42
|
+
Dynamic: author-email
|
|
43
|
+
Dynamic: classifier
|
|
44
|
+
Dynamic: description
|
|
45
|
+
Dynamic: description-content-type
|
|
46
|
+
Dynamic: home-page
|
|
47
|
+
Dynamic: license
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
Dynamic: provides-extra
|
|
50
|
+
Dynamic: requires-dist
|
|
51
|
+
Dynamic: requires-python
|
|
52
|
+
Dynamic: summary
|
|
53
|
+
|
|
54
|
+
# polytext
|
|
55
|
+
|
|
56
|
+
# Doc Utils
|
|
57
|
+
|
|
58
|
+
A Python package for document conversion and text extraction.
|
|
59
|
+
|
|
60
|
+
## Features
|
|
61
|
+
|
|
62
|
+
- Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
|
|
63
|
+
- Extract text from PDF documents
|
|
64
|
+
- Support for both local files and S3 storage
|
|
65
|
+
- Multiple PDF parsing backends (PyPDF, PyMuPDF)
|
|
66
|
+
|
|
67
|
+
## Installation
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
# Library only – assumes system requirements are already present
|
|
71
|
+
pip install polytext
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
> **Heads-up:** Polytext’s PDF generator relies on [WeasyPrint](https://weasyprint.org/) under the hood.
|
|
75
|
+
> The PyPI wheel contains *only* Python code; you still need WeasyPrint’s **native libraries** (Pango, Cairo, GDK-PixBuf, HarfBuzz, Fontconfig) installed at the OS level.
|
|
76
|
+
|
|
77
|
+
### System requirements
|
|
78
|
+
|
|
79
|
+
| Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
|
|
80
|
+
|-------------|---------------------------------------------------------------------------------|------------------|-----------------|
|
|
81
|
+
| **Python** | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
|
|
82
|
+
| **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
|
|
83
|
+
| **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
## Usage
|
|
87
|
+
|
|
88
|
+
### Converting Documents to PDF
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from polytext import convert_to_pdf, ConversionError
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
# Convert a document to PDF
|
|
95
|
+
pdf_path = convert_to_pdf('input.docx', 'output.pdf')
|
|
96
|
+
print(f"PDF saved to: {pdf_path}")
|
|
97
|
+
except ConversionError as e:
|
|
98
|
+
print(f"Conversion failed: {e}")
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
### Text Extraction
|
|
102
|
+
|
|
103
|
+
```python
|
|
104
|
+
from polytext import extract_text_from_file
|
|
105
|
+
|
|
106
|
+
# Extract text from any supported file
|
|
107
|
+
text = extract_text_from_file('document.docx')
|
|
108
|
+
print(f"Extracted text: {text}")
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
## License
|
|
112
|
+
|
|
113
|
+
MIT License
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# polytext
|
|
2
|
+
|
|
3
|
+
# Doc Utils
|
|
4
|
+
|
|
5
|
+
A Python package for document conversion and text extraction.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Convert various document formats (DOCX, ODT, PPT, etc.) to PDF
|
|
10
|
+
- Extract text from PDF documents
|
|
11
|
+
- Support for both local files and S3 storage
|
|
12
|
+
- Multiple PDF parsing backends (PyPDF, PyMuPDF)
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
# Library only – assumes system requirements are already present
|
|
18
|
+
pip install polytext
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
> **Heads-up:** Polytext’s PDF generator relies on [WeasyPrint](https://weasyprint.org/) under the hood.
|
|
22
|
+
> The PyPI wheel contains *only* Python code; you still need WeasyPrint’s **native libraries** (Pango, Cairo, GDK-PixBuf, HarfBuzz, Fontconfig) installed at the OS level.
|
|
23
|
+
|
|
24
|
+
### System requirements
|
|
25
|
+
|
|
26
|
+
| Requirement | Notes | macOS (Homebrew) | Ubuntu / Debian |
|
|
27
|
+
|-------------|---------------------------------------------------------------------------------|------------------|-----------------|
|
|
28
|
+
| **Python** | ✔️ Tested on **3.12**<br> Older versions may fail to locate WeasyPrint’s dylibs | `brew install python@3.12` | `sudo apt install python3.12` |
|
|
29
|
+
| **WeasyPrint – native stack** | installs Pango, Cairo, etc. | `brew install weasyprint` | `sudo apt install weasyprint` |
|
|
30
|
+
| **LibreOffice** | used for Office → PDF conversion | `brew install --cask libreoffice` | `sudo apt install libreoffice` |
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
## Usage
|
|
34
|
+
|
|
35
|
+
### Converting Documents to PDF
|
|
36
|
+
|
|
37
|
+
```python
|
|
38
|
+
from polytext import convert_to_pdf, ConversionError
|
|
39
|
+
|
|
40
|
+
try:
|
|
41
|
+
# Convert a document to PDF
|
|
42
|
+
pdf_path = convert_to_pdf('input.docx', 'output.pdf')
|
|
43
|
+
print(f"PDF saved to: {pdf_path}")
|
|
44
|
+
except ConversionError as e:
|
|
45
|
+
print(f"Conversion failed: {e}")
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Text Extraction
|
|
49
|
+
|
|
50
|
+
```python
|
|
51
|
+
from polytext import extract_text_from_file
|
|
52
|
+
|
|
53
|
+
# Extract text from any supported file
|
|
54
|
+
text = extract_text_from_file('document.docx')
|
|
55
|
+
print(f"Extracted text: {text}")
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
## License
|
|
59
|
+
|
|
60
|
+
MIT License
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# polytext/__init__.py
|
|
2
|
+
import os
import logging

logger = logging.getLogger(__name__)

# Load environment variables from a local .env file.
# python-dotenv is not listed among the package's declared requirements, so the
# import is guarded: a missing package must not make `import polytext` fail.
try:
    import dotenv
except ImportError:
    logger.debug("python-dotenv is not installed; skipping .env loading")
else:
    dotenv.load_dotenv()

# Initialize Sentry if DSN is configured
sentry_dsn = os.getenv('SENTRY_DSN_POLYTEXT')
if sentry_dsn:
    try:
        # sentry-sdk is an optional extra (polytext[sentry]), hence the local import.
        import sentry_sdk
        sentry_sdk.init(
            dsn=sentry_dsn,
            environment=os.getenv('ENVIRONMENT', 'prod'),
            traces_sample_rate=1.0,
            profiles_sample_rate=1.0,
        )
        logger.info("Sentry monitoring initialized")
    except ImportError:
        logger.warning("Sentry DSN is configured but sentry-sdk is not installed. "
                       "Install with: pip install polytext[sentry]")
|
|
26
|
+
|
|
27
|
+
from .converter.pdf import convert_to_pdf, DocumentConverter
|
|
28
|
+
from .loader.document import DocumentLoader
|
|
29
|
+
from .exceptions.base import EmptyDocument, ExceededMaxPages, ConversionError
|
|
30
|
+
from .generator.pdf import get_customized_pdf_from_markdown, PDFGenerator
|
|
31
|
+
|
|
32
|
+
__all__ = [
|
|
33
|
+
'convert_to_pdf',
|
|
34
|
+
'DocumentConverter',
|
|
35
|
+
'DocumentLoader',
|
|
36
|
+
'EmptyDocument',
|
|
37
|
+
'ExceededMaxPages',
|
|
38
|
+
'ConversionError',
|
|
39
|
+
'get_customized_pdf_from_markdown',
|
|
40
|
+
'PDFGenerator'
|
|
41
|
+
]
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# polytext/converter/__init__.py
|
|
2
|
+
from .pdf import convert_to_pdf, DocumentConverter
|
|
3
|
+
from .md_to_text import md_to_text
|
|
4
|
+
from .html_to_md import html_to_md
|
|
5
|
+
from .base import BaseConverter
|
|
6
|
+
|
|
7
|
+
__all__ = ['convert_to_pdf', 'DocumentConverter', 'html_to_md', 'md_to_text', 'BaseConverter']
|
|
@@ -0,0 +1,368 @@
|
|
|
1
|
+
# converter/audio_to_text.py
|
|
2
|
+
import os
|
|
3
|
+
import logging
|
|
4
|
+
import tempfile
|
|
5
|
+
import time
|
|
6
|
+
import mimetypes
|
|
7
|
+
import ffmpeg
|
|
8
|
+
from retry import retry
|
|
9
|
+
from google import genai
|
|
10
|
+
from google.genai import types
|
|
11
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
12
|
+
from google.api_core import exceptions as google_exceptions
|
|
13
|
+
|
|
14
|
+
from ..prompts.transcription import AUDIO_TO_MARKDOWN_PROMPT, AUDIO_TO_PLAIN_TEXT_PROMPT
|
|
15
|
+
from ..processor.audio_chunker import AudioChunker
|
|
16
|
+
from ..processor.text_merger import TextMerger
|
|
17
|
+
from ..converter import BaseConverter
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
SUPPORTED_MIME_TYPES = {
|
|
22
|
+
'audio/x-aac', 'audio/flac', 'audio/mp3', 'audio/m4a', 'audio/mpeg',
|
|
23
|
+
'audio/mpga', 'audio/mp4', 'audio/opus', 'audio/pcm', 'audio/wav', 'audio/webm'
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
def compress_and_convert_audio(input_path: str, bitrate_quality: int = 9) -> str:
    """
    Compress and convert an audio file to MP3 using ffmpeg.

    Args:
        input_path (str): Path to the original audio file
        bitrate_quality (int, optional): Variable bitrate quality from 0-9 (9 being lowest). Defaults to 9

    Returns:
        str: Path to the temporary compressed/converted MP3 file

    Raises:
        Exception: Whatever the ffmpeg invocation raises is propagated unchanged
            (ffmpeg-python documents ``ffmpeg.Error`` for a non-zero exit code).
            The temporary output file is removed before re-raising.

    Notes:
        - Creates a temporary MP3 file that the caller should delete after use
        - Converts audio to mono and 16kHz sample rate for smaller file size
        - Passes ``threads=0`` to ffmpeg
    """
    # Reserve a unique output path; ffmpeg overwrites the empty placeholder.
    fd, temp_audio_path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)

    logger.info(f"Compressing audio to bitrate quality: {bitrate_quality}")
    try:
        ffmpeg.input(input_path).output(
            temp_audio_path,
            q=bitrate_quality,  # Variable bitrate quality (0-9, 9 being lowest)
            acodec='libmp3lame',
            ac=1,  # Convert to mono
            ar=16000,  # Lower sample rate
            vn=None,
            threads=0,  # Use maximum available threads
            loglevel='error',  # Reduce logging overhead
        ).run(quiet=True, overwrite_output=True)
    except Exception:
        # Do not leave the placeholder behind when the conversion fails.
        try:
            os.remove(temp_audio_path)
        except OSError as cleanup_error:
            logger.warning(f"Could not remove temporary file {temp_audio_path}: {cleanup_error}")
        raise

    logger.info(f"Successfully converted and compressed audio: {temp_audio_path}")
    return temp_audio_path
|
|
63
|
+
|
|
64
|
+
def transcribe_full_audio(audio_file, markdown_output: bool = False,
                          llm_api_key: str = None,
                          save_transcript_chunks: bool = False, bitrate_quality=9) -> dict:
    """
    Transcribe an audio file in one call, without managing a converter instance.

    Builds an `AudioToTextConverter` from the given options and delegates to its
    `transcribe_full_audio` method.

    Args:
        audio_file (str): Path to the audio file to be transcribed.
        markdown_output (bool, optional): If True, the transcription prompt asks for
            Markdown; otherwise plain text. Defaults to False.
        llm_api_key (str, optional): API key for the LLM service. When omitted, the
            client falls back to its environment configuration.
        save_transcript_chunks (bool, optional): Whether to include the per-chunk
            transcripts in the result. Defaults to False.
        bitrate_quality (int, optional): Variable bitrate quality from 0-9 (9 being lowest),
            used if the audio has to be converted. Defaults to 9

    Returns:
        dict: The converter's result dictionary (merged transcript under "text" plus
        token counts and model information).
    """
    return AudioToTextConverter(
        markdown_output=markdown_output,
        llm_api_key=llm_api_key,
        bitrate_quality=bitrate_quality,
    ).transcribe_full_audio(audio_file, save_transcript_chunks)
|
|
88
|
+
|
|
89
|
+
class AudioToTextConverter:
    """
    Transcribe audio files with a generative model.

    The input is converted to MP3 when its MIME type is not in
    SUPPORTED_MIME_TYPES, split into chunks by `AudioChunker`, transcribed
    chunk by chunk in a thread pool, and merged back together by `TextMerger`.
    """

    # Files larger than this are uploaded first instead of being sent inline.
    INLINE_AUDIO_LIMIT_BYTES = 20 * 1024 * 1024

    def __init__(self, transcription_model: str = "gemini-2.0-flash", transcription_model_provider: str = "google",
                 k: int = 5, min_matches: int = 3, markdown_output: bool = True, llm_api_key: str = None,
                 max_llm_tokens: int = 8000, temp_dir: str = "temp",
                 bitrate_quality: int = 9):
        """
        Initialize the AudioToTextConverter class with a specified transcription model and provider.

        Args:
            transcription_model (str): Model name for transcription. Defaults to "gemini-2.0-flash".
            transcription_model_provider (str): Provider of transcription service. Defaults to "google".
            k (int): Number of words to use when searching for overlap between chunks. Defaults to 5.
                Stored on the instance; not otherwise used by this class.
            min_matches (int): Minimum matching words for chunk merging. Defaults to 3.
                Stored on the instance; not otherwise used by this class.
            markdown_output (bool): Enable Markdown formatting in output. Defaults to True.
            llm_api_key (str, optional): Override API key for language model. Defaults to None.
            max_llm_tokens (int): Token budget passed to the audio chunker. Defaults to 8000.
            temp_dir (str): Directory for temporary files. Defaults to "temp".
            bitrate_quality (int, optional): Variable bitrate quality from 0-9 (9 being lowest). Defaults to 9

        Raises:
            OSError: If temp directory creation fails
        """
        self.transcription_model = transcription_model
        self.transcription_model_provider = transcription_model_provider
        self.k = k
        self.min_matches = min_matches
        self.markdown_output = markdown_output
        self.llm_api_key = llm_api_key
        self.max_llm_tokens = max_llm_tokens
        # Set to True by transcribe_full_audio when Markdown output spans several chunks.
        self.chunked_audio = False
        self.bitrate_quality = bitrate_quality

        # Set up custom temp directory
        self.temp_dir = os.path.abspath(temp_dir)
        os.makedirs(self.temp_dir, exist_ok=True)
        # NOTE: this changes the default directory of the tempfile module for the
        # whole process, not only for this instance.
        tempfile.tempdir = self.temp_dir

    @staticmethod
    def _build_generation_config():
        """Build the generation config with blocking disabled for the four harm categories below."""
        # presumably so that transcripts of sensitive recordings are not withheld — TODO confirm
        harm_categories = (
            types.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            types.HarmCategory.HARM_CATEGORY_HARASSMENT,
        )
        return types.GenerateContentConfig(
            safety_settings=[
                types.SafetySetting(
                    category=category,
                    threshold=types.HarmBlockThreshold.BLOCK_NONE,
                )
                for category in harm_categories
            ]
        )

    # NOTE(review): the google-genai client appears to report API failures through
    # google.genai.errors; confirm that these google.api_core exception types are
    # actually raised by the calls below, otherwise this retry never triggers.
    @retry(
        (
            google_exceptions.DeadlineExceeded,
            google_exceptions.ResourceExhausted,
            google_exceptions.ServiceUnavailable,
            google_exceptions.InternalServerError
        ),
        tries=8,
        delay=1,
        backoff=2,
        logger=logger,
    )
    def transcribe_audio(self, audio_file: str) -> dict:
        """
        Transcribe a single audio file with the configured model and prompt.

        Files larger than INLINE_AUDIO_LIMIT_BYTES are uploaded and referenced in the
        request; smaller files are read and sent inline with their guessed MIME type.

        Args:
            audio_file (str): Path to the audio file to be transcribed.

        Returns:
            dict: Dictionary containing:
                - transcript (str): The transcribed text
                - completion_tokens (int): Number of tokens in completion
                - prompt_tokens (int): Number of tokens in prompt

        Raises:
            ValueError: If the file is sent inline and its MIME type cannot be guessed.
            Exception: For any other errors during the transcription process.
        """
        start_time = time.time()

        if self.markdown_output:
            logger.info("Using prompt for markdown format")
            prompt_template = AUDIO_TO_MARKDOWN_PROMPT
        else:
            logger.info("Using prompt for plain text format")
            prompt_template = AUDIO_TO_PLAIN_TEXT_PROMPT

        if self.llm_api_key:
            logger.info("Using provided Google API key")
            client = genai.Client(api_key=self.llm_api_key)
        else:
            logger.info("Using Google API key from ENV")
            client = genai.Client()

        config = self._build_generation_config()

        file_size = os.path.getsize(audio_file)
        logger.info(f"Audio file size: {file_size / (1024 * 1024):.2f} MB")
        if file_size > self.INLINE_AUDIO_LIMIT_BYTES:
            logger.info("Audio file size exceeds 20MB, uploading file before transcription")

            my_file = client.files.upload(file=audio_file)
            try:
                # Count tokens with the model that will actually transcribe the file.
                token_count = client.models.count_tokens(
                    model=self.transcription_model,
                    contents=[my_file]
                )
                logger.info(f"File size in tokens: {token_count}")

                logger.info(f"Uploaded file: {my_file.name} - Starting transcription...")

                response = client.models.generate_content(
                    model=self.transcription_model,
                    contents=[prompt_template, my_file],
                    config=config
                )
            finally:
                # Remove the uploaded copy even when the request fails, so that
                # retried attempts do not accumulate remote files.
                client.files.delete(name=my_file.name)
        else:
            logger.info("Audio file size does not exceed 20MB")
            with open(audio_file, "rb") as f:
                audio_data = f.read()

            # Determine mimetype
            mime_type, _ = mimetypes.guess_type(audio_file)
            if mime_type is None:
                raise ValueError("Audio format not recognized")

            response = client.models.generate_content(
                model=self.transcription_model,
                contents=[
                    prompt_template,
                    types.Part.from_bytes(
                        data=audio_data,
                        mime_type=mime_type,
                    )
                ],
                config=config
            )

        time_elapsed = time.time() - start_time

        logger.info(f"Completion tokens: {response.usage_metadata.candidates_token_count}")
        logger.info(f"Prompt tokens: {response.usage_metadata.prompt_token_count}")

        response_dict = {"transcript": response.text,
                         "completion_tokens": response.usage_metadata.candidates_token_count,
                         "prompt_tokens": response.usage_metadata.prompt_token_count}

        logger.info(f"Transcribed text from {audio_file} using {self.transcription_model} in {time_elapsed:.2f} seconds")
        return response_dict

    def process_chunk(self, chunk: dict, index: int) -> tuple[int, dict]:
        """
        Transcribe one audio chunk.

        Args:
            chunk (dict): Chunk description; its "file_path" entry is transcribed.
            index (int): Position of the chunk in the original sequence.

        Returns:
            tuple[int, dict]: The unchanged index and the dict returned by `transcribe_audio`.
        """
        logger.info(f"Transcribing chunk {index + 1}...")
        return index, self.transcribe_audio(chunk["file_path"])

    def transcribe_full_audio(self,
                              audio_path: str, save_transcript_chunks: bool = False) -> dict:
        """
        Process and transcribe a long audio file by chunking, parallel transcription, and merging.

        Args:
            audio_path (str): Path to the audio file to be transcribed
            save_transcript_chunks (bool, optional): Whether to save chunk transcripts in final output. Defaults to False.

        Returns:
            dict: Dictionary containing:
                - text (str): The final merged transcript
                - completion_tokens (int): Total number of completion tokens used
                - prompt_tokens (int): Total number of prompt tokens used
                - completion_model (str): Name of the transcription model used
                - completion_model_provider (str): Provider of the transcription model
                - text_chunks (list): Per-chunk transcripts, only when save_transcript_chunks is True

        Raises:
            ValueError: If a chunk is sent inline and its MIME type cannot be guessed
            Exception: Errors from conversion, chunking, transcription or merging are propagated;
                temporary files are removed first.
        """
        processed_audio_path = None
        logger.info(f"Processing audio file {audio_path}...")
        file_size = os.path.getsize(audio_path)
        logger.info(f"Audio file size: {file_size / (1024 * 1024):.2f} MB")

        mime_type, _ = mimetypes.guess_type(audio_path)
        logger.info(f"Original MIME type: {mime_type}")

        # Only an unsupported format triggers re-encoding; size alone does not.
        # The file is converted exactly once, with the configured bitrate quality.
        if mime_type not in SUPPORTED_MIME_TYPES:
            logger.info("Audio file needs conversion, processing file...")
            processed_audio_path = compress_and_convert_audio(input_path=audio_path,
                                                              bitrate_quality=self.bitrate_quality)
            used_file = processed_audio_path
            logger.info(f"Audio file processed (conversion): {used_file}")
        else:
            used_file = audio_path
            logger.info("Audio file is already in supported format")

        chunker = None
        chunks = []
        try:
            # Create chunker and extract chunks
            logger.info("Creating AudioChunker instance...")
            chunker = AudioChunker(used_file, max_llm_tokens=self.max_llm_tokens)
            chunks = chunker.extract_chunks()

            logger.info(f"chunks: {chunks}")

            logger.info(f"Split audio into {len(chunks)} chunks")
            if len(chunks) > 1 and self.markdown_output:
                logger.info("Audio chunking is needed, returning minimal markdown output")
                self.chunked_audio = True

            # Transcribe each chunk
            transcript_chunks = [""] * len(chunks)  # Pre-allocate list to maintain order
            completion_tokens = 0
            prompt_tokens = 0
            with ThreadPoolExecutor() as executor:
                # Submit all chunks to the thread pool
                future_to_chunk = {
                    executor.submit(self.process_chunk, chunk, i): i
                    for i, chunk in enumerate(chunks)
                }

                # Results arrive in completion order; the returned index restores chunk order.
                for future in as_completed(future_to_chunk):
                    index, transcript_dict = future.result()
                    chunks[index]["transcript"] = transcript_dict["transcript"]
                    transcript_chunks[index] = transcript_dict["transcript"]
                    # Token counts may be reported as None; treat that as zero.
                    completion_tokens += transcript_dict["completion_tokens"] or 0
                    prompt_tokens += transcript_dict["prompt_tokens"] or 0

            # Merge all transcripts
            text_merger = TextMerger()
            full_text_merged_dict = text_merger.merge_chunks_with_llm_sequential(chunks=transcript_chunks)

            result_dict = {
                "text": full_text_merged_dict["full_text_merged"],
                "completion_tokens": completion_tokens + full_text_merged_dict["completion_tokens"],
                "prompt_tokens": prompt_tokens + full_text_merged_dict["prompt_tokens"],
                "completion_model": self.transcription_model,
                "completion_model_provider": self.transcription_model_provider
            }
            if save_transcript_chunks:
                result_dict["text_chunks"] = transcript_chunks

            return result_dict
        finally:
            # Clean up temporary files, also when transcription or merging failed.
            # presumably a single chunk refers to the input file itself, hence the
            # "> 1" guard — TODO confirm against AudioChunker
            if chunker is not None and len(chunks) > 1:
                chunker.cleanup_temp_files(chunks)

            # Clean up the temporary converted file
            if processed_audio_path and os.path.exists(processed_audio_path):
                os.remove(processed_audio_path)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import re
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class BaseConverter:
    """
    Base class for all converters
    """

    def __init__(self):
        pass

    @staticmethod
    def format_subtitles(text: str) -> str:
        """
        Format a text by adjusting spacing around Markdown-style subtitles (## or ###).

        A subtitle is a line that starts with two or three '#' characters followed by
        a space. Each subtitle is surrounded by exactly one blank line; this also
        applies at the very start or end of the text, where the result begins or ends
        with two newlines. Longer '#' runs (e.g. '####') and '##' occurring in the
        middle of a line are left untouched.

        Args:
            text: Text to be formatted.

        Returns:
            Formatted text with consistent spacing around subtitles and no excessive blank lines.
        """
        # Match the whole heading line together with any newlines around it, so the
        # replacement can normalise the spacing. '^' needs MULTILINE to anchor at
        # every line start; '[^\n]+' keeps the match on a single line.
        pattern = r'\n*^(#{2,3} [^\n]+)\n*'

        def replacer(match):
            subtitle = match.group(1).strip()
            return f"\n\n{subtitle}\n\n"

        formatted_text = re.sub(pattern, replacer, text, flags=re.MULTILINE)

        # Remove any excessive blank lines (also merges the padding of adjacent subtitles)
        formatted_text = re.sub(r'\n{3,}', '\n\n', formatted_text)

        return formatted_text
|