quintessentia 1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- quintessentia-1.1/LICENSE +21 -0
- quintessentia-1.1/MANIFEST.in +3 -0
- quintessentia-1.1/PKG-INFO +279 -0
- quintessentia-1.1/README.md +223 -0
- quintessentia-1.1/pyproject.toml +16 -0
- quintessentia-1.1/quint/__init__.py +8 -0
- quintessentia-1.1/quint/api/__init__.py +0 -0
- quintessentia-1.1/quint/api/fast.py +207 -0
- quintessentia-1.1/quint/chunking/__init__.py +8 -0
- quintessentia-1.1/quint/chunking/generate.py +46 -0
- quintessentia-1.1/quint/chunking/similarities.py +54 -0
- quintessentia-1.1/quint/data/__init__.py +0 -0
- quintessentia-1.1/quint/data/youtube.py +40 -0
- quintessentia-1.1/quint/highlighting/__init__.py +8 -0
- quintessentia-1.1/quint/highlighting/highlights.py +25 -0
- quintessentia-1.1/quint/highlighting/words_outline.py +14 -0
- quintessentia-1.1/quint/params.py +1 -0
- quintessentia-1.1/quint/preprocessing/__init__.py +0 -0
- quintessentia-1.1/quint/preprocessing/audio.py +48 -0
- quintessentia-1.1/quint/summarizing/__init__.py +0 -0
- quintessentia-1.1/quint/summarizing/summarizer.py +55 -0
- quintessentia-1.1/quint/tools/__init__.py +8 -0
- quintessentia-1.1/quint/tools/activations.py +5 -0
- quintessentia-1.1/quint/tools/embedding.py +28 -0
- quintessentia-1.1/quint/tools/time.py +17 -0
- quintessentia-1.1/quint/transcription/__init__.py +8 -0
- quintessentia-1.1/quint/transcription/transcriber.py +100 -0
- quintessentia-1.1/quintessentia.egg-info/PKG-INFO +279 -0
- quintessentia-1.1/quintessentia.egg-info/SOURCES.txt +35 -0
- quintessentia-1.1/quintessentia.egg-info/dependency_links.txt +1 -0
- quintessentia-1.1/quintessentia.egg-info/requires.txt +22 -0
- quintessentia-1.1/quintessentia.egg-info/top_level.txt +1 -0
- quintessentia-1.1/requirements.txt +35 -0
- quintessentia-1.1/setup.cfg +4 -0
- quintessentia-1.1/setup.py +43 -0
- quintessentia-1.1/tests/test_jax_device.py +20 -0
- quintessentia-1.1/tests/test_youtube_downloader.py +31 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2023-2026 Polovinkin Nikita
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,279 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: quintessentia
|
|
3
|
+
Version: 1.1
|
|
4
|
+
Summary: Transcribe, chunk and summarize podcasts (FastAPI + Whisper + OpenAI)
|
|
5
|
+
Home-page: https://github.com/poloniki/quint
|
|
6
|
+
Author: Polovinkin Nikita
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Source, https://github.com/poloniki/quint
|
|
9
|
+
Project-URL: Issues, https://github.com/poloniki/quint/issues
|
|
10
|
+
Keywords: whisper,transcription,summarization,podcast,fastapi,nlp
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Framework :: FastAPI
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Topic :: Multimedia :: Sound/Audio :: Speech
|
|
18
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
19
|
+
Requires-Python: >=3.10
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: numpy
|
|
23
|
+
Requires-Dist: pandas
|
|
24
|
+
Requires-Dist: scikit-learn
|
|
25
|
+
Requires-Dist: pytest
|
|
26
|
+
Requires-Dist: fastapi
|
|
27
|
+
Requires-Dist: uvicorn
|
|
28
|
+
Requires-Dist: sentence_transformers
|
|
29
|
+
Requires-Dist: python-multipart
|
|
30
|
+
Requires-Dist: pydub
|
|
31
|
+
Requires-Dist: soundfile
|
|
32
|
+
Requires-Dist: protobuf==3.20.2
|
|
33
|
+
Requires-Dist: spacy~=3.4.4
|
|
34
|
+
Requires-Dist: scipy~=1.9.3
|
|
35
|
+
Requires-Dist: pydantic
|
|
36
|
+
Requires-Dist: nltk~=3.8.1
|
|
37
|
+
Requires-Dist: ml-dtypes==0.2.0
|
|
38
|
+
Requires-Dist: pytube
|
|
39
|
+
Requires-Dist: cached_property
|
|
40
|
+
Requires-Dist: gunicorn
|
|
41
|
+
Requires-Dist: mock
|
|
42
|
+
Requires-Dist: pysbd
|
|
43
|
+
Requires-Dist: openai~=0.28.1
|
|
44
|
+
Dynamic: author
|
|
45
|
+
Dynamic: classifier
|
|
46
|
+
Dynamic: description
|
|
47
|
+
Dynamic: description-content-type
|
|
48
|
+
Dynamic: home-page
|
|
49
|
+
Dynamic: keywords
|
|
50
|
+
Dynamic: license
|
|
51
|
+
Dynamic: license-file
|
|
52
|
+
Dynamic: project-url
|
|
53
|
+
Dynamic: requires-dist
|
|
54
|
+
Dynamic: requires-python
|
|
55
|
+
Dynamic: summary
|
|
56
|
+
|
|
57
|
+
# Quint: transcribe | chunk | summarize
|
|
58
|
+
|
|
59
|
+
<p align="center">
|
|
60
|
+
<img src="https://raw.githubusercontent.com/poloniki/quint/master/frontend/logo.png" alt="Quint logo">
|
|
61
|
+
</p>
|
|
62
|
+
|
|
63
|
+
<p align="center">
|
|
64
|
+
<a href="https://github.com/poloniki/quint/actions/workflows/build.yml">
|
|
65
|
+
<img src="https://img.shields.io/github/actions/workflow/status/poloniki/quint/build.yml?branch=master&style=for-the-badge&logo=github&label=CI" alt="CI">
|
|
66
|
+
</a>
|
|
67
|
+
<a href="LICENSE">
|
|
68
|
+
<img src="https://img.shields.io/badge/License-MIT-green?style=for-the-badge" alt="License: MIT">
|
|
69
|
+
</a>
|
|
70
|
+
<a href="https://fastapi.tiangolo.com">
|
|
71
|
+
<img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI">
|
|
72
|
+
</a>
|
|
73
|
+
<a href="https://www.python.org/downloads/release/python-3100/">
|
|
74
|
+
<img src="https://img.shields.io/badge/python-3.10-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54" alt="Python 3.10">
|
|
75
|
+
</a>
|
|
76
|
+
<a href="https://hub.docker.com/r/poloniki/quint">
|
|
77
|
+
<img src="https://img.shields.io/badge/docker-%230db7ed.svg?style=for-the-badge&logo=docker&logoColor=white" alt="Docker">
|
|
78
|
+
</a>
|
|
79
|
+
</p>
|
|
80
|
+
|
|
81
|
+
"Quint" is designed to enhance the podcast experience. It simplifies the process for users, making it easier for them to understand and navigate podcasts by providing concise summaries, highlights, and transcripts.
|
|
82
|
+
|
|
83
|
+
## Table of Contents
|
|
84
|
+
|
|
85
|
+
- [Main Functionality](#-main-functionality)
|
|
86
|
+
- [Quickstart](#-quickstart)
|
|
87
|
+
- [License](#-license)
|
|
88
|
+
- [Deploy on a GPU cloud](#-how-to-deploy-this-api-on-cloud)
|
|
89
|
+
|
|
90
|
+
## π Main Functionality
|
|
91
|
+
|
|
92
|
+
Below is a list of the core API endpoints offered by Quint:
|
|
93
|
+
|
|
94
|
+
Once the API is running (see [Quickstart](#-quickstart)), interactive docs are available at `/docs`.
|
|
95
|
+
|
|
96
|
+
### 🎥 YouTube Video Transcription
|
|
97
|
+
|
|
98
|
+
Provide a YouTube video ID. Quint fetches the video, extracts its audio, and returns a transcription.
|
|
99
|
+
|
|
100
|
+
```http
|
|
101
|
+
GET /youtube_transcript?video_id=YOUR_YOUTUBE_VIDEO_ID
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
```json
|
|
105
|
+
{ "transcript": "The transcribed text of the video goes here..." }
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### 🎙️ Transcription from Audio File
|
|
109
|
+
|
|
110
|
+
Upload an audio file and receive its transcription in text format.
|
|
111
|
+
|
|
112
|
+
```http
|
|
113
|
+
POST /file_transcript
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
```json
|
|
117
|
+
{ "transcript": "The transcribed text of the audio goes here..." }
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### π Text Chunking
|
|
121
|
+
|
|
122
|
+
Submit a lengthy text and get it divided into semantically meaningful chunks or paragraphs.
|
|
123
|
+
|
|
124
|
+
```http
|
|
125
|
+
POST /chunk
|
|
126
|
+
{ "body": "Your lengthy continuous text here..." }
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
```json
|
|
130
|
+
{ "output": ["Chunk 1", "Chunk 2", "..."] }
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### π Highlight the Best Sentence
|
|
134
|
+
|
|
135
|
+
Submit a text and Quint returns the index of the most descriptive sentence based on the embeddings.
|
|
136
|
+
|
|
137
|
+
```http
|
|
138
|
+
POST /best_sentence
|
|
139
|
+
{ "body": "Your raw text here..." }
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
```json
|
|
143
|
+
{ "best_sentence_index": 5 }
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
### π YouTube Summary
|
|
147
|
+
|
|
148
|
+
Provide a YouTube video ID to get back a list of chunked summaries of the video.
|
|
149
|
+
|
|
150
|
+
```http
|
|
151
|
+
GET /youtube_summarize?video_id=YOUR_YOUTUBE_VIDEO_ID
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
```json
|
|
155
|
+
{ "summary": ["Summary of part 1", "Summary of part 2", "..."] }
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## 🧑‍💻 Quickstart
|
|
159
|
+
|
|
160
|
+
Run the API locally — CPU is fine for chunking and summarization; transcription is far faster on a GPU (see [deploy](#-how-to-deploy-this-api-on-cloud)).
|
|
161
|
+
|
|
162
|
+
```shell
|
|
163
|
+
git clone https://github.com/poloniki/quint.git
|
|
164
|
+
cd quint
|
|
165
|
+
make install # pip install -e .
|
|
166
|
+
cp env.sample .env # then set OPENAI_API_KEY
|
|
167
|
+
make run_api # serves on http://localhost:8083
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
Then open `http://localhost:8083/docs` for the interactive API docs.
|
|
171
|
+
|
|
172
|
+
### Web UI (optional)
|
|
173
|
+
|
|
174
|
+
A small [Streamlit](https://streamlit.io) frontend lives in [`frontend/`](frontend/app.py). With the API running:
|
|
175
|
+
|
|
176
|
+
```shell
|
|
177
|
+
pip install -r frontend/requirements.txt
|
|
178
|
+
streamlit run frontend/app.py
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
Set `QUINT_API_URL` if the API isn't on `http://localhost:8083`.
|
|
182
|
+
|
|
183
|
+
## π License
|
|
184
|
+
|
|
185
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
186
|
+
|
|
187
|
+
## π How to deploy this API on cloud
|
|
188
|
+
|
|
189
|
+
Important note: I highly recommend using the JAX solution, as it is much faster than the OpenAI-proposed way. Please refer to this repo [Whisper JAX](https://github.com/sanchit-gandhi/whisper-jax) for more details. I will attach one of the tables from that repo:
|
|
190
|
+
|
|
191
|
+
**Table 1:** Average inference time in seconds for audio files of increasing length. GPU device is a single A100 40GB GPU.
|
|
192
|
+
TPU device is a single TPU v4-8.
|
|
193
|
+
|
|
194
|
+
<div align="center">
|
|
195
|
+
|
|
196
|
+
| | OpenAI | Transformers | Whisper JAX | Whisper JAX |
|
|
197
|
+
| --------- | ------- | ------------ | ----------- | ----------- |
|
|
198
|
+
| | | | | |
|
|
199
|
+
| Framework | PyTorch | PyTorch | JAX | JAX |
|
|
200
|
+
| Backend | GPU | GPU | GPU | TPU |
|
|
201
|
+
| | | | | |
|
|
202
|
+
| 1 min | 13.8 | 4.54 | 1.72 | 0.45 |
|
|
203
|
+
| 10 min | 108.3 | 20.2 | 9.38 | 2.01 |
|
|
204
|
+
| 1 hour | 1001.0 | 126.1 | 75.3 | 13.8 |
|
|
205
|
+
| | | | | |
|
|
206
|
+
|
|
207
|
+
</div>
|
|
208
|
+
|
|
209
|
+
### Choosing a GPU cloud provider
|
|
210
|
+
|
|
211
|
+
Quint runs on any machine with an NVIDIA GPU, so you are free to use whichever cloud provider (AWS, GCP, Azure, Lambda, Paperspace, RunPod, …) or on-prem hardware you prefer. For the best price/performance on transcription, look for an **Ada- or Ampere-generation workstation card** such as the RTX 6000 Ada or RTX A6000 — these are typically far cheaper than A100-class GPUs while offering comparable [CUDA compute capability](https://developer.nvidia.com/cuda-gpus).
|
|
212
|
+
|
|
213
|
+
Whatever you pick, you only need an instance that provides:
|
|
214
|
+
|
|
215
|
+
- An **NVIDIA GPU** (Ampere/Ada or newer recommended)
|
|
216
|
+
- **Ubuntu 22.04** (or similar) with **CUDA 12** and **Docker**
|
|
217
|
+
- **SSH access** (root or sudo)
|
|
218
|
+
|
|
219
|
+
The steps below are provider-neutral: provision the instance however your provider requires, then follow along.
|
|
220
|
+
|
|
221
|
+
### 1. Configure your environment
|
|
222
|
+
|
|
223
|
+
```shell
|
|
224
|
+
cp env.sample .env # then edit .env
|
|
225
|
+
direnv reload # or: source .env
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Set the following in `.env`:
|
|
229
|
+
|
|
230
|
+
| Variable | Used by | Purpose |
|
|
231
|
+
| --- | --- | --- |
|
|
232
|
+
| `OPENAI_API_KEY` | API (summarization) | Key for the summarization step |
|
|
233
|
+
| `GPU_TYPE` | API (optional) | Set to `A100` to enable bfloat16 on the JAX backend; any other value (or unset) uses float16 |
|
|
234
|
+
| `EMAIL` | deploy helper | Labels / generates your SSH key |
|
|
235
|
+
| `HOST` | deploy helper | Public IP or hostname of your GPU instance |
|
|
236
|
+
| `SSH_USER` | deploy helper | SSH login user for your image (often `root`, but `ubuntu` on AWS, your username on GCP, `azureuser` on Azure) |
|
|
237
|
+
|
|
238
|
+
### 2. Provision and connect to the instance
|
|
239
|
+
|
|
240
|
+
Create a GPU instance with your provider using an **Ubuntu 22.04 + CUDA 12 + Docker** image and your SSH public key. Once it is running, note its public IP (set it as `HOST` in `.env`) and connect:
|
|
241
|
+
|
|
242
|
+
```shell
|
|
243
|
+
ssh $SSH_USER@$HOST -i ~/.ssh/<your_key>
|
|
244
|
+
```
|
|
245
|
+
|
|
246
|
+
> Use the login user your provider specifies for the image. `root` works on many bare-VM providers, but AWS Ubuntu AMIs use `ubuntu`, GCP uses your username, Azure uses `azureuser`, etc. Set it as `SSH_USER` in `.env`.
|
|
247
|
+
|
|
248
|
+
The notebook [`notebooks/Deploy_gpu_instance.ipynb`](notebooks/Deploy_gpu_instance.ipynb) automates the provider-neutral parts: generating an SSH key, copying the code to the host, and building/running the container.
|
|
249
|
+
|
|
250
|
+
### 3. Install NVIDIA drivers (if your image doesn't include them)
|
|
251
|
+
|
|
252
|
+
If the instance image already ships with working drivers, skip this. Otherwise run the bundled script on the instance and reboot to load them:
|
|
253
|
+
|
|
254
|
+
```shell
|
|
255
|
+
bash scripts/install_nvidia_driver.sh
|
|
256
|
+
sudo reboot
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
### 4. Get the code onto the instance
|
|
260
|
+
|
|
261
|
+
Clone it directly:
|
|
262
|
+
|
|
263
|
+
```shell
|
|
264
|
+
git clone https://github.com/poloniki/quint.git
|
|
265
|
+
cd quint
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
…or copy your local checkout up with `scp` (the deploy notebook does this for you).
|
|
269
|
+
|
|
270
|
+
### 5. Build and run
|
|
271
|
+
|
|
272
|
+
```shell
|
|
273
|
+
docker build -t quint --file Dockerfile.jax .
|
|
274
|
+
docker run --gpus all -p 80:80 --shm-size=1g --env-file .env quint
|
|
275
|
+
```
|
|
276
|
+
|
|
277
|
+
> The `--env-file .env` flag passes `OPENAI_API_KEY` (and optional `GPU_TYPE`) into the container, so make sure `.env` is present on the instance. Also ensure your provider's firewall / security group allows inbound TCP on port **80** — most clouds only open SSH (port 22) by default.
|
|
278
|
+
|
|
279
|
+
Your API is now available on the instance's public IP (port 80).
|
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# Quint: transcribe | chunk | summarize
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://raw.githubusercontent.com/poloniki/quint/master/frontend/logo.png" alt="Quint logo">
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
<p align="center">
|
|
8
|
+
<a href="https://github.com/poloniki/quint/actions/workflows/build.yml">
|
|
9
|
+
<img src="https://img.shields.io/github/actions/workflow/status/poloniki/quint/build.yml?branch=master&style=for-the-badge&logo=github&label=CI" alt="CI">
|
|
10
|
+
</a>
|
|
11
|
+
<a href="LICENSE">
|
|
12
|
+
<img src="https://img.shields.io/badge/License-MIT-green?style=for-the-badge" alt="License: MIT">
|
|
13
|
+
</a>
|
|
14
|
+
<a href="https://fastapi.tiangolo.com">
|
|
15
|
+
<img src="https://img.shields.io/badge/FastAPI-005571?style=for-the-badge&logo=fastapi" alt="FastAPI">
|
|
16
|
+
</a>
|
|
17
|
+
<a href="https://www.python.org/downloads/release/python-3100/">
|
|
18
|
+
<img src="https://img.shields.io/badge/python-3.10-3670A0?style=for-the-badge&logo=python&logoColor=ffdd54" alt="Python 3.10">
|
|
19
|
+
</a>
|
|
20
|
+
<a href="https://hub.docker.com/r/poloniki/quint">
|
|
21
|
+
<img src="https://img.shields.io/badge/docker-%230db7ed.svg?style=for-the-badge&logo=docker&logoColor=white" alt="Docker">
|
|
22
|
+
</a>
|
|
23
|
+
</p>
|
|
24
|
+
|
|
25
|
+
"Quint" is designed to enhance the podcast experience. It simplifies the process for users, making it easier for them to understand and navigate podcasts by providing concise summaries, highlights, and transcripts.
|
|
26
|
+
|
|
27
|
+
## Table of Contents
|
|
28
|
+
|
|
29
|
+
- [Main Functionality](#-main-functionality)
|
|
30
|
+
- [Quickstart](#-quickstart)
|
|
31
|
+
- [License](#-license)
|
|
32
|
+
- [Deploy on a GPU cloud](#-how-to-deploy-this-api-on-cloud)
|
|
33
|
+
|
|
34
|
+
## π Main Functionality
|
|
35
|
+
|
|
36
|
+
Below is a list of the core API endpoints offered by Quint:
|
|
37
|
+
|
|
38
|
+
Once the API is running (see [Quickstart](#-quickstart)), interactive docs are available at `/docs`.
|
|
39
|
+
|
|
40
|
+
### 🎥 YouTube Video Transcription
|
|
41
|
+
|
|
42
|
+
Provide a YouTube video ID. Quint fetches the video, extracts its audio, and returns a transcription.
|
|
43
|
+
|
|
44
|
+
```http
|
|
45
|
+
GET /youtube_transcript?video_id=YOUR_YOUTUBE_VIDEO_ID
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
```json
|
|
49
|
+
{ "transcript": "The transcribed text of the video goes here..." }
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
### 🎙️ Transcription from Audio File
|
|
53
|
+
|
|
54
|
+
Upload an audio file and receive its transcription in text format.
|
|
55
|
+
|
|
56
|
+
```http
|
|
57
|
+
POST /file_transcript
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```json
|
|
61
|
+
{ "transcript": "The transcribed text of the audio goes here..." }
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### π Text Chunking
|
|
65
|
+
|
|
66
|
+
Submit a lengthy text and get it divided into semantically meaningful chunks or paragraphs.
|
|
67
|
+
|
|
68
|
+
```http
|
|
69
|
+
POST /chunk
|
|
70
|
+
{ "body": "Your lengthy continuous text here..." }
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
```json
|
|
74
|
+
{ "output": ["Chunk 1", "Chunk 2", "..."] }
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
### π Highlight the Best Sentence
|
|
78
|
+
|
|
79
|
+
Submit a text and Quint returns the index of the most descriptive sentence based on the embeddings.
|
|
80
|
+
|
|
81
|
+
```http
|
|
82
|
+
POST /best_sentence
|
|
83
|
+
{ "body": "Your raw text here..." }
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
```json
|
|
87
|
+
{ "best_sentence_index": 5 }
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
### π YouTube Summary
|
|
91
|
+
|
|
92
|
+
Provide a YouTube video ID to get back a list of chunked summaries of the video.
|
|
93
|
+
|
|
94
|
+
```http
|
|
95
|
+
GET /youtube_summarize?video_id=YOUR_YOUTUBE_VIDEO_ID
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
```json
|
|
99
|
+
{ "summary": ["Summary of part 1", "Summary of part 2", "..."] }
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## 🧑‍💻 Quickstart
|
|
103
|
+
|
|
104
|
+
Run the API locally — CPU is fine for chunking and summarization; transcription is far faster on a GPU (see [deploy](#-how-to-deploy-this-api-on-cloud)).
|
|
105
|
+
|
|
106
|
+
```shell
|
|
107
|
+
git clone https://github.com/poloniki/quint.git
|
|
108
|
+
cd quint
|
|
109
|
+
make install # pip install -e .
|
|
110
|
+
cp env.sample .env # then set OPENAI_API_KEY
|
|
111
|
+
make run_api # serves on http://localhost:8083
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Then open `http://localhost:8083/docs` for the interactive API docs.
|
|
115
|
+
|
|
116
|
+
### Web UI (optional)
|
|
117
|
+
|
|
118
|
+
A small [Streamlit](https://streamlit.io) frontend lives in [`frontend/`](frontend/app.py). With the API running:
|
|
119
|
+
|
|
120
|
+
```shell
|
|
121
|
+
pip install -r frontend/requirements.txt
|
|
122
|
+
streamlit run frontend/app.py
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
Set `QUINT_API_URL` if the API isn't on `http://localhost:8083`.
|
|
126
|
+
|
|
127
|
+
## π License
|
|
128
|
+
|
|
129
|
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
|
130
|
+
|
|
131
|
+
## π How to deploy this API on cloud
|
|
132
|
+
|
|
133
|
+
Important note: I highly recommend using the JAX solution, as it is much faster than the OpenAI-proposed way. Please refer to this repo [Whisper JAX](https://github.com/sanchit-gandhi/whisper-jax) for more details. I will attach one of the tables from that repo:
|
|
134
|
+
|
|
135
|
+
**Table 1:** Average inference time in seconds for audio files of increasing length. GPU device is a single A100 40GB GPU.
|
|
136
|
+
TPU device is a single TPU v4-8.
|
|
137
|
+
|
|
138
|
+
<div align="center">
|
|
139
|
+
|
|
140
|
+
| | OpenAI | Transformers | Whisper JAX | Whisper JAX |
|
|
141
|
+
| --------- | ------- | ------------ | ----------- | ----------- |
|
|
142
|
+
| | | | | |
|
|
143
|
+
| Framework | PyTorch | PyTorch | JAX | JAX |
|
|
144
|
+
| Backend | GPU | GPU | GPU | TPU |
|
|
145
|
+
| | | | | |
|
|
146
|
+
| 1 min | 13.8 | 4.54 | 1.72 | 0.45 |
|
|
147
|
+
| 10 min | 108.3 | 20.2 | 9.38 | 2.01 |
|
|
148
|
+
| 1 hour | 1001.0 | 126.1 | 75.3 | 13.8 |
|
|
149
|
+
| | | | | |
|
|
150
|
+
|
|
151
|
+
</div>
|
|
152
|
+
|
|
153
|
+
### Choosing a GPU cloud provider
|
|
154
|
+
|
|
155
|
+
Quint runs on any machine with an NVIDIA GPU, so you are free to use whichever cloud provider (AWS, GCP, Azure, Lambda, Paperspace, RunPod, …) or on-prem hardware you prefer. For the best price/performance on transcription, look for an **Ada- or Ampere-generation workstation card** such as the RTX 6000 Ada or RTX A6000 — these are typically far cheaper than A100-class GPUs while offering comparable [CUDA compute capability](https://developer.nvidia.com/cuda-gpus).
|
|
156
|
+
|
|
157
|
+
Whatever you pick, you only need an instance that provides:
|
|
158
|
+
|
|
159
|
+
- An **NVIDIA GPU** (Ampere/Ada or newer recommended)
|
|
160
|
+
- **Ubuntu 22.04** (or similar) with **CUDA 12** and **Docker**
|
|
161
|
+
- **SSH access** (root or sudo)
|
|
162
|
+
|
|
163
|
+
The steps below are provider-neutral: provision the instance however your provider requires, then follow along.
|
|
164
|
+
|
|
165
|
+
### 1. Configure your environment
|
|
166
|
+
|
|
167
|
+
```shell
|
|
168
|
+
cp env.sample .env # then edit .env
|
|
169
|
+
direnv reload # or: source .env
|
|
170
|
+
```
|
|
171
|
+
|
|
172
|
+
Set the following in `.env`:
|
|
173
|
+
|
|
174
|
+
| Variable | Used by | Purpose |
|
|
175
|
+
| --- | --- | --- |
|
|
176
|
+
| `OPENAI_API_KEY` | API (summarization) | Key for the summarization step |
|
|
177
|
+
| `GPU_TYPE` | API (optional) | Set to `A100` to enable bfloat16 on the JAX backend; any other value (or unset) uses float16 |
|
|
178
|
+
| `EMAIL` | deploy helper | Labels / generates your SSH key |
|
|
179
|
+
| `HOST` | deploy helper | Public IP or hostname of your GPU instance |
|
|
180
|
+
| `SSH_USER` | deploy helper | SSH login user for your image (often `root`, but `ubuntu` on AWS, your username on GCP, `azureuser` on Azure) |
|
|
181
|
+
|
|
182
|
+
### 2. Provision and connect to the instance
|
|
183
|
+
|
|
184
|
+
Create a GPU instance with your provider using an **Ubuntu 22.04 + CUDA 12 + Docker** image and your SSH public key. Once it is running, note its public IP (set it as `HOST` in `.env`) and connect:
|
|
185
|
+
|
|
186
|
+
```shell
|
|
187
|
+
ssh $SSH_USER@$HOST -i ~/.ssh/<your_key>
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
> Use the login user your provider specifies for the image. `root` works on many bare-VM providers, but AWS Ubuntu AMIs use `ubuntu`, GCP uses your username, Azure uses `azureuser`, etc. Set it as `SSH_USER` in `.env`.
|
|
191
|
+
|
|
192
|
+
The notebook [`notebooks/Deploy_gpu_instance.ipynb`](notebooks/Deploy_gpu_instance.ipynb) automates the provider-neutral parts: generating an SSH key, copying the code to the host, and building/running the container.
|
|
193
|
+
|
|
194
|
+
### 3. Install NVIDIA drivers (if your image doesn't include them)
|
|
195
|
+
|
|
196
|
+
If the instance image already ships with working drivers, skip this. Otherwise run the bundled script on the instance and reboot to load them:
|
|
197
|
+
|
|
198
|
+
```shell
|
|
199
|
+
bash scripts/install_nvidia_driver.sh
|
|
200
|
+
sudo reboot
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### 4. Get the code onto the instance
|
|
204
|
+
|
|
205
|
+
Clone it directly:
|
|
206
|
+
|
|
207
|
+
```shell
|
|
208
|
+
git clone https://github.com/poloniki/quint.git
|
|
209
|
+
cd quint
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
…or copy your local checkout up with `scp` (the deploy notebook does this for you).
|
|
213
|
+
|
|
214
|
+
### 5. Build and run
|
|
215
|
+
|
|
216
|
+
```shell
|
|
217
|
+
docker build -t quint --file Dockerfile.jax .
|
|
218
|
+
docker run --gpus all -p 80:80 --shm-size=1g --env-file .env quint
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
> The `--env-file .env` flag passes `OPENAI_API_KEY` (and optional `GPU_TYPE`) into the container, so make sure `.env` is present on the instance. Also ensure your provider's firewall / security group allows inbound TCP on port **80** — most clouds only open SSH (port 22) by default.
|
|
222
|
+
|
|
223
|
+
Your API is now available on the instance's public IP (port 80).
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
# Build backend + tooling config. Package metadata lives in setup.py and
|
|
2
|
+
# dependencies in requirements.txt.
|
|
3
|
+
|
|
4
|
+
[build-system]
|
|
5
|
+
requires = ["setuptools>=61", "wheel"]
|
|
6
|
+
build-backend = "setuptools.build_meta"
|
|
7
|
+
|
|
8
|
+
[tool.ruff]
|
|
9
|
+
target-version = "py310"
|
|
10
|
+
line-length = 88
|
|
11
|
+
# Notebooks are exploratory; skip them.
|
|
12
|
+
extend-exclude = ["notebooks"]
|
|
13
|
+
|
|
14
|
+
[tool.ruff.lint]
|
|
15
|
+
# Mirror ruff's default rule set explicitly so CI is deterministic across versions.
|
|
16
|
+
select = ["E4", "E7", "E9", "F"]
|
|
File without changes
|