@mynamezxc/mow-speech-to-text 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +38 -0
- package/README.md +572 -0
- package/package.json +55 -0
- package/src/cli.js +648 -0
- package/src/index.js +45 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Mynamezxc
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
MOW Speech-to-Text is powered by OpenAI Whisper
|
|
26
|
+
- OpenAI Whisper: https://github.com/openai/whisper
|
|
27
|
+
- Whisper License: MIT
|
|
28
|
+
|
|
29
|
+
Dependencies:
|
|
30
|
+
- Express.js: https://expressjs.com (MIT)
|
|
31
|
+
- Chalk: https://github.com/chalk/chalk (MIT)
|
|
32
|
+
- Figlet: https://github.com/patorjk/figlet.js (MIT)
|
|
33
|
+
- Gradient-string: https://github.com/bokub/gradient-string (MIT)
|
|
34
|
+
- Multer: https://github.com/expressjs/multer (MIT)
|
|
35
|
+
- Axios: https://github.com/axios/axios (MIT)
|
|
36
|
+
- Progress: https://github.com/visionmedia/node-progress (MIT)
|
|
37
|
+
|
|
38
|
+
For full dependency licenses, see package.json and npm documentation.
|
package/README.md
ADDED
|
@@ -0,0 +1,572 @@
|
|
|
1
|
+
# @mynamezxc/mow-speech-to-text
|
|
2
|
+
|
|
3
|
+
> 🎙️ Advanced speech-to-text transcription tool powered by OpenAI Whisper with GPU acceleration support.
|
|
4
|
+
>
|
|
5
|
+
> Convert audio files (WAV, MP3, MP4, M4A, FLAC, OGG, WebM) to text using state-of-the-art AI models.
|
|
6
|
+
|
|
7
|
+
**Powered by [Mynamezxc](https://www.npmjs.com/~mynamezxc)**
|
|
8
|
+
|
|
9
|
+
---
|
|
10
|
+
|
|
11
|
+
## ⨠Features
|
|
12
|
+
|
|
13
|
+
- š **Fast transcription** with OpenAI Whisper models
|
|
14
|
+
- šÆ **Multiple models** - Tiny, Base, Small, Medium, Large, Turbo
|
|
15
|
+
- š§ **GPU acceleration** with automatic CPU fallback
|
|
16
|
+
- š **Multi-language** support with auto-detection
|
|
17
|
+
- š **Multiple formats** - WAV, MP3, MP4, M4A, FLAC, OGG, WebM
|
|
18
|
+
- š **REST API** - Start a local server for programmatic access
|
|
19
|
+
- š» **CLI tool** - Simple command-line interface
|
|
20
|
+
- š **Progress tracking** - Real-time transcription progress
|
|
21
|
+
- šØ **Beautiful TUI** - 3D banner and styled output
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## š Requirements
|
|
26
|
+
|
|
27
|
+
- **Node.js** ≥ 18
|
|
28
|
+
- **Python** ≥ 3.8 (for Whisper)
|
|
29
|
+
- **PyTorch** (with CUDA for GPU support - optional)
|
|
30
|
+
- **FFmpeg** (for audio processing)
|
|
31
|
+
|
|
32
|
+
### Install Dependencies
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Python packages for Whisper
|
|
36
|
+
pip install openai-whisper torch
|
|
37
|
+
|
|
38
|
+
# For GPU acceleration (CUDA)
|
|
39
|
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
40
|
+
|
|
41
|
+
# FFmpeg
|
|
42
|
+
# On Windows: choco install ffmpeg
|
|
43
|
+
# On macOS: brew install ffmpeg
|
|
44
|
+
# On Linux: sudo apt-get install ffmpeg
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
---
|
|
48
|
+
|
|
49
|
+
## š¦ Installation
|
|
50
|
+
|
|
51
|
+
### Global Installation (Recommended)
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
npm install -g @mynamezxc/mow-speech-to-text
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Then use commands:
|
|
58
|
+
- `mow convert <file>` - Convert audio to text
|
|
59
|
+
- `mow serve [port]` - Start API server
|
|
60
|
+
|
|
61
|
+
### Local Installation
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
npm install @mynamezxc/mow-speech-to-text
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
Then use via `npx`:
|
|
68
|
+
```bash
|
|
69
|
+
npx mow convert audio.mp3
|
|
70
|
+
npx mow serve 3000
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
---
|
|
74
|
+
|
|
75
|
+
## š Quick Start
|
|
76
|
+
|
|
77
|
+
### Convert Audio to Text
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
# Basic conversion (uses default 'base' model)
|
|
81
|
+
mow convert audio.mp3
|
|
82
|
+
|
|
83
|
+
# Specify model
|
|
84
|
+
mow convert audio.mp3 --model large
|
|
85
|
+
|
|
86
|
+
# Specify output file
|
|
87
|
+
mow convert audio.mp3 --output result.txt
|
|
88
|
+
|
|
89
|
+
# Specify language
|
|
90
|
+
mow convert audio.mp3 --language vi --model medium
|
|
91
|
+
|
|
92
|
+
# Force CPU (disable GPU)
|
|
93
|
+
mow convert audio.mp3 --force-cpu --model small
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Start API Server
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
# Run on default port (3000)
|
|
100
|
+
mow serve
|
|
101
|
+
|
|
102
|
+
# Run on specific port
|
|
103
|
+
mow serve 8080
|
|
104
|
+
|
|
105
|
+
# Or use mow-serve command
|
|
106
|
+
mow-serve 3000
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## š Usage Guide
|
|
112
|
+
|
|
113
|
+
### Command: `mow convert`
|
|
114
|
+
|
|
115
|
+
Convert audio/video files to text transcription.
|
|
116
|
+
|
|
117
|
+
#### Syntax
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
mow convert <input> [options]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
#### Parameters
|
|
124
|
+
|
|
125
|
+
| Parameter | Type | Default | Description |
|
|
126
|
+
|-----------|------|---------|-------------|
|
|
127
|
+
| `input` | string | required | Path to audio/video file |
|
|
128
|
+
| `--model` | string | `base` | Whisper model: tiny, base, small, medium, large, turbo |
|
|
129
|
+
| `--output` | string | auto | Output file path (auto-generated if not specified) |
|
|
130
|
+
| `--language` | string | auto | Language code (e.g., en, vi, ja, es, fr, de) |
|
|
131
|
+
| `--force-cpu` | flag | false | Force CPU usage, disable GPU |
|
|
132
|
+
|
|
133
|
+
#### Model Comparison
|
|
134
|
+
|
|
135
|
+
| Model | Size | Speed | Accuracy | VRAM Used |
|
|
136
|
+
|--------|------|-------|----------|-----------|
|
|
137
|
+
| tiny | 39M | Fast | Low | 1GB |
|
|
138
|
+
| base | 74M | Fast | Medium | 1GB |
|
|
139
|
+
| small | 244M | Good | Good | 2GB |
|
|
140
|
+
| medium | 769M | Okay | High | 5GB |
|
|
141
|
+
| large | 1.5B | Slow | Very High | 10GB |
|
|
142
|
+
| turbo | 809M | Fast | High | 6GB |
|
|
143
|
+
|
|
144
|
+
#### Examples
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
# English transcription with small model
|
|
148
|
+
mow convert meeting.mp4 --model small
|
|
149
|
+
|
|
150
|
+
# Vietnamese transcription with GPU acceleration
|
|
151
|
+
mow convert lecture.m4a --language vi --model medium
|
|
152
|
+
|
|
153
|
+
# MP3 to text with custom output
|
|
154
|
+
mow convert podcast.mp3 --output transcript.txt --model large
|
|
155
|
+
|
|
156
|
+
# Force CPU for large file without GPU
|
|
157
|
+
mow convert large_video.mp4 --force-cpu --model base
|
|
158
|
+
|
|
159
|
+
# Auto-detect language
|
|
160
|
+
mow convert unknown_speech.wav --model medium
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
#### Output
|
|
164
|
+
|
|
165
|
+
ā Saved to: `[filename]_transcript.txt`
|
|
166
|
+
|
|
167
|
+
Example output structure:
|
|
168
|
+
```
|
|
169
|
+
Input: /path/to/audio.mp3
|
|
170
|
+
Model: large
|
|
171
|
+
Device: GPU
|
|
172
|
+
Output: /path/to/audio_transcript.txt
|
|
173
|
+
|
|
174
|
+
[Progress bar animation]
|
|
175
|
+
ā Transcription completed successfully!
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
---
|
|
179
|
+
|
|
180
|
+
### Command: `mow serve`
|
|
181
|
+
|
|
182
|
+
Start a local REST API server for transcription.
|
|
183
|
+
|
|
184
|
+
#### Syntax
|
|
185
|
+
|
|
186
|
+
```bash
|
|
187
|
+
mow serve [port]
|
|
188
|
+
```
|
|
189
|
+
|
|
190
|
+
#### Parameters
|
|
191
|
+
|
|
192
|
+
| Parameter | Type | Default | Description |
|
|
193
|
+
|-----------|------|---------|-------------|
|
|
194
|
+
| `port` | number | 3000 | Server port |
|
|
195
|
+
|
|
196
|
+
#### Examples
|
|
197
|
+
|
|
198
|
+
```bash
|
|
199
|
+
# Start on port 3000
|
|
200
|
+
mow serve
|
|
201
|
+
|
|
202
|
+
# Start on port 8080
|
|
203
|
+
mow serve 8080
|
|
204
|
+
|
|
205
|
+
# Using mow-serve command
|
|
206
|
+
mow-serve 5000
|
|
207
|
+
```
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
## š REST API Endpoints
|
|
212
|
+
|
|
213
|
+
When running `mow serve`, the following endpoints are available:
|
|
214
|
+
|
|
215
|
+
### 1. Health Check
|
|
216
|
+
|
|
217
|
+
```http
|
|
218
|
+
GET /health
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
**Response:**
|
|
222
|
+
```json
|
|
223
|
+
{
|
|
224
|
+
"status": "ok",
|
|
225
|
+
"service": "MOW Speech-to-Text API",
|
|
226
|
+
"version": "1.0.0",
|
|
227
|
+
"device": "cuda",
|
|
228
|
+
"models": ["tiny", "base", "small", "medium", "large", "turbo"]
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
### 2. List Available Models
|
|
233
|
+
|
|
234
|
+
```http
|
|
235
|
+
GET /api/models
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
**Response:**
|
|
239
|
+
```json
|
|
240
|
+
{
|
|
241
|
+
"models": ["tiny", "base", "small", "medium", "large", "turbo"],
|
|
242
|
+
"default": "base",
|
|
243
|
+
"device": "cuda"
|
|
244
|
+
}
|
|
245
|
+
```
|
|
246
|
+
|
|
247
|
+
### 3. Transcribe Audio
|
|
248
|
+
|
|
249
|
+
```http
|
|
250
|
+
POST /api/transcribe
|
|
251
|
+
Content-Type: multipart/form-data
|
|
252
|
+
```
|
|
253
|
+
|
|
254
|
+
**Form Parameters:**
|
|
255
|
+
|
|
256
|
+
| Parameter | Type | Required | Description |
|
|
257
|
+
|-----------|------|----------|-------------|
|
|
258
|
+
| `audio` | file | Yes | Audio file (WAV, MP3, MP4, etc.) |
|
|
259
|
+
| `model` | string | No | Model name (default: base) |
|
|
260
|
+
| `language` | string | No | Language code (optional, auto-detect if omitted) |
|
|
261
|
+
|
|
262
|
+
**Request Example (cURL):**
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
curl -X POST http://localhost:3000/api/transcribe \
|
|
266
|
+
-F "audio=@audio.mp3" \
|
|
267
|
+
-F "model=medium" \
|
|
268
|
+
-F "language=en"
|
|
269
|
+
```
|
|
270
|
+
|
|
271
|
+
**Response:**
|
|
272
|
+
```json
|
|
273
|
+
{
|
|
274
|
+
"text": "This is the transcribed text from the audio file.",
|
|
275
|
+
"model": "medium",
|
|
276
|
+
"language": "en",
|
|
277
|
+
"duration": 120,
|
|
278
|
+
"language_confidence": 0.95
|
|
279
|
+
}
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
### 4. API Documentation
|
|
283
|
+
|
|
284
|
+
```http
|
|
285
|
+
GET /api/docs
|
|
286
|
+
```
|
|
287
|
+
|
|
288
|
+
**Response:**
|
|
289
|
+
```json
|
|
290
|
+
{
|
|
291
|
+
"service": "MOW Speech-to-Text API",
|
|
292
|
+
"version": "1.0.0",
|
|
293
|
+
"endpoints": {
|
|
294
|
+
"GET /health": "Check API health status",
|
|
295
|
+
"GET /api/models": "List available Whisper models",
|
|
296
|
+
"POST /api/transcribe": "Transcribe audio file",
|
|
297
|
+
"GET /api/docs": "This documentation"
|
|
298
|
+
},
|
|
299
|
+
"docs_url": "https://www.npmjs.com/package/@mynamezxc/mow-speech-to-text",
|
|
300
|
+
"github_url": "https://github.com/mynamezxc/mow-speech-to-text"
|
|
301
|
+
}
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
---
|
|
305
|
+
|
|
306
|
+
## š» Usage Examples
|
|
307
|
+
|
|
308
|
+
### JavaScript / Node.js
|
|
309
|
+
|
|
310
|
+
```javascript
|
|
311
|
+
const fs = require('fs');
|
|
312
|
+
const FormData = require('form-data');
|
|
313
|
+
const axios = require('axios');
|
|
314
|
+
|
|
315
|
+
async function transcribeAudio(filePath, model = 'base') {
|
|
316
|
+
const form = new FormData();
|
|
317
|
+
form.append('audio', fs.createReadStream(filePath));
|
|
318
|
+
form.append('model', model);
|
|
319
|
+
|
|
320
|
+
try {
|
|
321
|
+
const response = await axios.post(
|
|
322
|
+
'http://localhost:3000/api/transcribe',
|
|
323
|
+
form,
|
|
324
|
+
{ headers: form.getHeaders() }
|
|
325
|
+
);
|
|
326
|
+
|
|
327
|
+
console.log('Transcription:', response.data.text);
|
|
328
|
+
return response.data;
|
|
329
|
+
} catch (error) {
|
|
330
|
+
console.error('Error:', error.message);
|
|
331
|
+
}
|
|
332
|
+
}
|
|
333
|
+
|
|
334
|
+
// Usage
|
|
335
|
+
transcribeAudio('meeting.mp3', 'medium');
|
|
336
|
+
```
|
|
337
|
+
|
|
338
|
+
### Python
|
|
339
|
+
|
|
340
|
+
```python
|
|
341
|
+
import requests
|
|
342
|
+
|
|
343
|
+
def transcribe_audio(file_path, model='base', language=None):
|
|
344
|
+
url = 'http://localhost:3000/api/transcribe'
|
|
345
|
+
|
|
346
|
+
with open(file_path, 'rb') as f:
|
|
347
|
+
files = {'audio': f}
|
|
348
|
+
data = {'model': model}
|
|
349
|
+
if language:
|
|
350
|
+
data['language'] = language
|
|
351
|
+
|
|
352
|
+
response = requests.post(url, files=files, data=data)
|
|
353
|
+
return response.json()
|
|
354
|
+
|
|
355
|
+
# Usage
|
|
356
|
+
result = transcribe_audio('lecture.wav', model='medium', language='en')
|
|
357
|
+
print(result['text'])
|
|
358
|
+
```
|
|
359
|
+
|
|
360
|
+
### cURL
|
|
361
|
+
|
|
362
|
+
```bash
|
|
363
|
+
# Basic transcription
|
|
364
|
+
curl -X POST http://localhost:3000/api/transcribe \
|
|
365
|
+
-F "audio=@video.mp4"
|
|
366
|
+
|
|
367
|
+
# With model and language
|
|
368
|
+
curl -X POST http://localhost:3000/api/transcribe \
|
|
369
|
+
-F "audio=@podcast.mp3" \
|
|
370
|
+
-F "model=large" \
|
|
371
|
+
-F "language=vi"
|
|
372
|
+
|
|
373
|
+
# Save response to file
|
|
374
|
+
curl -X POST http://localhost:3000/api/transcribe \
|
|
375
|
+
-F "audio=@meeting.m4a" \
|
|
376
|
+
-F "model=medium" \
|
|
377
|
+
| jq '.text' > transcript.txt
|
|
378
|
+
```
|
|
379
|
+
|
|
380
|
+
---
|
|
381
|
+
|
|
382
|
+
## šÆ Use Cases
|
|
383
|
+
|
|
384
|
+
### Meeting Transcription
|
|
385
|
+
|
|
386
|
+
```bash
|
|
387
|
+
mow convert meeting_recording.mp4 --model medium --language en
|
|
388
|
+
```
|
|
389
|
+
|
|
390
|
+
### Podcast Processing
|
|
391
|
+
|
|
392
|
+
```bash
|
|
393
|
+
mow serve 3000
|
|
394
|
+
# Then POST to /api/transcribe with your podcast files
|
|
395
|
+
```
|
|
396
|
+
|
|
397
|
+
### Lecture Notes
|
|
398
|
+
|
|
399
|
+
```bash
|
|
400
|
+
mow convert lecture.m4a --model large --output notes.txt
|
|
401
|
+
```
|
|
402
|
+
|
|
403
|
+
### Multi-language Support
|
|
404
|
+
|
|
405
|
+
```bash
|
|
406
|
+
mow convert spanish.wav --language es --model medium
|
|
407
|
+
mow convert vietnamese.mp3 --language vi --model base
|
|
408
|
+
```
|
|
409
|
+
|
|
410
|
+
### Batch Processing with Server
|
|
411
|
+
|
|
412
|
+
```javascript
|
|
413
|
+
const fs = require('fs');
|
|
414
|
+
const audioFiles = fs.readdirSync('./audio');
|
|
415
|
+
|
|
416
|
+
for (const file of audioFiles) {
|
|
417
|
+
axios.post('http://localhost:3000/api/transcribe', {
|
|
418
|
+
audio: fs.createReadStream(`./audio/${file}`)
|
|
419
|
+
});
|
|
420
|
+
}
|
|
421
|
+
```
|
|
422
|
+
|
|
423
|
+
---
|
|
424
|
+
|
|
425
|
+
## š§ Configuration
|
|
426
|
+
|
|
427
|
+
### GPU vs CPU
|
|
428
|
+
|
|
429
|
+
#### Automatic Detection (Default)
|
|
430
|
+
|
|
431
|
+
The tool automatically detects and uses GPU if available:
|
|
432
|
+
|
|
433
|
+
```bash
|
|
434
|
+
mow convert audio.mp3 --model medium
|
|
435
|
+
# Automatically uses GPU if CUDA is available
|
|
436
|
+
```
|
|
437
|
+
|
|
438
|
+
#### Force CPU Mode
|
|
439
|
+
|
|
440
|
+
```bash
|
|
441
|
+
mow convert audio.mp3 --force-cpu
|
|
442
|
+
# Always uses CPU, even if GPU is available
|
|
443
|
+
```
|
|
444
|
+
|
|
445
|
+
#### Check GPU Availability
|
|
446
|
+
|
|
447
|
+
```python
|
|
448
|
+
import torch
|
|
449
|
+
print(f"GPU Available: {torch.cuda.is_available()}")
|
|
450
|
+
print(f"Current Device: {torch.cuda.get_device_name(0)}")
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
### Performance Tips
|
|
454
|
+
|
|
455
|
+
- **Tiny/Base models**: Fast, good for real-time transcription
|
|
456
|
+
- **Medium/Large models**: Slower but more accurate
|
|
457
|
+
- **GPU**: 5-10x faster than CPU (if available)
|
|
458
|
+
- **Batch processing**: Use the server for multiple files
|
|
459
|
+
|
|
460
|
+
---
|
|
461
|
+
|
|
462
|
+
## š Troubleshooting
|
|
463
|
+
|
|
464
|
+
### Python Not Found
|
|
465
|
+
|
|
466
|
+
```
|
|
467
|
+
Error: Python not found
|
|
468
|
+
```
|
|
469
|
+
|
|
470
|
+
**Solution:** Install Python or add it to PATH
|
|
471
|
+
|
|
472
|
+
```bash
|
|
473
|
+
# On Windows, add Python to PATH or use full path
|
|
474
|
+
C:\Python39\python.exe -m pip install openai-whisper
|
|
475
|
+
```
|
|
476
|
+
|
|
477
|
+
### FFmpeg Not Found
|
|
478
|
+
|
|
479
|
+
```
|
|
480
|
+
Error: ffmpeg not found
|
|
481
|
+
```
|
|
482
|
+
|
|
483
|
+
**Solution:** Install FFmpeg
|
|
484
|
+
|
|
485
|
+
```bash
|
|
486
|
+
# Windows (Chocolatey)
|
|
487
|
+
choco install ffmpeg
|
|
488
|
+
|
|
489
|
+
# macOS (Homebrew)
|
|
490
|
+
brew install ffmpeg
|
|
491
|
+
|
|
492
|
+
# Linux (Ubuntu)
|
|
493
|
+
sudo apt-get install ffmpeg
|
|
494
|
+
```
|
|
495
|
+
|
|
496
|
+
### GPU Not Detected
|
|
497
|
+
|
|
498
|
+
```bash
|
|
499
|
+
# Check PyTorch GPU support
|
|
500
|
+
python -c "import torch; print(torch.cuda.is_available())"
|
|
501
|
+
|
|
502
|
+
# If False, install GPU version
|
|
503
|
+
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
|
|
504
|
+
```
|
|
505
|
+
|
|
506
|
+
### Out of Memory Error
|
|
507
|
+
|
|
508
|
+
**Solution:** Use a smaller model
|
|
509
|
+
|
|
510
|
+
```bash
|
|
511
|
+
# Instead of 'large', use 'base' or 'small'
|
|
512
|
+
mow convert audio.mp3 --model small --force-cpu
|
|
513
|
+
```
|
|
514
|
+
|
|
515
|
+
### Module Not Found Errors
|
|
516
|
+
|
|
517
|
+
```bash
|
|
518
|
+
# Reinstall all dependencies
|
|
519
|
+
npm install -g @mynamezxc/mow-speech-to-text
|
|
520
|
+
|
|
521
|
+
# Reinstall Python packages
|
|
522
|
+
pip install openai-whisper torch --upgrade
|
|
523
|
+
```
|
|
524
|
+
|
|
525
|
+
---
|
|
526
|
+
|
|
527
|
+
## š Performance Benchmarks
|
|
528
|
+
|
|
529
|
+
Approximate transcription times on Intel i7-12700K with RTX 3080:
|
|
530
|
+
|
|
531
|
+
| Model | 1 hour audio | Device |
|
|
532
|
+
|--------|-------------|--------|
|
|
533
|
+
| tiny | 2 min | GPU |
|
|
534
|
+
| base | 5 min | GPU |
|
|
535
|
+
| small | 8 min | GPU |
|
|
536
|
+
| medium | 15 min | GPU |
|
|
537
|
+
| large | 25 min | GPU |
|
|
538
|
+
|
|
539
|
+
*Times vary based on audio quality and system specifications*
|
|
540
|
+
|
|
541
|
+
---
|
|
542
|
+
|
|
543
|
+
## š Documentation
|
|
544
|
+
|
|
545
|
+
- **Full Documentation:** https://www.npmjs.com/package/@mynamezxc/mow-speech-to-text
|
|
546
|
+
- **API Reference:** Check `/api/docs` when server is running
|
|
547
|
+
- **GitHub:** https://github.com/mynamezxc/mow-speech-to-text
|
|
548
|
+
- **OpenAI Whisper:** https://github.com/openai/whisper
|
|
549
|
+
|
|
550
|
+
---
|
|
551
|
+
|
|
552
|
+
## š¤ Contributing
|
|
553
|
+
|
|
554
|
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
|
555
|
+
|
|
556
|
+
---
|
|
557
|
+
|
|
558
|
+
## š License
|
|
559
|
+
|
|
560
|
+
MIT License - Copyright Ā© 2025 mynamezxc
|
|
561
|
+
|
|
562
|
+
---
|
|
563
|
+
|
|
564
|
+
## š Links
|
|
565
|
+
|
|
566
|
+
- **NPM Package:** [@mynamezxc/mow-speech-to-text](https://www.npmjs.com/package/@mynamezxc/mow-speech-to-text)
|
|
567
|
+
- **GitHub:** [mynamezxc/mow-speech-to-text](https://github.com/mynamezxc/mow-speech-to-text)
|
|
568
|
+
- **Author:** [Mynamezxc](https://www.npmjs.com/~mynamezxc)
|
|
569
|
+
|
|
570
|
+
---
|
|
571
|
+
|
|
572
|
+
**Made with ā¤ļø by Mynamezxc**
|
package/package.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mynamezxc/mow-speech-to-text",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"author": "mynamezxc",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"description": "Advanced speech-to-text transcription tool using OpenAI Whisper with GPU acceleration support",
|
|
7
|
+
"main": "src/cli.js",
|
|
8
|
+
"bin": {
|
|
9
|
+
"mow": "src/cli.js",
|
|
10
|
+
"mow-serve": "src/cli.js"
|
|
11
|
+
},
|
|
12
|
+
"files": [
|
|
13
|
+
"src"
|
|
14
|
+
],
|
|
15
|
+
"keywords": [
|
|
16
|
+
"speech-to-text",
|
|
17
|
+
"whisper",
|
|
18
|
+
"transcription",
|
|
19
|
+
"audio",
|
|
20
|
+
"gpu",
|
|
21
|
+
"openai",
|
|
22
|
+
"tui",
|
|
23
|
+
"cli"
|
|
24
|
+
],
|
|
25
|
+
"engines": {
|
|
26
|
+
"node": ">=18"
|
|
27
|
+
},
|
|
28
|
+
"publishConfig": {
|
|
29
|
+
"access": "public"
|
|
30
|
+
},
|
|
31
|
+
"scripts": {
|
|
32
|
+
"start": "node src/cli.js",
|
|
33
|
+
"serve": "node src/cli.js serve",
|
|
34
|
+
"convert": "node src/cli.js convert",
|
|
35
|
+
"test": "node src/cli.js --help"
|
|
36
|
+
},
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"chalk": "^4.1.2",
|
|
39
|
+
"openai": "^4.52.0",
|
|
40
|
+
"express": "^4.18.2",
|
|
41
|
+
"cors": "^2.8.5",
|
|
42
|
+
"multer": "^1.4.5-lts.1",
|
|
43
|
+
|
|
44
|
+
"figlet": "^1.6.0",
|
|
45
|
+
"gradient-string": "^2.0.2",
|
|
46
|
+
"axios": "^1.6.2",
|
|
47
|
+
"progress": "^2.0.3"
|
|
48
|
+
},
|
|
49
|
+
"devDependencies": {
|
|
50
|
+
"typescript": "^5.8.2"
|
|
51
|
+
},
|
|
52
|
+
"optionalDependencies": {
|
|
53
|
+
"onnxruntime-gpu-webgpu": "^1.17.0"
|
|
54
|
+
}
|
|
55
|
+
}
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,648 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
const path = require('path');
|
|
4
|
+
const fs = require('fs');
|
|
5
|
+
const readline = require('readline/promises');
|
|
6
|
+
const { spawn } = require('child_process');
|
|
7
|
+
const chalk = require('chalk');
|
|
8
|
+
const figlet = require('figlet');
|
|
9
|
+
const gradient = require('gradient-string');
|
|
10
|
+
const axios = require('axios');
|
|
11
|
+
const ProgressBar = require('progress');
|
|
12
|
+
const { version } = require('../package.json');
|
|
13
|
+
|
|
14
|
+
const SUPPORTED_FORMATS = ['.wav', '.mp3', '.mp4', '.m4a', '.flac', '.ogg', '.webm'];
|
|
15
|
+
const WHISPER_MODELS = ['tiny', 'base', 'small', 'medium', 'large', 'turbo'];
|
|
16
|
+
const DEFAULT_MODEL = 'base';
|
|
17
|
+
const DEFAULT_PORT = 3000;
|
|
18
|
+
const API_DOCS_URL = 'https://www.npmjs.com/package/@mynamezxc/mow-speech-to-text';
|
|
19
|
+
const GITHUB_URL = 'https://github.com/mynamezxc/mow-speech-to-text';
|
|
20
|
+
|
|
21
|
+
// ============= Banner TUI 3D Style =============
|
|
22
|
+
/**
 * Clears the terminal and renders the MOW startup banner: a gradient
 * figlet title, a horizontal rule, version/owner lines, and a tagline.
 * Purely cosmetic — no return value, no side effects beyond stdout.
 */
function printBanner() {
  console.clear();

  // The same gradient rule is printed above and below the info lines.
  const rule = gradient('blue', 'cyan')(
    'āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā'
  );

  const bannerText = figlet.textSync('MOW', {
    horizontalLayout: 'default',
    verticalLayout: 'default',
    width: 100,
    whitespaceBreak: true
  });

  console.log(gradient('cyan', 'magenta', 'cyan')(bannerText));
  console.log(rule);
  console.log(chalk.bold.cyan(' šļø MOW - Speech-to-Text Transcription Tool'));
  console.log(chalk.bold.yellow(` Version: ${version}`));
  console.log(chalk.bold.green(` Owner: @mynamezxc`));
  console.log(rule);
  console.log(chalk.gray.italic('\n Powered by OpenAI Whisper with GPU Acceleration Support\n'));
}
|
|
52
|
+
|
|
53
|
+
// ============= Helper Functions =============
|
|
54
|
+
/**
 * Prints CLI usage, commands, convert options, examples, and the
 * documentation/GitHub links to stdout. No return value.
 */
function printHelp() {
  const helpLines = [
    chalk.bold('Usage:'),
    ' mow <command> [options]\n',
    chalk.bold('Commands:'),
    chalk.cyan(' convert <input> [options] - Convert audio file to text'),
    chalk.cyan(' serve [port] - Start REST API server'),
    chalk.cyan(' help - Show this help message\n'),
    chalk.bold('Convert Options:'),
    chalk.yellow(' --model <name> - Model: tiny, base, small, medium, large, turbo (default: base)'),
    chalk.yellow(' --output <file> - Output file path (optional)'),
    chalk.yellow(' --language <code> - Language code (e.g., en, vi, ja) (optional)'),
    chalk.yellow(' --force-cpu - Force CPU usage (disable GPU)\n'),
    chalk.bold('Examples:'),
    chalk.gray(' mow convert audio.mp3 --model large'),
    chalk.gray(' mow convert video.mp4 --output transcript.txt --language vi'),
    chalk.gray(' mow serve 3000\n'),
    chalk.bold('Documentation:'),
    chalk.blue(` ${API_DOCS_URL}\n`),
    chalk.bold('GitHub:'),
    chalk.blue(` ${GITHUB_URL}\n`)
  ];

  for (const line of helpLines) {
    console.log(line);
  }
}
|
|
80
|
+
|
|
81
|
+
// ============= Argument Parser =============
|
|
82
|
+
/**
 * Parses raw CLI arguments into a structured command descriptor.
 *
 * Layout: args[0] is the command (defaults to 'help'), args[1] is the
 * positional input, and everything after is scanned for `--flag [value]`
 * pairs. A flag followed by another `--flag` (or by nothing, or by a
 * falsy token such as '') is treated as a boolean `true`. Bare tokens
 * past index 1 that are not flags are ignored.
 *
 * @param {string[]} args - process.argv.slice(2)-style argument list.
 * @returns {{command: string, input: (string|undefined), options: Object}}
 */
function parseArgs(args) {
  const parsed = {
    command: args[0] || 'help',
    input: args[1],
    options: {}
  };

  let i = 2;
  while (i < args.length) {
    const token = args[i];
    i += 1;

    if (!token.startsWith('--')) {
      continue; // stray positional tokens after the first two are ignored
    }

    const name = token.slice(2);
    const next = args[i];

    // Truthy check (not just undefined) mirrors the original behavior:
    // an empty-string value also makes the flag boolean.
    if (next && !next.startsWith('--')) {
      parsed.options[name] = next;
      i += 1;
    } else {
      parsed.options[name] = true;
    }
  }

  return parsed;
}
|
|
105
|
+
|
|
106
|
+
// ============= GPU/CPU Detection & Setup =============
|
|
107
|
+
/**
 * Decides which device Whisper should run on.
 *
 * Unless forceCpu is set, tries two detection strategies in order:
 * PyTorch's CUDA probe, then a direct nvidia-smi query. The first
 * strategy that reports a GPU wins; otherwise falls back to CPU.
 *
 * @param {boolean} [forceCpu=false] - Skip GPU detection entirely.
 * @returns {Promise<{device: string, framework: string}>} 'cuda' or 'cpu'.
 */
async function setupWhisper(forceCpu = false) {
  console.log(chalk.cyan('\nāļø Setting up Whisper...\n'));

  try {
    if (!forceCpu) {
      // Strategy 1: ask PyTorch whether CUDA is usable.
      try {
        console.log(chalk.gray(' Checking GPU via PyTorch...'));
        if (await checkGpuViaPyTorch()) {
          console.log(chalk.green('ā GPU detected (PyTorch) - using GPU acceleration'));
          return { device: 'cuda', framework: 'torch' };
        }
      } catch (err) {
        console.log(chalk.gray(` PyTorch check failed: ${err.message}`));
      }

      // Strategy 2: fall back to probing nvidia-smi directly.
      try {
        console.log(chalk.gray(' Checking GPU via nvidia-smi...'));
        if (await checkGpuViaNvidiaSmi()) {
          console.log(chalk.green('ā GPU detected (nvidia-smi) - using GPU acceleration'));
          return { device: 'cuda', framework: 'torch' };
        }
      } catch (err) {
        console.log(chalk.gray(` nvidia-smi check failed: ${err.message}`));
      }
    }

    console.log(chalk.yellow('ā GPU not available or disabled - using CPU'));
    return { device: 'cpu', framework: 'torch' };
  } catch (error) {
    console.error(chalk.red(`Error setting up Whisper: ${error.message}`));
    throw error;
  }
}
|
|
149
|
+
|
|
150
|
+
// ============= Helper: Check GPU via PyTorch =============
|
|
151
|
+
// ============= Helper: Check GPU via PyTorch =============
/**
 * Probes CUDA availability by running a short inline Python/PyTorch script.
 *
 * Resolves true only when the script exits 0, printed `1` on its first
 * output line, and wrote nothing to stderr. Resolves false on every failure
 * mode (missing python binary, missing torch, no CUDA device, timeout) —
 * this promise never rejects.
 *
 * @returns {Promise<boolean>} true when torch.cuda.is_available() is true.
 */
async function checkGpuViaPyTorch() {
  return new Promise((resolve) => {
    const pythonCmd = process.platform === 'win32' ? 'python' : 'python3';

    const script = `import torch;print(int(torch.cuda.is_available()));print(torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None')`;

    const proc = spawn(pythonCmd, ['-c', script]);
    let output = '';
    let hasError = false;
    let settled = false;
    let timer = null;

    // Resolve exactly once and stop the watchdog timer.
    // BUG FIX: the original never cleared the 8s timeout, so a finished
    // check kept the event loop alive and later kill()ed an exited process.
    const finish = (value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(value);
    };

    proc.stdout.on('data', (data) => {
      output += data.toString();
    });

    proc.stderr.on('data', () => {
      hasError = true;
    });

    // BUG FIX: without an 'error' handler, a missing python binary (ENOENT)
    // raised an unhandled 'error' event and crashed the whole CLI.
    proc.on('error', () => {
      finish(false);
    });

    proc.on('close', (code) => {
      try {
        // BUG FIX: the original split on the literal two-character sequence
        // '\\n', so the device-name line was never separated from the flag.
        const lines = output.trim().split('\n');
        const isAvailable = Number.parseInt(lines[0], 10) === 1;
        const deviceName = lines[1]?.trim() || 'Unknown GPU';

        if (isAvailable) {
          console.log(chalk.gray(` Device: ${deviceName}`));
        }

        finish(isAvailable && !hasError && code === 0);
      } catch (e) {
        finish(false);
      }
    });

    // Watchdog: give the Python probe at most 8 seconds.
    timer = setTimeout(() => {
      proc.kill();
      finish(false);
    }, 8000);
  });
}
|
|
192
|
+
|
|
193
|
+
// ============= Helper: Check GPU via nvidia-smi =============
|
|
194
|
+
// ============= Helper: Check GPU via nvidia-smi =============
/**
 * Fallback GPU probe: queries `nvidia-smi` for the installed GPU name.
 *
 * Resolves true when nvidia-smi exits 0 with a non-empty name and no
 * stderr output; resolves false on any failure (binary not installed,
 * driver error, timeout). This promise never rejects.
 *
 * @returns {Promise<boolean>} true when an NVIDIA GPU is reported.
 */
async function checkGpuViaNvidiaSmi() {
  return new Promise((resolve) => {
    const proc = spawn('nvidia-smi', [
      '--query-gpu=name',
      '--format=csv,noheader'
    ]);

    let output = '';
    let hasError = false;
    let settled = false;
    let timer = null;

    // Resolve exactly once and stop the watchdog timer.
    // BUG FIX: the original never cleared the 3s timeout, so a finished
    // check kept the event loop alive and later kill()ed an exited process.
    const finish = (value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      resolve(value);
    };

    proc.stdout.on('data', (data) => {
      output += data.toString();
    });

    proc.stderr.on('data', () => {
      hasError = true;
    });

    // BUG FIX: nvidia-smi is absent on most machines; spawn then emits an
    // 'error' event (ENOENT) which, unhandled, crashed the whole CLI.
    proc.on('error', () => {
      finish(false);
    });

    proc.on('close', (code) => {
      if (!hasError && code === 0 && output.trim().length > 0) {
        // BUG FIX: the original split on the literal two-character sequence
        // '\\n' instead of a newline, so multi-GPU output was not split.
        const gpuName = output.trim().split('\n')[0];
        console.log(chalk.gray(` Device: ${gpuName}`));
        finish(true);
      } else {
        finish(false);
      }
    });

    // Watchdog: give nvidia-smi at most 3 seconds.
    timer = setTimeout(() => {
      proc.kill();
      finish(false);
    }, 3000);
  });
}
|
|
229
|
+
|
|
230
|
+
// ============= Convert Command =============
/**
 * `mow convert` — transcribe a local audio file to a text transcript.
 *
 * Validates the input path and extension, resolves the model/output/language
 * options, generates a temporary Python script that drives OpenAI Whisper,
 * runs it with `python`, and prints a summary plus a preview of the result.
 *
 * @param {string} inputFile - Path (relative or absolute) to the audio file.
 * @param {Object} options - Parsed CLI flags: `model`, `output`, `language`,
 *   and `force-cpu` (boolean). An unknown model name silently falls back to
 *   DEFAULT_MODEL.
 * @returns {Promise<void>} Resolves after the transcript is written; calls
 *   `process.exit(1)` on validation failure or transcription error.
 */
async function handleConvert(inputFile, options) {
  try {
    // Validate input file
    if (!inputFile) {
      console.error(chalk.red('Error: Input file is required'));
      console.log(chalk.yellow('Usage: mow convert <input> [--model <name>] [--output <file>] [--language <code>] [--force-cpu]'));
      process.exit(1);
    }

    const inputPath = path.resolve(inputFile);

    if (!fs.existsSync(inputPath)) {
      console.error(chalk.red(`Error: File not found - ${inputPath}`));
      process.exit(1);
    }

    // Reject extensions not in the whitelist declared earlier in this file.
    const ext = path.extname(inputPath).toLowerCase();
    if (!SUPPORTED_FORMATS.includes(ext)) {
      console.error(
        chalk.red(`Error: Unsupported format - ${ext}`) +
        chalk.gray(`\nSupported: ${SUPPORTED_FORMATS.join(', ')}`)
      );
      process.exit(1);
    }

    // Parse options — note the silent fallback to DEFAULT_MODEL when the
    // requested model is not a known Whisper model name.
    const model = options.model && WHISPER_MODELS.includes(options.model)
      ? options.model
      : DEFAULT_MODEL;

    // Default output: "<input-dir>/<input-basename>_transcript.txt".
    const outputFile = options.output
      ? path.resolve(options.output)
      : path.join(path.dirname(inputPath), `${path.basename(inputPath, ext)}_transcript.txt`);

    const language = options.language || '';
    const forceCpu = options['force-cpu'] === true;

    // Setup Whisper. setupWhisper is defined earlier in this file; the code
    // below only relies on it returning an object with a `device` string.
    const setup = await setupWhisper(forceCpu);

    console.log(chalk.bold.cyan('\nš Transcription Parameters:'));
    console.log(chalk.gray(` Input: ${chalk.cyan(inputPath)}`));
    console.log(chalk.gray(` Model: ${chalk.yellow(model)}`));
    console.log(chalk.gray(` Device: ${chalk.yellow(setup.device.toUpperCase())}`));
    if (language) {
      console.log(chalk.gray(` Language: ${chalk.yellow(language)}`));
    }
    console.log(chalk.gray(` Output: ${chalk.cyan(outputFile)}\n`));

    // Create progress bar
    const progressBar = new ProgressBar(
      chalk.cyan(' [:bar] :percent :etas'),
      {
        complete: 'ā',
        incomplete: 'ā',
        width: 40,
        total: 100
      }
    );

    // Simulate transcription progress: the Python child reports no progress,
    // so the bar advances by a random amount every 800 ms until completion.
    const progressInterval = setInterval(() => {
      if (progressBar.curr < progressBar.total) {
        progressBar.tick(Math.random() * 30);
      }
    }, 800);

    // Call Python script for actual transcription
    const pythonScript = createPythonTranscriptScript(
      inputPath,
      outputFile,
      model,
      setup.device,
      language
    );

    // Temp script is written next to this module and removed again in the
    // 'close' handler below (both success and failure paths).
    const pyPath = path.join(__dirname, 'transcribe_temp.py');
    fs.writeFileSync(pyPath, pythonScript);

    const result = await new Promise((resolve, reject) => {
      const python = spawn('python', [pyPath]);
      let output = '';
      let error = '';

      python.stdout.on('data', (data) => {
        output += data.toString();
      });

      python.stderr.on('data', (data) => {
        error += data.toString();
      });

      python.on('close', (code) => {
        // Stop the simulated progress regardless of outcome.
        clearInterval(progressInterval);
        progressBar.update(1); // Complete the progress bar

        if (code !== 0) {
          // Clean up temp script
          try { fs.unlinkSync(pyPath); } catch (e) {}
          reject(new Error(error || `Python script failed with code ${code}`));
        } else {
          // Clean up temp script
          try { fs.unlinkSync(pyPath); } catch (e) {}
          resolve(output);
        }
      });
    });
    // NOTE(review): `result` (the child's stdout) is captured but unused —
    // the transcript is read back from `outputFile` below instead.

    console.log(chalk.green.bold('\nā Transcription completed successfully!\n'));
    console.log(chalk.bold('Results:'));
    console.log(chalk.gray(` Output file: ${chalk.cyan(outputFile)}`));

    if (fs.existsSync(outputFile)) {
      const stats = fs.statSync(outputFile);
      console.log(chalk.gray(` File size: ${chalk.cyan((stats.size / 1024).toFixed(2))} KB`));

      // Show preview (first 200 characters, ellipsized when truncated).
      const content = fs.readFileSync(outputFile, 'utf-8');
      const preview = content.substring(0, 200);
      console.log(chalk.gray(` Preview: ${chalk.cyan(preview)}${content.length > 200 ? '...' : ''}\n`));
    }

  } catch (error) {
    console.error(chalk.red.bold(`\nā Error: ${error.message}\n`));
    process.exit(1);
  }
}
|
|
358
|
+
|
|
359
|
+
// ============= Python Transcription Script =============
|
|
360
|
+
function createPythonTranscriptScript(inputPath, outputPath, model, device, language) {
|
|
361
|
+
return `import torch
|
|
362
|
+
import whisper
|
|
363
|
+
import json
|
|
364
|
+
import sys
|
|
365
|
+
|
|
366
|
+
try:
|
|
367
|
+
# Set device
|
|
368
|
+
device_type = "${device}"
|
|
369
|
+
print(f"Using device: {device_type}", file=sys.stderr)
|
|
370
|
+
|
|
371
|
+
# Load model
|
|
372
|
+
print(f"Loading {model} model from whisper...", file=sys.stderr)
|
|
373
|
+
model_obj = whisper.load_model("${model}", device=device_type)
|
|
374
|
+
|
|
375
|
+
# Transcribe
|
|
376
|
+
print(f"Transcribing audio file...", file=sys.stderr)
|
|
377
|
+
|
|
378
|
+
transcribe_kwargs = {
|
|
379
|
+
"model": model_obj,
|
|
380
|
+
"audio": "${inputPath.replace(/\\\\/g, '/')}",
|
|
381
|
+
"task": "transcribe",
|
|
382
|
+
}
|
|
383
|
+
|
|
384
|
+
${language ? `transcribe_kwargs["language"] = "${language}"` : ''}
|
|
385
|
+
|
|
386
|
+
result = whisper.transcribe(**transcribe_kwargs)
|
|
387
|
+
|
|
388
|
+
# Write output
|
|
389
|
+
with open("${outputPath.replace(/\\\\/g, '/')}", "w", encoding="utf-8") as f:
|
|
390
|
+
f.write(result["text"])
|
|
391
|
+
|
|
392
|
+
print(f"Transcription saved to ${outputPath.replace(/\\\\/g, '/')}", file=sys.stderr)
|
|
393
|
+
|
|
394
|
+
except ImportError as e:
|
|
395
|
+
print(f"Error: The required package is not installed: {e}", file=sys.stderr)
|
|
396
|
+
print("Please install: pip install openai-whisper torch", file=sys.stderr)
|
|
397
|
+
sys.exit(1)
|
|
398
|
+
except Exception as e:
|
|
399
|
+
print(f"Error: {str(e)}", file=sys.stderr)
|
|
400
|
+
sys.exit(1)
|
|
401
|
+
`;
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
// ============= Serve Command =============
/**
 * `mow serve` — start the MOW HTTP API server on the given port.
 *
 * Lazily requires express/cors/multer (they may not be installed), exposes
 * health/model/docs endpoints and a transcription upload endpoint, and wires
 * up graceful shutdown on SIGINT with forced-close fallbacks.
 *
 * @param {number} [port=DEFAULT_PORT] - TCP port to listen on.
 * @returns {Promise<void>} Runs until the process exits; calls
 *   `process.exit(1)` when required packages are missing, the port is in
 *   use, or an unexpected error occurs.
 */
async function handleServe(port = DEFAULT_PORT) {
  try {
    printBanner();

    console.log(chalk.bold.cyan('š Starting MOW API Server...\n'));

    // Check if express can be imported — these are optional dependencies,
    // required lazily so `convert` works without them installed.
    let express, cors, multer;
    try {
      express = require('express');
      cors = require('cors');
      multer = require('multer');
    } catch (e) {
      console.error(chalk.red('Error: Express, CORS, and Multer are required for the server'));
      console.log(chalk.yellow('Please install: npm install express cors multer'));
      process.exit(1);
    }

    // setupWhisper is defined earlier in this file; only `setup.device`
    // is used below.
    const setup = await setupWhisper(false);

    const app = express();
    // Uploaded audio lands in ./uploads relative to the working directory.
    const upload = multer({ dest: 'uploads/' });

    app.use(cors());
    app.use(express.json());

    // Health check endpoint
    app.get('/health', (req, res) => {
      res.json({
        status: 'ok',
        service: 'MOW Speech-to-Text API',
        version,
        device: setup.device,
        models: WHISPER_MODELS
      });
    });

    // Transcribe endpoint
    // NOTE(review): this handler returns a hard-coded placeholder response;
    // it does not invoke Whisper yet (see the inline comment below).
    app.post('/api/transcribe', upload.single('audio'), async (req, res) => {
      if (!req.file) {
        return res.status(400).json({ error: 'No audio file provided' });
      }

      const model = req.body.model || DEFAULT_MODEL;
      const language = req.body.language || '';

      try {
        // Placeholder - actual implementation would call whisper
        const result = {
          text: 'Sample transcription result',
          model,
          language: language || 'auto-detected',
          duration: 0,
          language_confidence: 0.95
        };

        res.json(result);
      } catch (error) {
        res.status(500).json({ error: error.message });
      }
    });

    // List available models
    app.get('/api/models', (req, res) => {
      res.json({
        models: WHISPER_MODELS,
        default: DEFAULT_MODEL,
        device: setup.device
      });
    });

    // API documentation
    app.get('/api/docs', (req, res) => {
      res.json({
        service: 'MOW Speech-to-Text API',
        version,
        endpoints: {
          'GET /health': 'Check API health status',
          'GET /api/models': 'List available Whisper models',
          'POST /api/transcribe': 'Transcribe audio file (multipart/form-data)',
          'GET /api/docs': 'This documentation'
        },
        docs_url: API_DOCS_URL,
        github_url: GITHUB_URL
      });
    });

    // Try to find available port if current one is in use
    // NOTE(review): no automatic fallback is implemented — on EADDRINUSE the
    // 'error' handler below only prints alternatives and exits.
    const server = app.listen(port, () => {
      console.log(
        gradient('cyan', 'magenta')(
          'āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā'
        )
      );
      console.log(chalk.green.bold(`ā Server running at ${chalk.cyan(`http://localhost:${port}`)}`));
      console.log(chalk.green.bold(`ā Health check: ${chalk.cyan(`http://localhost:${port}/health`)}`));
      console.log(chalk.green.bold(`ā API Docs: ${chalk.cyan(`http://localhost:${port}/api/docs`)}`));
      console.log(chalk.green.bold(`ā Documentation: ${chalk.cyan(API_DOCS_URL)}`));
      console.log(
        gradient('cyan', 'magenta')(
          'āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā'
        )
      );
      console.log(chalk.yellow('\nPress Ctrl+C to stop the server\n'));
    });

    // Handle port already in use
    server.on('error', (err) => {
      if (err.code === 'EADDRINUSE') {
        console.error(chalk.red(`ā Port ${port} is already in use`));
        console.log(chalk.yellow(`\nTry one of these alternatives:`));
        const altPorts = [3001, 3002, 3003, 8000, 8080, 8888];
        altPorts.forEach(p => {
          console.log(chalk.cyan(` mow serve ${p}`));
        });
        console.log(chalk.gray('\nOr check which process uses port ' + port));
        console.log(chalk.gray(' Windows: netstat -ano | findstr :' + port));
        console.log(chalk.gray(' macOS/Linux: lsof -i :' + port + '\n'));
        process.exit(1);
      } else {
        console.error(chalk.red(`Server error: ${err.message}`));
        process.exit(1);
      }
    });

    // Track active connections so shutdown can destroy stragglers —
    // server.close() alone waits for keep-alive sockets to drain.
    const activeConnections = new Set();

    server.on('connection', (conn) => {
      activeConnections.add(conn);
      conn.on('close', () => {
        activeConnections.delete(conn);
      });
    });

    // Graceful shutdown
    let isShuttingDown = false;

    process.on('SIGINT', async () => {
      // Second Ctrl+C forces an immediate exit.
      if (isShuttingDown) {
        console.log(chalk.red('\nā ļø Force shutting down...'));
        process.exit(1);
      }

      isShuttingDown = true;
      console.log(chalk.yellow('\n\nā¹ļø Shutting down server...'));

      // Close server (stops accepting new connections; exits once drained).
      server.close(() => {
        console.log(chalk.green('ā Server stopped\n'));
        process.exit(0);
      });

      // Destroy active connections after 3 seconds
      const destroyTimer = setTimeout(() => {
        console.log(chalk.yellow('ā ļø Forcing connection close...'));
        activeConnections.forEach(conn => {
          conn.destroy();
        });
      }, 3000);

      // Force exit after 10 seconds
      const forceExitTimer = setTimeout(() => {
        console.log(chalk.red('\nā ļø Force exiting after timeout...\n'));
        clearTimeout(destroyTimer);
        process.exit(1);
      }, 10000);

      // Clear timers if server closes gracefully
      server.once('close', () => {
        clearTimeout(destroyTimer);
        clearTimeout(forceExitTimer);
      });
    });

  } catch (error) {
    console.error(chalk.red.bold(`\nā Error: ${error.message}\n`));
    process.exit(1);
  }
}
|
|
585
|
+
|
|
586
|
+
// ============= Main Entry Point =============
/**
 * CLI dispatcher. Parses `process.argv`, detects whether the binary was
 * invoked as `mow-serve` (serve-only alias), and routes to the matching
 * command handler (`convert`, `serve`, `help`, `--version`).
 *
 * @returns {Promise<void>} Resolves when the selected command finishes;
 *   exits the process directly for help (code 0) and unknown commands
 *   (code 1).
 */
async function main() {
  const args = process.argv.slice(2);

  // No arguments at all: show the banner + help and exit successfully.
  if (args.length === 0) {
    printBanner();
    printHelp();
    process.exit(0);
  }

  const parsed = parseArgs(args);

  // Check if running as mow-serve (dedicated serve binary alias).
  const scriptName = path.basename(process.argv[1]);
  const isMowServe = scriptName === 'mow-serve' || process.argv[1].includes('mow-serve');

  if (isMowServe) {
    printBanner();
    // Idiom fix: always pass the radix to parseInt; NaN falls back to the
    // default port via ||.
    await handleServe(Number.parseInt(parsed.input, 10) || DEFAULT_PORT);
    return;
  }

  // Handle commands
  switch (parsed.command) {
    case 'convert':
      printBanner();
      await handleConvert(parsed.input, parsed.options);
      break;

    case 'serve':
      printBanner();
      await handleServe(Number.parseInt(parsed.input, 10) || DEFAULT_PORT);
      break;

    case 'help':
    case '--help':
    case '-h':
      printBanner();
      printHelp();
      break;

    case '--version':
    case '-v':
      console.log(`v${version}`);
      break;

    default:
      console.error(chalk.red(`Unknown command: ${parsed.command}`));
      printHelp();
      process.exit(1);
  }
}
|
|
639
|
+
|
|
640
|
+
// Run main
// Execute only when this file is the entry point (`node src/cli.js` /
// installed bin), not when it is require()d as a library by index.js.
if (require.main === module) {
  main().catch(error => {
    // Last-resort handler: any rejection escaping main() is fatal.
    console.error(chalk.red(`Fatal error: ${error.message}`));
    process.exit(1);
  });
}

// Public API consumed by src/index.js.
module.exports = { handleConvert, handleServe, parseArgs };
|
package/src/index.js
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* MOW Speech-to-Text - Index File
|
|
3
|
+
*
|
|
4
|
+
* This file exports the main functionality of the MOW package.
|
|
5
|
+
* Can be used for programmatic access to the library.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const path = require('path');
|
|
9
|
+
const { handleConvert, handleServe, parseArgs } = require('./cli');
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Main MOW Module
|
|
13
|
+
*
|
|
14
|
+
* Usage:
|
|
15
|
+
* const mow = require('@mynamezxc/mow-speech-to-text');
|
|
16
|
+
*
|
|
17
|
+
* // Convert audio to text
|
|
18
|
+
* await mow.convert('audio.mp3', { model: 'medium' });
|
|
19
|
+
*
|
|
20
|
+
* // Start server
|
|
21
|
+
* await mow.serve(3000);
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
module.exports = {
|
|
25
|
+
// Conversion function
|
|
26
|
+
convert: async (inputFile, options = {}) => {
|
|
27
|
+
return handleConvert(inputFile, options);
|
|
28
|
+
},
|
|
29
|
+
|
|
30
|
+
// Server function
|
|
31
|
+
serve: async (port = 3000) => {
|
|
32
|
+
return handleServe(port);
|
|
33
|
+
},
|
|
34
|
+
|
|
35
|
+
// Argument parser
|
|
36
|
+
parseArgs,
|
|
37
|
+
|
|
38
|
+
// Version
|
|
39
|
+
version: require('../package.json').version,
|
|
40
|
+
|
|
41
|
+
// Package info
|
|
42
|
+
name: require('../package.json').name,
|
|
43
|
+
description: require('../package.json').description,
|
|
44
|
+
author: require('../package.json').author
|
|
45
|
+
};
|