@wovin/tranz 0.1.36 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/{audio.min.js → audio.js} +32 -18
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/{index.min.js → index.js} +161 -29
- package/dist/providers.d.ts +1 -1
- package/dist/providers.d.ts.map +1 -1
- package/dist/{providers.min.js → providers.js} +68 -24
- package/dist/utils/audio/merge-results.d.ts +14 -12
- package/dist/utils/audio/merge-results.d.ts.map +1 -1
- package/dist/utils/transcription/format.d.ts +27 -0
- package/dist/utils/transcription/format.d.ts.map +1 -1
- package/dist/utils/transcription/providers.d.ts +30 -1
- package/dist/utils/transcription/providers.d.ts.map +1 -1
- package/dist/utils/transcription/transcribe.d.ts +5 -0
- package/dist/utils/transcription/transcribe.d.ts.map +1 -1
- package/package.json +10 -8
- package/src/audio.ts +25 -0
- package/src/index.ts +61 -0
- package/src/providers.ts +23 -0
- package/src/realtime.ts +58 -0
- package/src/utils/audio/index.ts +6 -0
- package/src/utils/audio/merge-results.ts +198 -0
- package/src/utils/audio/split.ts +504 -0
- package/src/utils/file-utils.ts +16 -0
- package/src/utils/transcription/format.ts +208 -0
- package/src/utils/transcription/mime-detection.ts +80 -0
- package/src/utils/transcription/providers.ts +572 -0
- package/src/utils/transcription/realtime.ts +821 -0
- package/src/utils/transcription/runtime.ts +40 -0
- package/src/utils/transcription/transcribe.ts +366 -0
- /package/dist/{realtime.min.js → realtime.js} +0 -0
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
 * Insert line breaks into a transcript wherever the speaker paused.
 *
 * The silence between consecutive entries of `words`
 * (`words[i + 1].start - words[i].end`) selects the separator written after
 * the transcript token at the same index `i`:
 * - gap ≥ `longPauseThreshold`  → blank line (`\n\n`)
 * - gap ≥ `shortPauseThreshold` → single newline
 * - otherwise                   → a single space
 *
 * Alignment between `words` and the transcript is purely positional: the
 * transcript is split on whitespace and token `i` is assumed to correspond to
 * `words[i]`. Surrounding whitespace is trimmed before splitting so that an
 * empty leading token cannot shift that alignment, and no separator is ever
 * written after the final token, so the result never ends in a dangling line
 * break when the transcript has fewer tokens than `words`.
 *
 * @param transcript The punctuated transcript text from the API
 * @param words Array of word objects with timing data (same time unit as the thresholds)
 * @param shortPauseThreshold Threshold for single line break (default: 1.0s)
 * @param longPauseThreshold Threshold for paragraph break/double newline (default: 5.0s)
 * @returns The re-flowed transcript, or `transcript` unchanged when there are
 *          no words, no pause reaches a threshold, or the transcript is blank.
 */
export function formatTranscriptWithPauses(
  transcript: string,
  words: Array<{ word: string; start: number; end: number; confidence: number }>,
  shortPauseThreshold = 1.0,
  longPauseThreshold = 5.0,
): string {
  if (!words || words.length === 0) {
    return transcript
  }

  // Index i → kind of pause that follows words[i].
  const pauseAfter = new Map<number, 'short' | 'long'>()
  for (let i = 0; i < words.length - 1; i++) {
    const gap = words[i + 1].start - words[i].end
    if (gap >= longPauseThreshold) {
      pauseAfter.set(i, 'long')
    } else if (gap >= shortPauseThreshold) {
      pauseAfter.set(i, 'short')
    }
  }

  // Nothing to insert: hand back the caller's text untouched.
  if (pauseAfter.size === 0) {
    return transcript
  }

  const trimmed = transcript.trim()
  if (trimmed.length === 0) {
    return transcript
  }

  // Whitespace split keeps punctuation attached to its word.
  const tokens = trimmed.split(/\s+/)
  const lastIndex = tokens.length - 1
  const pieces: string[] = []

  tokens.forEach((token, i) => {
    pieces.push(token)
    // Separators only go *between* tokens.
    if (i === lastIndex) return

    const pause = pauseAfter.get(i)
    if (pause === 'long') {
      pieces.push('\n\n')
    } else if (pause === 'short') {
      pieces.push('\n')
    } else {
      pieces.push(' ')
    }
  })

  return pieces.join('')
}
|
|
62
|
+
|
|
63
|
+
import type { TranscriptionResult, TranscriptSegment } from './providers.ts'
|
|
64
|
+
import type { MergedTranscriptionResult } from '../audio/merge-results.ts'
|
|
65
|
+
|
|
66
|
+
/** Optional settings for rendering a transcription result as Markdown. */
export interface FormatMarkdownOptions {
  /**
   * Minimum silence between two segments, in seconds, that closes the
   * current paragraph. Defaults to 1.5.
   */
  gapSec?: number
  /**
   * Whether paragraph headers carry a `· Speaker N` suffix when diarization
   * labels are present. Defaults to true.
   */
  speakerLabel?: boolean
  /**
   * Whether to prepend a `# <source>` title and a bulleted metadata block.
   * Defaults to false.
   */
  includeHeader?: boolean
  /**
   * Source filename shown in the `# ` title and the `Source:` bullet.
   * Only used when `includeHeader` is true.
   */
  source?: string
  /**
   * Total audio duration in seconds. Feeds the `Duration:` bullet and decides
   * between mm:ss and h:mm:ss timestamps.
   */
  durationSec?: number
}
|
|
78
|
+
|
|
79
|
+
/**
 * Render a time offset as a clock-style timestamp.
 *
 * @param seconds Offset in seconds. Fractions are truncated; negative and
 *                non-finite values (NaN, ±Infinity) are rendered as zero.
 * @param useHours When true the result is `h:mm:ss`. When false the result is
 *                 `m:ss`, and the minute field keeps counting past 59, so an
 *                 offset of an hour or more is shown as e.g. `61:40` instead
 *                 of wrapping around and silently losing the hour.
 * @returns The formatted timestamp; seconds (and minutes in `h:mm:ss` mode)
 *          are zero-padded to two digits.
 */
function formatTime(seconds: number, useHours: boolean): string {
  const total = Number.isFinite(seconds) ? Math.max(0, Math.floor(seconds)) : 0
  const ss = String(total % 60).padStart(2, '0')

  if (useHours) {
    const h = Math.floor(total / 3600)
    const mm = String(Math.floor((total % 3600) / 60)).padStart(2, '0')
    return `${h}:${mm}:${ss}`
  }

  // No hour field available: fold whole hours into the minute count.
  return `${Math.floor(total / 60)}:${ss}`
}
|
|
89
|
+
|
|
90
|
+
/**
 * Render a duration as `m:ss`, switching to `h:mm:ss` once it reaches a full hour.
 *
 * @param seconds Duration in seconds; fractions are truncated and negative
 *                values are clamped to zero.
 * @returns The formatted duration with zero-padded seconds (and minutes when
 *          an hour field is present).
 */
function formatDurationHuman(seconds: number): string {
  const whole = Math.max(0, Math.floor(seconds))
  const twoDigits = (n: number): string => String(n).padStart(2, '0')

  const hours = Math.floor(whole / 3600)
  const minutes = Math.floor((whole % 3600) / 60)
  const tail = `:${twoDigits(whole % 60)}`

  return hours > 0 ? `${hours}:${twoDigits(minutes)}${tail}` : `${minutes}${tail}`
}
|
|
98
|
+
|
|
99
|
+
/**
 * Render a transcription result as Markdown made of timestamped paragraphs.
 *
 * Segments whose trimmed text is empty are ignored. The remaining segments are
 * merged, in order, into paragraphs; a new paragraph begins when the silence
 * since the previous segment's end reaches `opts.gapSec` (default 1.5 s) or
 * when the segment's diarization label differs from the paragraph's label.
 * Each paragraph is emitted as `**[m:ss]** text` — `h:mm:ss` once the total
 * duration is one hour or more — and gains a ` · Speaker N` suffix only when
 * speaker labels are enabled and more than one distinct label occurs.
 *
 * The total duration is taken from `opts.durationSec`, then `result.duration`,
 * then the end of the last segment.
 *
 * When the result has no segments, the trimmed `result.text` is emitted as a
 * single paragraph without a timestamp.
 *
 * With `opts.includeHeader`, the output starts with a `# <source>` title (if a
 * source is given), a bullet list (source, duration, model, segment count,
 * diarized speaker count — each only when available) and a `## Transcript`
 * heading.
 *
 * @param result Transcription (single or merged) to render.
 * @param opts Formatting options.
 * @returns Markdown ending in a single newline, or an empty string when there
 *          is nothing to print.
 */
export function formatTranscriptAsMarkdown(
  result: TranscriptionResult | MergedTranscriptionResult,
  opts: FormatMarkdownOptions = {},
): string {
  const segments = result.segments ?? []
  const paragraphGapMs = (opts.gapSec ?? 1.5) * 1000

  const durationSec =
    opts.durationSec ??
    result.duration ??
    (segments.length > 0 ? segments[segments.length - 1].endMs / 1000 : 0)
  const withHours = durationSec >= 3600

  // Distinct diarization labels across all segments (including blank ones).
  const speakers = new Set<string | number>()
  for (const seg of segments) {
    if (seg.diarization != null) speakers.add(seg.diarization)
  }
  const labelSpeakers = (opts.speakerLabel ?? true) && speakers.size > 1

  // Every entry is a non-empty block; blocks are separated by one blank line.
  const blocks: string[] = []

  if (opts.includeHeader ?? false) {
    const meta: string[] = []
    if (opts.source) meta.push(`- Source: \`${opts.source}\``)
    if (durationSec > 0) meta.push(`- Duration: ${formatDurationHuman(durationSec)}`)
    if (result.model) meta.push(`- Model: ${result.model}`)
    if (segments.length > 0) meta.push(`- Segments: ${segments.length}`)
    if (speakers.size > 0) meta.push(`- Speakers (diarized): ${speakers.size}`)

    if (opts.source) blocks.push(`# ${opts.source}`)
    if (meta.length > 0) blocks.push(meta.join('\n'))
    blocks.push('## Transcript')
  }

  if (segments.length === 0) {
    // No timing data at all: fall back to the plain text.
    const plain = (result.text ?? '').trim()
    if (plain) blocks.push(plain)
  } else {
    type Paragraph = {
      startMs: number
      lastEndMs: number
      speaker?: string | number
      texts: string[]
    }
    const paragraphs: Paragraph[] = []

    for (const seg of segments) {
      const text = (seg.text ?? '').trim()
      if (!text) continue

      const open = paragraphs.length > 0 ? paragraphs[paragraphs.length - 1] : undefined
      const extendsOpen =
        open !== undefined &&
        !(seg.startMs - open.lastEndMs >= paragraphGapMs) &&
        open.speaker === seg.diarization

      if (open !== undefined && extendsOpen) {
        open.texts.push(text)
        open.lastEndMs = seg.endMs
        continue
      }

      paragraphs.push({
        startMs: seg.startMs,
        lastEndMs: seg.endMs,
        speaker: seg.diarization,
        texts: [text],
      })
    }

    for (const para of paragraphs) {
      const stamp = formatTime(para.startMs / 1000, withHours)
      let who = ''
      if (labelSpeakers && para.speaker != null) {
        who = ` · Speaker ${formatSpeakerLabel(para.speaker)}`
      }
      blocks.push(`**[${stamp}${who}]** ${para.texts.join(' ')}`)
    }
  }

  const doc = blocks.join('\n\n')
  return doc.length > 0 ? `${doc.trimEnd()}\n` : ''
}
|
|
202
|
+
|
|
203
|
+
/**
 * Turn a diarization label into the text shown after "Speaker " in a
 * paragraph header.
 *
 * Labels that already carry a `speaker` prefix followed by a separator
 * (Mistral emits `speaker_1`, `speaker_2`, …) would otherwise render as
 * "Speaker speaker_1". The prefix is stripped case-insensitively together with
 * the whole run of `_`, `-` or whitespace after it, so `SPEAKER_00` and
 * `Speaker 3` are handled as well. If stripping would leave nothing, the
 * original label is kept so the header never ends in a bare "Speaker ".
 *
 * @param value Raw diarization label (string or numeric id).
 * @returns The label without a redundant `speaker` prefix.
 */
function formatSpeakerLabel(value: string | number): string {
  const raw = String(value)
  // A separator is required, so words like "speakerphone" are left alone.
  const stripped = raw.replace(/^speaker[\s_-]+/i, '')
  return stripped.length > 0 ? stripped : raw
}
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { Buffer } from 'node:buffer'
|
|
2
|
+
|
|
3
|
+
/**
 * Detect audio format from buffer magic bytes
 *
 * @param buffer - The audio buffer to analyze
 * @returns The detected MIME type string ('audio/mpeg', 'audio/ogg', 'audio/wav', 'audio/flac').
 *          Falls back to 'audio/ogg' when nothing is recognised, except that a
 *          buffer starting with an ID3v2 tag falls back to 'audio/mpeg'.
 *
 * @description
 * This function analyzes the magic bytes at the beginning of an audio buffer to determine
 * its format. A leading ID3v2 tag (including the optional 10-byte ID3v2.4 footer) is
 * skipped before the audio signature is checked. If the tag claims to extend past the
 * end of the buffer, the signature check is done at offset 0 instead.
 *
 * Supported formats:
 * - MP3 (MPEG Audio Layer III): frame sync with MPEG-1, MPEG-2 or MPEG-2.5 version bits
 *   (FF FB, FF FA, FF F3, FF F2, FF E3, FF E2), with optional ID3 tag
 * - OGG: "OggS" (4F 67 67 53)
 * - WAV (RIFF): "RIFF" (52 49 46 46)
 * - FLAC: "fLaC" (66 4C 61 43)
 *
 * @example
 * const buffer = readFileSync('audio.mp3')
 * const mimeType = detectAudioMimeType(buffer)
 * console.log(mimeType) // 'audio/mpeg'
 */
export function detectAudioMimeType(buffer: Buffer): string {
  if (buffer.length < 4) return 'audio/ogg'

  let offset = 0

  // "ID3" marks an ID3v2 tag placed in front of the audio data.
  const hasId3Tag = buffer[0] === 0x49 && buffer[1] === 0x44 && buffer[2] === 0x33
  if (hasId3Tag && buffer.length >= 10) {
    // ID3v2 header: "ID3" + version (2 bytes) + flags (1 byte) + size (4 bytes synchsafe)
    const size = ((buffer[6] & 0x7f) << 21) | ((buffer[7] & 0x7f) << 14) | ((buffer[8] & 0x7f) << 7) |
      (buffer[9] & 0x7f)
    // ID3v2.4 may append a 10-byte footer that is not counted in `size`.
    const footer = buffer[3] === 4 && (buffer[5] & 0x10) !== 0 ? 10 : 0
    offset = 10 + size + footer
    // Make sure we don't skip past the buffer
    if (offset >= buffer.length) offset = 0
  }

  // Now check the actual audio format at the correct offset
  if (buffer.length - offset >= 4) {
    const b0 = buffer[offset]
    const b1 = buffer[offset + 1]
    const b2 = buffer[offset + 2]
    const b3 = buffer[offset + 3]

    // MP3: 11 sync bits set, layer bits = 01 (Layer III), version bits != 01 (reserved)
    if (b0 === 0xff && (b1 & 0xe0) === 0xe0 && (b1 & 0x06) === 0x02 && (b1 & 0x18) !== 0x08) {
      return 'audio/mpeg'
    }

    // OGG: OggS
    if (b0 === 0x4f && b1 === 0x67 && b2 === 0x67 && b3 === 0x53) {
      return 'audio/ogg'
    }

    // WAV: RIFF
    if (b0 === 0x52 && b1 === 0x49 && b2 === 0x46 && b3 === 0x46) {
      return 'audio/wav'
    }

    // FLAC: fLaC
    if (b0 === 0x66 && b1 === 0x4c && b2 === 0x61 && b3 === 0x43) {
      return 'audio/flac'
    }
  }

  // An ID3v2 tag with no recognisable signature after it (e.g. a truncated
  // buffer or a tag larger than the buffer) is far more likely MP3 than OGG.
  if (hasId3Tag) return 'audio/mpeg'

  // Default to OGG
  return 'audio/ogg'
}
|