@wovin/tranz 0.1.36 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,208 @@
1
/**
 * Format a transcript with line breaks based on pauses between words.
 *
 * The gap between each pair of consecutive entries in `words` is measured as
 * `next.start - current.end`. A gap of at least `longPauseThreshold` becomes a
 * paragraph break (`\n\n`), a gap of at least `shortPauseThreshold` becomes a
 * single line break (`\n`), and every other boundary is a single space.
 *
 * Breaks are mapped onto the text purely by position: the n-th
 * whitespace-separated token of `transcript` is treated as `words[n]`.
 * Leading/trailing whitespace in `transcript` is therefore ignored (it would
 * otherwise produce empty tokens and shift every break by one), and no
 * separator is ever emitted after the final token.
 *
 * @param transcript The punctuated transcript text from the API
 * @param words Array of word objects with timing data
 * @param shortPauseThreshold Threshold for single line break (default: 1.0s)
 * @param longPauseThreshold Threshold for paragraph break/double newline (default: 5.0s)
 * @returns The transcript with breaks inserted; `transcript` is returned
 *   unchanged when `words` is empty, when no gap reaches a threshold, or when
 *   the transcript contains no tokens
 */
export function formatTranscriptWithPauses(
  transcript: string,
  words: Array<{ word: string; start: number; end: number; confidence: number }>,
  shortPauseThreshold = 1.0,
  longPauseThreshold = 5.0,
): string {
  if (!words || words.length === 0) {
    return transcript
  }

  // Word index -> kind of break to insert after that word.
  const pausePositions = new Map<number, 'short' | 'long'>()
  for (let i = 0; i < words.length - 1; i++) {
    const gap = words[i + 1].start - words[i].end
    if (gap >= longPauseThreshold) {
      pausePositions.set(i, 'long')
    } else if (gap >= shortPauseThreshold) {
      pausePositions.set(i, 'short')
    }
  }

  // Nothing to insert: hand back the caller's text untouched.
  if (pausePositions.size === 0) {
    return transcript
  }

  // Trim first so surrounding whitespace cannot create empty tokens that
  // would misalign token indices with word indices.
  const trimmed = transcript.trim()
  if (trimmed.length === 0) {
    return transcript
  }
  // Splitting on whitespace only; punctuation stays part of its token.
  const tokens = trimmed.split(/\s+/)
  const lastIndex = tokens.length - 1

  let output = ''
  tokens.forEach((token, index) => {
    output += token
    // A break after the last token would only add trailing whitespace.
    if (index === lastIndex) return

    const pause = pausePositions.get(index)
    if (pause === 'long') {
      output += '\n\n'
    } else if (pause === 'short') {
      output += '\n'
    } else {
      output += ' '
    }
  })

  return output
}
62
+
63
+ import type { TranscriptionResult, TranscriptSegment } from './providers.ts'
64
+ import type { MergedTranscriptionResult } from '../audio/merge-results.ts'
65
+
66
/** Tuning knobs accepted by `formatTranscriptAsMarkdown`; every field is optional. */
export interface FormatMarkdownOptions {
  /**
   * Silence gap, in seconds, at or above which a new paragraph is started.
   * Defaults to 1.5.
   */
  gapSec?: number

  /**
   * Whether to append `· Speaker N` to each paragraph header when diarization
   * labels are present. Defaults to true.
   */
  speakerLabel?: boolean

  /**
   * Whether to prepend an `# <source>` title and a bulleted metadata block.
   * Defaults to false.
   */
  includeHeader?: boolean

  /**
   * Source filename shown in the `# ` title and the `Source:` bullet.
   * Only consulted when `includeHeader` is true.
   */
  source?: string

  /**
   * Total audio duration in seconds. Drives the `Duration:` bullet and the
   * choice between mm:ss and h:mm:ss timestamps.
   */
  durationSec?: number
}
78
+
79
/**
 * Render an offset in seconds as a clock-style timestamp.
 *
 * Fractional seconds are discarded with `Math.floor` and negative offsets are
 * clamped to zero.
 *
 * @param seconds Offset in seconds
 * @param useHours When true, always use `h:mm:ss` (for example `0:01:05`)
 * @returns `m:ss`, or `h:mm:ss` when `useHours` is set or the offset is one
 *   hour or more
 */
function formatTime(seconds: number, useHours: boolean): string {
  const total = Math.max(0, Math.floor(seconds))
  const h = Math.floor(total / 3600)
  const m = Math.floor((total % 3600) / 60)
  const s = total % 60
  const mm = String(m).padStart(2, '0')
  const ss = String(s).padStart(2, '0')

  // `m` is the minute-within-the-hour, so an offset >= 1h cannot be written as
  // m:ss without silently dropping the hour; fall back to h:mm:ss in that case.
  if (useHours || h > 0) {
    return `${h}:${mm}:${ss}`
  }
  return `${m}:${ss}`
}
89
+
90
/**
 * Render a duration in seconds for display: `h:mm:ss` once it reaches a full
 * hour, `m:ss` below that. Fractions are floored and negatives clamp to zero.
 */
function formatDurationHuman(seconds: number): string {
  const whole = Math.max(0, Math.floor(seconds))
  const pad = (n: number): string => String(n).padStart(2, '0')

  const secs = whole % 60
  const mins = Math.floor((whole % 3600) / 60)
  const hours = Math.floor(whole / 3600)

  return hours > 0
    ? `${hours}:${pad(mins)}:${pad(secs)}`
    : `${mins}:${pad(secs)}`
}
98
+
99
/**
 * Format a transcription result as readable Markdown with timestamped paragraphs.
 *
 * Adjacent segments are grouped into paragraphs; a new paragraph starts on
 * either a silence gap ≥ `gapSec` OR a change in diarization label (a `null`
 * label and a missing label both count as "no label"). Segments whose text is
 * empty after trimming are skipped. Each paragraph is preceded by
 * `**[m:ss · Speaker N]**`, or `**[h:mm:ss · Speaker N]**` when the total
 * duration or the end of the last segment reaches one hour. The speaker suffix
 * is only emitted when `speakerLabel` is enabled and more than one distinct
 * diarization label occurs.
 *
 * With `includeHeader`, the output starts with an optional `# <source>` title,
 * a bullet list (source, duration, model, segment count, diarized speaker
 * count — each only when available) and a `## Transcript` heading.
 *
 * If `segments` is missing/empty, falls back to emitting `result.text` as a
 * single (un-timestamped) paragraph.
 *
 * @param result Transcription result to render
 * @param opts Formatting options; see `FormatMarkdownOptions`
 * @returns Markdown ending in exactly one newline, or an empty string when
 *   there is nothing to render
 */
export function formatTranscriptAsMarkdown(
  result: TranscriptionResult | MergedTranscriptionResult,
  opts: FormatMarkdownOptions = {},
): string {
  const gapSec = opts.gapSec ?? 1.5
  const wantSpeakerLabel = opts.speakerLabel ?? true
  const includeHeader = opts.includeHeader ?? false
  const gapMs = gapSec * 1000

  const segments = result.segments ?? []
  const lastSegmentEndSec = segments.length > 0 ? segments[segments.length - 1].endMs / 1000 : 0
  const totalDurationSec = opts.durationSec ?? result.duration ?? lastSegmentEndSec
  // Also look at the segments themselves: a stale or zero declared duration
  // must not force m:ss timestamps onto audio that runs past one hour.
  const useHours = totalDurationSec >= 3600 || lastSegmentEndSec >= 3600

  // Distinct diarization labels; null/undefined mean "unlabelled".
  const speakerSet = new Set<string | number>()
  for (const seg of segments) {
    if (seg.diarization !== undefined && seg.diarization !== null) {
      speakerSet.add(seg.diarization)
    }
  }
  const hasMultipleSpeakers = speakerSet.size > 1
  const showSpeakers = wantSpeakerLabel && hasMultipleSpeakers

  let headerBlock = ''
  if (includeHeader) {
    const bullets: string[] = []
    if (opts.source) bullets.push(`- Source: \`${opts.source}\``)
    if (totalDurationSec > 0) bullets.push(`- Duration: ${formatDurationHuman(totalDurationSec)}`)
    if (result.model) bullets.push(`- Model: ${result.model}`)
    if (segments.length > 0) bullets.push(`- Segments: ${segments.length}`)
    if (speakerSet.size > 0) bullets.push(`- Speakers (diarized): ${speakerSet.size}`)

    const parts: string[] = []
    if (opts.source) parts.push(`# ${opts.source}`)
    if (bullets.length > 0) parts.push(bullets.join('\n'))
    parts.push('## Transcript')
    headerBlock = parts.join('\n\n')
  }

  const body: string[] = []

  if (segments.length === 0) {
    // No timing information: emit the plain text as one paragraph.
    const text = (result.text ?? '').trim()
    if (text) body.push(text)
  } else {
    interface Paragraph {
      startMs: number
      lastEndMs: number
      speaker?: string | number
      texts: string[]
    }
    const paragraphs: Paragraph[] = []
    let current: Paragraph | undefined

    for (const seg of segments) {
      const text = (seg.text ?? '').trim()
      if (!text) continue

      // Normalise null to undefined so an unlabelled segment compares equal
      // to another unlabelled segment regardless of how "missing" is encoded.
      const speaker = seg.diarization ?? undefined
      const gap = current ? seg.startMs - current.lastEndMs : Infinity
      const speakerChanged = current ? current.speaker !== speaker : false

      if (!current || gap >= gapMs || speakerChanged) {
        current = {
          startMs: seg.startMs,
          lastEndMs: seg.endMs,
          speaker,
          texts: [text],
        }
        paragraphs.push(current)
      } else {
        current.texts.push(text)
        current.lastEndMs = seg.endMs
      }
    }

    for (const p of paragraphs) {
      const time = formatTime(p.startMs / 1000, useHours)
      const speakerSuffix =
        showSpeakers && p.speaker !== undefined
          ? ` · Speaker ${formatSpeakerLabel(p.speaker)}`
          : ''
      body.push(`**[${time}${speakerSuffix}]** ${p.texts.join(' ')}`)
    }
  }

  const bodyBlock = body.join('\n\n')
  const out = [headerBlock, bodyBlock].filter((s) => s.length > 0).join('\n\n')
  // Normalise to exactly one trailing newline (none for empty output).
  return out.trimEnd() + (out.length > 0 ? '\n' : '')
}
202
+
203
/**
 * Turn a diarization label into the text shown after "Speaker ".
 *
 * Mistral emits `speaker_1`, `speaker_2`, … — the redundant prefix is stripped
 * so the rendered label reads "Speaker 1" not "Speaker speaker_1". The match
 * is case-insensitive so upper-case variants such as `SPEAKER_00` are handled
 * the same way. If stripping would leave nothing, the original text is kept so
 * the label is never empty.
 *
 * @param value Raw diarization label
 * @returns The label without a leading `speaker_` / `speaker-` prefix
 */
function formatSpeakerLabel(value: string | number): string {
  const raw = String(value)
  const stripped = raw.replace(/^speaker[_-]/i, '')
  return stripped.length > 0 ? stripped : raw
}
@@ -0,0 +1,80 @@
1
+ import { Buffer } from 'node:buffer'
2
+
3
/**
 * Detect audio format from buffer magic bytes
 *
 * @param buffer - The audio buffer to analyze
 * @returns The detected MIME type string ('audio/mpeg', 'audio/ogg', 'audio/wav', 'audio/flac')
 *
 * @description
 * This function analyzes the magic bytes at the beginning of an audio buffer to determine
 * its format. A leading ID3v2 tag is skipped (including its optional footer) before the
 * signature check.
 *
 * Supported formats:
 * - MP3 / MPEG audio: any frame header with a valid sync word, version and layer
 *   (e.g. FF FB, FF FA, FF F3, FF F2, FF E3), with optional ID3 tag
 * - OGG: "OggS" (4F 67 67 53)
 * - WAV (RIFF): "RIFF" (52 49 46 46)
 * - FLAC: "fLaC" (66 4C 61 43)
 *
 * Fallbacks: a buffer that starts with an ID3v2 tag but whose audio data is missing,
 * truncated or unrecognised is reported as 'audio/mpeg'; anything else unrecognised
 * (including buffers shorter than 4 bytes) is reported as 'audio/ogg'.
 *
 * @example
 * const buffer = readFileSync('audio.mp3')
 * const mimeType = detectAudioMimeType(buffer)
 * console.log(mimeType) // 'audio/mpeg'
 */
export function detectAudioMimeType(buffer: Buffer): string {
  if (buffer.length < 4) return 'audio/ogg'

  let offset = 0

  // "ID3" marks an ID3v2 tag, which is prepended to MP3 (and occasionally other) files.
  const startsWithId3 = buffer[0] === 0x49 && buffer[1] === 0x44 && buffer[2] === 0x33
  if (startsWithId3) {
    // ID3v2 header: "ID3" + version (2 bytes) + flags (1 byte) + size (4 bytes synchsafe)
    if (buffer.length < 10) return 'audio/mpeg'

    const size = ((buffer[6] & 0x7f) << 21) | ((buffer[7] & 0x7f) << 14) | ((buffer[8] & 0x7f) << 7) |
      (buffer[9] & 0x7f)
    // Flag bit 0x10 (ID3v2.4) announces a 10-byte footer that is not counted in `size`.
    const footer = (buffer[5] & 0x10) !== 0 ? 10 : 0
    offset = 10 + size + footer

    // The tag runs past what we were given (e.g. only the start of the file was read):
    // the ID3 marker itself is the best evidence available.
    if (offset + 4 > buffer.length) return 'audio/mpeg'
  }

  const first = buffer[offset]
  const second = buffer[offset + 1]

  // MPEG audio frame header: 11 sync bits set, version bits != 01 (reserved),
  // layer bits != 00 (reserved — this also keeps ADTS/AAC, FF F1 / FF F9, out).
  if (
    first === 0xff &&
    (second & 0xe0) === 0xe0 &&
    ((second >> 3) & 0x03) !== 0x01 &&
    ((second >> 1) & 0x03) !== 0x00
  ) {
    return 'audio/mpeg'
  }

  // Compare four bytes at `offset` against an ASCII signature.
  const hasSignature = (signature: string): boolean => {
    for (let i = 0; i < signature.length; i++) {
      if (buffer[offset + i] !== signature.charCodeAt(i)) return false
    }
    return true
  }

  if (hasSignature('OggS')) return 'audio/ogg'
  if (hasSignature('RIFF')) return 'audio/wav'
  if (hasSignature('fLaC')) return 'audio/flac'

  // Unknown payload: prefer MPEG when an ID3 tag was present, otherwise default to OGG.
  return startsWithId3 ? 'audio/mpeg' : 'audio/ogg'
}