@wovin/tranz 0.1.36 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/{audio.min.js → audio.js} +32 -18
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/{index.min.js → index.js} +161 -29
- package/dist/providers.d.ts +1 -1
- package/dist/providers.d.ts.map +1 -1
- package/dist/{providers.min.js → providers.js} +68 -24
- package/dist/utils/audio/merge-results.d.ts +14 -12
- package/dist/utils/audio/merge-results.d.ts.map +1 -1
- package/dist/utils/transcription/format.d.ts +27 -0
- package/dist/utils/transcription/format.d.ts.map +1 -1
- package/dist/utils/transcription/providers.d.ts +30 -1
- package/dist/utils/transcription/providers.d.ts.map +1 -1
- package/dist/utils/transcription/transcribe.d.ts +5 -0
- package/dist/utils/transcription/transcribe.d.ts.map +1 -1
- package/package.json +10 -8
- package/src/audio.ts +25 -0
- package/src/index.ts +61 -0
- package/src/providers.ts +23 -0
- package/src/realtime.ts +58 -0
- package/src/utils/audio/index.ts +6 -0
- package/src/utils/audio/merge-results.ts +198 -0
- package/src/utils/audio/split.ts +504 -0
- package/src/utils/file-utils.ts +16 -0
- package/src/utils/transcription/format.ts +208 -0
- package/src/utils/transcription/mime-detection.ts +80 -0
- package/src/utils/transcription/providers.ts +572 -0
- package/src/utils/transcription/realtime.ts +821 -0
- package/src/utils/transcription/runtime.ts +40 -0
- package/src/utils/transcription/transcribe.ts +366 -0
- /package/dist/{realtime.min.js → realtime.js} +0 -0
package/README.md
CHANGED
|
@@ -277,17 +277,20 @@ Result from transcription:
|
|
|
277
277
|
- `text: string` - Full transcription text
|
|
278
278
|
- `duration?: number` - Audio duration in seconds
|
|
279
279
|
- `language?: string` - Detected or specified language
|
|
280
|
-
- `
|
|
281
|
-
- `
|
|
280
|
+
- `segments?: TranscriptSegment[]` - Segment-level data (text, ms timestamps, anonymous diarization label). Populated when `timestamp_granularities=['segment']` or the provider returns segments.
|
|
281
|
+
- `words?: WordData[]` - Word-level timestamps and confidence. Populated only when granularity='word' is supported and the provider returned word data. Left `undefined` otherwise (never `[]`).
|
|
282
282
|
- `error?: string` - Error message if transcription failed
|
|
283
283
|
|
|
284
|
+
`TranscriptSegment.diarization` (`string | number`) is the anonymous diarization label the provider assigned — Mistral returns `"speaker_1"`, Deepgram returns `0`, etc. It is NOT a real-world speaker identity. See `docs/annotation-schema.md`.
|
|
285
|
+
|
|
284
286
|
### `MergedTranscriptionResult`
|
|
285
287
|
|
|
286
|
-
Extended result for multi-
|
|
288
|
+
Extended result for multi-chunk transcriptions (auto-split):
|
|
287
289
|
|
|
288
290
|
- All fields from `TranscriptionResult`
|
|
289
|
-
- `
|
|
290
|
-
|
|
291
|
+
- `audioChunks?: AudioChunk[]` - Per-chunk metadata (index, startSec, endSec, text). Present only when more than one chunk was merged.
|
|
292
|
+
|
|
293
|
+
When merging multiple chunks, each segment's `diarization` value is rewritten as `` `chunk${index}/${value}` `` because per-chunk labels (`speaker_1`, `0`, …) are not comparable across chunks. Cross-chunk re-identification is a separate concern handled by speaker-identification (out of scope here).
|
|
291
294
|
|
|
292
295
|
## Providers
|
|
293
296
|
|
|
@@ -289,6 +289,10 @@ async function analyzeSplitPoints(audioPath, config = {}) {
|
|
|
289
289
|
}
|
|
290
290
|
|
|
291
291
|
// src/utils/audio/merge-results.ts
|
|
292
|
+
/**
 * Namespace a per-chunk speaker/diarization label with the index of the chunk
 * it came from, producing `chunk<index>/<label>`.
 *
 * @param chunkIndex Index of the chunk the label belongs to (callers pass the
 *   loop index over the per-chunk results).
 * @param value Label to prefix; it is stringified with `String()`. May be absent.
 * @returns The prefixed label, or `undefined` when there is no label to prefix.
 */
function prefixChunkLabel(chunkIndex, value) {
  // `== null` matches both undefined and null. A JSON `"speaker": null` must
  // stay "no label" rather than become the literal string "chunk0/null".
  if (value == null) return void 0;
  return `chunk${chunkIndex}/${String(value)}`;
}
|
|
292
296
|
function mergeTranscriptionResults(results, segments) {
|
|
293
297
|
if (results.length === 0) {
|
|
294
298
|
return {
|
|
@@ -297,10 +301,7 @@ function mergeTranscriptionResults(results, segments) {
|
|
|
297
301
|
};
|
|
298
302
|
}
|
|
299
303
|
if (results.length === 1) {
|
|
300
|
-
return
|
|
301
|
-
...results[0],
|
|
302
|
-
totalSegments: 1
|
|
303
|
-
};
|
|
304
|
+
return results[0];
|
|
304
305
|
}
|
|
305
306
|
const errors = results.map((r, i) => r.error ? `Segment ${i}: ${r.error}` : null).filter(Boolean);
|
|
306
307
|
if (errors.length > 0) {
|
|
@@ -321,12 +322,26 @@ function mergeTranscriptionResults(results, segments) {
|
|
|
321
322
|
start: (word.start || 0) + segment.startSec,
|
|
322
323
|
end: (word.end || 0) + segment.startSec,
|
|
323
324
|
confidence: word.confidence,
|
|
324
|
-
speaker: word.speaker
|
|
325
|
+
speaker: prefixChunkLabel(i, word.speaker)
|
|
326
|
+
});
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
const mergedSegments = [];
|
|
330
|
+
for (let i = 0; i < results.length; i++) {
|
|
331
|
+
const result = results[i];
|
|
332
|
+
const chunkOffsetMs = Math.round(segments[i].startSec * 1e3);
|
|
333
|
+
if (!result.segments) continue;
|
|
334
|
+
for (const seg of result.segments) {
|
|
335
|
+
mergedSegments.push({
|
|
336
|
+
startMs: seg.startMs + chunkOffsetMs,
|
|
337
|
+
endMs: seg.endMs + chunkOffsetMs,
|
|
338
|
+
text: seg.text,
|
|
339
|
+
...seg.diarization !== void 0 ? { diarization: prefixChunkLabel(i, seg.diarization) } : {}
|
|
325
340
|
});
|
|
326
341
|
}
|
|
327
342
|
}
|
|
328
343
|
const totalDuration = segments.reduce((sum, seg) => sum + seg.durationSec, 0);
|
|
329
|
-
const
|
|
344
|
+
const audioChunks = results.map((r, i) => ({
|
|
330
345
|
index: i,
|
|
331
346
|
startSec: segments[i].startSec,
|
|
332
347
|
endSec: segments[i].endSec,
|
|
@@ -334,37 +349,36 @@ function mergeTranscriptionResults(results, segments) {
|
|
|
334
349
|
}));
|
|
335
350
|
const mergedRawResponse = {
|
|
336
351
|
merged: true,
|
|
337
|
-
|
|
338
|
-
|
|
352
|
+
chunkCount: results.length,
|
|
353
|
+
chunks: results.map((r, i) => ({
|
|
339
354
|
index: i,
|
|
340
355
|
startSec: segments[i].startSec,
|
|
341
356
|
rawResponse: r.rawResponse
|
|
342
|
-
}))
|
|
343
|
-
words: mergedWords
|
|
357
|
+
}))
|
|
344
358
|
};
|
|
345
359
|
const firstResult = results[0];
|
|
346
360
|
return {
|
|
347
361
|
text: mergedText,
|
|
348
|
-
words: mergedWords,
|
|
349
362
|
duration: totalDuration,
|
|
350
363
|
language: firstResult.language,
|
|
351
364
|
model: firstResult.model,
|
|
352
365
|
rawResponse: mergedRawResponse,
|
|
353
|
-
|
|
354
|
-
|
|
366
|
+
audioChunks,
|
|
367
|
+
...mergedWords.length > 0 ? { words: mergedWords } : {},
|
|
368
|
+
...mergedSegments.length > 0 ? { segments: mergedSegments } : {}
|
|
355
369
|
};
|
|
356
370
|
}
|
|
357
371
|
function formatMergedText(result, includeMarkers = false) {
|
|
358
|
-
if (!result.
|
|
372
|
+
if (!result.audioChunks || result.audioChunks.length <= 1) {
|
|
359
373
|
return result.text;
|
|
360
374
|
}
|
|
361
375
|
if (!includeMarkers) {
|
|
362
376
|
return result.text;
|
|
363
377
|
}
|
|
364
|
-
return result.
|
|
365
|
-
const timeStr = formatTimestamp(
|
|
366
|
-
return `[
|
|
367
|
-
${
|
|
378
|
+
return result.audioChunks.map((chunk, i) => {
|
|
379
|
+
const timeStr = formatTimestamp(chunk.startSec);
|
|
380
|
+
return `[Chunk ${i + 1} @ ${timeStr}]
|
|
381
|
+
${chunk.text}`;
|
|
368
382
|
}).join("\n\n");
|
|
369
383
|
}
|
|
370
384
|
function formatTimestamp(seconds) {
|
package/dist/index.d.ts
CHANGED
|
@@ -1,10 +1,10 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @wovin/tranz - Audio transcription library
|
|
3
3
|
*/
|
|
4
|
-
export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, } from './utils/transcription/providers.ts';
|
|
4
|
+
export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, type TranscriptSegment, } from './utils/transcription/providers.ts';
|
|
5
5
|
export { autoSplitAudio, analyzeSplitPoints, detectSilenceRegions, getAudioDuration, findOptimalSplitPoints, splitAudioAtPoints, DEFAULT_SPLIT_CONFIG, type SplitConfig, type SilenceRegion, type SplitPoint, type AudioSegment, type SplitAnalysis, } from './utils/audio/split.ts';
|
|
6
|
-
export { mergeTranscriptionResults, formatMergedText, type MergedTranscriptionResult, type WordData, } from './utils/audio/merge-results.ts';
|
|
7
|
-
export { formatTranscriptWithPauses } from './utils/transcription/format.ts';
|
|
6
|
+
export { mergeTranscriptionResults, formatMergedText, type MergedTranscriptionResult, type WordData, type AudioChunk, } from './utils/audio/merge-results.ts';
|
|
7
|
+
export { formatTranscriptWithPauses, formatTranscriptAsMarkdown, type FormatMarkdownOptions, } from './utils/transcription/format.ts';
|
|
8
8
|
export { detectAudioMimeType } from './utils/transcription/mime-detection.ts';
|
|
9
9
|
export { createMistralTranscriber, transcribe, type TranscribeOptions, type MistralTranscriberConfig, type MistralTranscriber, } from './utils/transcription/transcribe.ts';
|
|
10
10
|
//# sourceMappingURL=index.d.ts.map
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;GAEG;AAGH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,EAC1B,KAAK,iBAAiB,GACvB,MAAM,oCAAoC,CAAA;AAG3C,OAAO,EACL,cAAc,EACd,kBAAkB,EAClB,oBAAoB,EACpB,gBAAgB,EAChB,sBAAsB,EACtB,kBAAkB,EAClB,oBAAoB,EACpB,KAAK,WAAW,EAChB,KAAK,aAAa,EAClB,KAAK,UAAU,EACf,KAAK,YAAY,EACjB,KAAK,aAAa,GACnB,MAAM,wBAAwB,CAAA;AAG/B,OAAO,EACL,yBAAyB,EACzB,gBAAgB,EAChB,KAAK,yBAAyB,EAC9B,KAAK,QAAQ,EACb,KAAK,UAAU,GAChB,MAAM,gCAAgC,CAAA;AAGvC,OAAO,EACL,0BAA0B,EAC1B,0BAA0B,EAC1B,KAAK,qBAAqB,GAC3B,MAAM,iCAAiC,CAAA;AAGxC,OAAO,EAAE,mBAAmB,EAAE,MAAM,yCAAyC,CAAA;AAG7E,OAAO,EACL,wBAAwB,EACxB,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,wBAAwB,EAC7B,KAAK,kBAAkB,GACxB,MAAM,qCAAqC,CAAA"}
|
|
@@ -245,6 +245,16 @@ var MistralProvider = class {
|
|
|
245
245
|
if (timestampGranularity) {
|
|
246
246
|
formData.append("timestamp_granularities", timestampGranularity);
|
|
247
247
|
}
|
|
248
|
+
if (params.contextBias && params.contextBias.length > 0) {
|
|
249
|
+
if (params.contextBias.length > VOXTRAL_LIMITS.maxContextBiasingTerms) {
|
|
250
|
+
throw new Error(
|
|
251
|
+
`contextBias has ${params.contextBias.length} terms; Voxtral limit is ${VOXTRAL_LIMITS.maxContextBiasingTerms}`
|
|
252
|
+
);
|
|
253
|
+
}
|
|
254
|
+
for (const term of params.contextBias) {
|
|
255
|
+
formData.append("context_bias[]", term);
|
|
256
|
+
}
|
|
257
|
+
}
|
|
248
258
|
const response = await fetch("https://api.mistral.ai/v1/audio/transcriptions", {
|
|
249
259
|
method: "POST",
|
|
250
260
|
headers: {
|
|
@@ -260,14 +270,27 @@ var MistralProvider = class {
|
|
|
260
270
|
if (!result?.text) {
|
|
261
271
|
return { text: "", error: "No transcription returned", rawResponse: result };
|
|
262
272
|
}
|
|
263
|
-
const
|
|
273
|
+
const segments = Array.isArray(result.segments) && result.segments.length > 0 ? result.segments.map((seg) => ({
|
|
274
|
+
startMs: Math.round((seg.start ?? 0) * 1e3),
|
|
275
|
+
endMs: Math.round((seg.end ?? 0) * 1e3),
|
|
276
|
+
text: seg.text ?? "",
|
|
277
|
+
...seg.speaker_id !== void 0 ? { diarization: seg.speaker_id } : {}
|
|
278
|
+
})) : void 0;
|
|
279
|
+
let words;
|
|
280
|
+
if (Array.isArray(result.words) && result.words.length > 0) {
|
|
281
|
+
words = result.words;
|
|
282
|
+
} else if (Array.isArray(result.segments)) {
|
|
283
|
+
const nested = result.segments.flatMap((seg) => seg.words ?? []);
|
|
284
|
+
if (nested.length > 0) words = nested;
|
|
285
|
+
}
|
|
264
286
|
const duration = result.usage?.prompt_audio_seconds;
|
|
265
287
|
return {
|
|
266
288
|
text: result.text,
|
|
267
289
|
language: result.language ?? params.language,
|
|
268
290
|
model: result.model,
|
|
269
291
|
duration,
|
|
270
|
-
words,
|
|
292
|
+
...words ? { words } : {},
|
|
293
|
+
...segments ? { segments } : {},
|
|
271
294
|
rawResponse: result
|
|
272
295
|
};
|
|
273
296
|
}
|
|
@@ -636,6 +659,10 @@ async function analyzeSplitPoints(audioPath, config = {}) {
|
|
|
636
659
|
}
|
|
637
660
|
|
|
638
661
|
// src/utils/audio/merge-results.ts
|
|
662
|
+
/**
 * Namespace a per-chunk speaker/diarization label with the index of the chunk
 * it came from, producing `chunk<index>/<label>`.
 *
 * @param chunkIndex Index of the chunk the label belongs to (callers pass the
 *   loop index over the per-chunk results).
 * @param value Label to prefix; it is stringified with `String()`. May be absent.
 * @returns The prefixed label, or `undefined` when there is no label to prefix.
 */
function prefixChunkLabel(chunkIndex, value) {
  // `== null` matches both undefined and null. A JSON `"speaker": null` must
  // stay "no label" rather than become the literal string "chunk0/null".
  if (value == null) return void 0;
  return `chunk${chunkIndex}/${String(value)}`;
}
|
|
639
666
|
function mergeTranscriptionResults(results, segments) {
|
|
640
667
|
if (results.length === 0) {
|
|
641
668
|
return {
|
|
@@ -644,10 +671,7 @@ function mergeTranscriptionResults(results, segments) {
|
|
|
644
671
|
};
|
|
645
672
|
}
|
|
646
673
|
if (results.length === 1) {
|
|
647
|
-
return
|
|
648
|
-
...results[0],
|
|
649
|
-
totalSegments: 1
|
|
650
|
-
};
|
|
674
|
+
return results[0];
|
|
651
675
|
}
|
|
652
676
|
const errors = results.map((r, i) => r.error ? `Segment ${i}: ${r.error}` : null).filter(Boolean);
|
|
653
677
|
if (errors.length > 0) {
|
|
@@ -668,12 +692,26 @@ function mergeTranscriptionResults(results, segments) {
|
|
|
668
692
|
start: (word.start || 0) + segment.startSec,
|
|
669
693
|
end: (word.end || 0) + segment.startSec,
|
|
670
694
|
confidence: word.confidence,
|
|
671
|
-
speaker: word.speaker
|
|
695
|
+
speaker: prefixChunkLabel(i, word.speaker)
|
|
696
|
+
});
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
const mergedSegments = [];
|
|
700
|
+
for (let i = 0; i < results.length; i++) {
|
|
701
|
+
const result = results[i];
|
|
702
|
+
const chunkOffsetMs = Math.round(segments[i].startSec * 1e3);
|
|
703
|
+
if (!result.segments) continue;
|
|
704
|
+
for (const seg of result.segments) {
|
|
705
|
+
mergedSegments.push({
|
|
706
|
+
startMs: seg.startMs + chunkOffsetMs,
|
|
707
|
+
endMs: seg.endMs + chunkOffsetMs,
|
|
708
|
+
text: seg.text,
|
|
709
|
+
...seg.diarization !== void 0 ? { diarization: prefixChunkLabel(i, seg.diarization) } : {}
|
|
672
710
|
});
|
|
673
711
|
}
|
|
674
712
|
}
|
|
675
713
|
const totalDuration = segments.reduce((sum, seg) => sum + seg.durationSec, 0);
|
|
676
|
-
const
|
|
714
|
+
const audioChunks = results.map((r, i) => ({
|
|
677
715
|
index: i,
|
|
678
716
|
startSec: segments[i].startSec,
|
|
679
717
|
endSec: segments[i].endSec,
|
|
@@ -681,37 +719,36 @@ function mergeTranscriptionResults(results, segments) {
|
|
|
681
719
|
}));
|
|
682
720
|
const mergedRawResponse = {
|
|
683
721
|
merged: true,
|
|
684
|
-
|
|
685
|
-
|
|
722
|
+
chunkCount: results.length,
|
|
723
|
+
chunks: results.map((r, i) => ({
|
|
686
724
|
index: i,
|
|
687
725
|
startSec: segments[i].startSec,
|
|
688
726
|
rawResponse: r.rawResponse
|
|
689
|
-
}))
|
|
690
|
-
words: mergedWords
|
|
727
|
+
}))
|
|
691
728
|
};
|
|
692
729
|
const firstResult = results[0];
|
|
693
730
|
return {
|
|
694
731
|
text: mergedText,
|
|
695
|
-
words: mergedWords,
|
|
696
732
|
duration: totalDuration,
|
|
697
733
|
language: firstResult.language,
|
|
698
734
|
model: firstResult.model,
|
|
699
735
|
rawResponse: mergedRawResponse,
|
|
700
|
-
|
|
701
|
-
|
|
736
|
+
audioChunks,
|
|
737
|
+
...mergedWords.length > 0 ? { words: mergedWords } : {},
|
|
738
|
+
...mergedSegments.length > 0 ? { segments: mergedSegments } : {}
|
|
702
739
|
};
|
|
703
740
|
}
|
|
704
741
|
function formatMergedText(result, includeMarkers = false) {
|
|
705
|
-
if (!result.
|
|
742
|
+
if (!result.audioChunks || result.audioChunks.length <= 1) {
|
|
706
743
|
return result.text;
|
|
707
744
|
}
|
|
708
745
|
if (!includeMarkers) {
|
|
709
746
|
return result.text;
|
|
710
747
|
}
|
|
711
|
-
return result.
|
|
712
|
-
const timeStr = formatTimestamp(
|
|
713
|
-
return `[
|
|
714
|
-
${
|
|
748
|
+
return result.audioChunks.map((chunk, i) => {
|
|
749
|
+
const timeStr = formatTimestamp(chunk.startSec);
|
|
750
|
+
return `[Chunk ${i + 1} @ ${timeStr}]
|
|
751
|
+
${chunk.text}`;
|
|
715
752
|
}).join("\n\n");
|
|
716
753
|
}
|
|
717
754
|
function formatTimestamp(seconds) {
|
|
@@ -758,6 +795,93 @@ function formatTranscriptWithPauses(transcript, words, shortPauseThreshold = 1,
|
|
|
758
795
|
}
|
|
759
796
|
return result.join("");
|
|
760
797
|
}
|
|
798
|
+
/**
 * Format a time offset as a clock string.
 *
 * The input is floored to whole seconds and clamped at zero. With `useHours`
 * the result is `H:MM:SS`; otherwise it is `M:SS`, where `M` is the minutes
 * within the current hour (the hour component is simply not printed).
 *
 * @param seconds Offset in seconds; fractional and negative values are accepted.
 * @param useHours Whether to include the hours field.
 * @returns The formatted clock string.
 */
function formatTime(seconds, useHours) {
  const whole = Math.max(0, Math.floor(seconds));
  const secText = String(whole % 60).padStart(2, "0");
  const minutes = Math.floor((whole % 3600) / 60);
  if (!useHours) {
    // Minutes are left unpadded when they are the leading field.
    return [minutes, secText].join(":");
  }
  const hours = Math.floor(whole / 3600);
  return [hours, String(minutes).padStart(2, "0"), secText].join(":");
}
|
|
808
|
+
/**
 * Format a duration for display, including the hours field only when needed.
 *
 * The input is floored to whole seconds and clamped at zero. Durations of an
 * hour or more render as `H:MM:SS`; shorter ones render as `M:SS`.
 *
 * @param seconds Duration in seconds; fractional and negative values are accepted.
 * @returns The formatted duration string.
 */
function formatDurationHuman(seconds) {
  const whole = Math.max(0, Math.floor(seconds));
  const hours = Math.floor(whole / 3600);
  const minutes = Math.floor((whole % 3600) / 60);
  const tail = String(whole % 60).padStart(2, "0");
  // Minutes are zero-padded only when an hours field precedes them.
  const head = hours > 0 ? `${hours}:${String(minutes).padStart(2, "0")}` : String(minutes);
  return `${head}:${tail}`;
}
|
|
816
|
+
/**
 * Render a transcription result as Markdown.
 *
 * Segments with non-blank text are grouped into paragraphs. A new paragraph
 * starts at the first segment, whenever the silence since the previous segment
 * is at least `gapSec`, or whenever the diarization label changes. Each
 * paragraph is emitted as `**[time]** text`. The time uses hours when the total
 * duration is one hour or more. A `· Speaker <label>` suffix is added only when
 * speaker labels are enabled and more than one distinct label is present.
 * When the result has no segments, the trimmed `result.text` is the body.
 *
 * @param result Transcription result; reads `segments`, `text`, `duration`, `model`.
 * @param opts Formatting options:
 *   - `gapSec` (default 1.5): pause length that starts a new paragraph.
 *   - `speakerLabel` (default true): allow speaker suffixes.
 *   - `includeHeader` (default false): prepend a metadata header and a
 *     `## Transcript` heading.
 *   - `durationSec`: overrides the duration used for the header and time format.
 *   - `source`: shown as the title and as a header bullet.
 * @returns The Markdown text ending in a single newline, or `""` when empty.
 */
function formatTranscriptAsMarkdown(result, opts = {}) {
  const segments = result.segments ?? [];
  const gapMs = (opts.gapSec ?? 1.5) * 1e3;

  // Duration precedence: explicit option, then the result's own duration,
  // then the end of the last segment (0 when there are no segments).
  let totalDurationSec = opts.durationSec ?? result.duration;
  if (totalDurationSec === void 0 || totalDurationSec === null) {
    totalDurationSec = segments.length > 0 ? segments[segments.length - 1].endMs / 1e3 : 0;
  }
  const useHours = totalDurationSec >= 3600;

  // Distinct diarization labels decide whether speaker suffixes are worth showing.
  const speakerSet = /* @__PURE__ */ new Set();
  for (const { diarization } of segments) {
    if (diarization === void 0 || diarization === null) continue;
    speakerSet.add(diarization);
  }
  const showSpeakers = (opts.speakerLabel ?? true) && speakerSet.size > 1;

  let headerBlock = "";
  if (opts.includeHeader ?? false) {
    const headerParts = opts.source ? [`# ${opts.source}`] : [];
    const bullets = [];
    if (opts.source) {
      bullets.push(`- Source: \`${opts.source}\``);
    }
    if (totalDurationSec > 0) {
      bullets.push(`- Duration: ${formatDurationHuman(totalDurationSec)}`);
    }
    if (result.model) {
      bullets.push(`- Model: ${result.model}`);
    }
    if (segments.length > 0) {
      bullets.push(`- Segments: ${segments.length}`);
    }
    if (speakerSet.size > 0) {
      bullets.push(`- Speakers (diarized): ${speakerSet.size}`);
    }
    if (bullets.length > 0) {
      headerParts.push(bullets.join("\n"));
    }
    headerParts.push("## Transcript");
    headerBlock = headerParts.join("\n\n");
  }

  const body = [];
  if (segments.length > 0) {
    // The last entry of `paragraphs` is always the paragraph still being built.
    const paragraphs = [];
    for (const seg of segments) {
      const text = (seg.text ?? "").trim();
      if (!text) continue;
      const open = paragraphs[paragraphs.length - 1];
      const startsNew =
        open === void 0 ||
        seg.startMs - open.lastEndMs >= gapMs ||
        open.speaker !== seg.diarization;
      if (startsNew) {
        paragraphs.push({
          startMs: seg.startMs,
          lastEndMs: seg.endMs,
          speaker: seg.diarization,
          texts: [text]
        });
        continue;
      }
      open.texts.push(text);
      open.lastEndMs = seg.endMs;
    }
    for (const para of paragraphs) {
      let tag = formatTime(para.startMs / 1e3, useHours);
      if (showSpeakers && para.speaker !== void 0 && para.speaker !== null) {
        tag += ` \xB7 Speaker ${formatSpeakerLabel(para.speaker)}`;
      }
      body.push(`**[${tag}]** ${para.texts.join(" ")}`);
    }
  } else {
    // No segment data: fall back to the plain transcript text.
    const text = (result.text ?? "").trim();
    if (text) body.push(text);
  }

  const bodyBlock = body.join("\n\n");
  const out =
    headerBlock.length > 0 && bodyBlock.length > 0
      ? `${headerBlock}\n\n${bodyBlock}`
      : headerBlock + bodyBlock;
  if (out.length === 0) return "";
  return out.trimEnd() + "\n";
}
|
|
881
|
+
/**
 * Turn a diarization label into the short form shown after "Speaker ".
 *
 * The value is stringified and a `speaker_` / `speaker-` prefix is dropped, so
 * `"speaker_1"` becomes `"1"` and numeric labels such as `0` become `"0"`.
 * Labels from merged multi-chunk results carry a `chunk<index>/` namespace
 * (e.g. `"chunk0/speaker_1"`). That namespace is kept and only the redundant
 * `speaker_` part after it is removed, giving `"chunk0/1"`.
 *
 * @param value Diarization label (string or number).
 * @returns The label text to display.
 */
function formatSpeakerLabel(value) {
  const s = String(value);
  // Group 1 captures any leading `chunkN/` namespaces so they survive the strip.
  return s.replace(/^((?:chunk\d+\/)*)speaker[_-]/, "$1");
}
|
|
761
885
|
|
|
762
886
|
// src/utils/transcription/transcribe.ts
|
|
763
887
|
import * as fs3 from "fs";
|
|
@@ -862,6 +986,7 @@ function createMistralTranscriber(config) {
|
|
|
862
986
|
duration: knownDuration,
|
|
863
987
|
language,
|
|
864
988
|
model = defaultModel,
|
|
989
|
+
contextBias,
|
|
865
990
|
diarize = true,
|
|
866
991
|
timestamps = language ? void 0 : "segment",
|
|
867
992
|
autoSplit,
|
|
@@ -881,9 +1006,10 @@ function createMistralTranscriber(config) {
|
|
|
881
1006
|
model,
|
|
882
1007
|
language,
|
|
883
1008
|
diarize,
|
|
884
|
-
timestampGranularity: timestamps
|
|
1009
|
+
timestampGranularity: timestamps,
|
|
1010
|
+
contextBias
|
|
885
1011
|
});
|
|
886
|
-
return
|
|
1012
|
+
return result;
|
|
887
1013
|
}
|
|
888
1014
|
if (audioUrl) {
|
|
889
1015
|
if (autoSplit === false) {
|
|
@@ -894,9 +1020,10 @@ function createMistralTranscriber(config) {
|
|
|
894
1020
|
model,
|
|
895
1021
|
language,
|
|
896
1022
|
diarize,
|
|
897
|
-
timestampGranularity: timestamps
|
|
1023
|
+
timestampGranularity: timestamps,
|
|
1024
|
+
contextBias
|
|
898
1025
|
});
|
|
899
|
-
return
|
|
1026
|
+
return result2;
|
|
900
1027
|
}
|
|
901
1028
|
let duration2 = knownDuration;
|
|
902
1029
|
if (duration2 === void 0) {
|
|
@@ -918,9 +1045,10 @@ function createMistralTranscriber(config) {
|
|
|
918
1045
|
model,
|
|
919
1046
|
language,
|
|
920
1047
|
diarize,
|
|
921
|
-
timestampGranularity: timestamps
|
|
1048
|
+
timestampGranularity: timestamps,
|
|
1049
|
+
contextBias
|
|
922
1050
|
});
|
|
923
|
-
return
|
|
1051
|
+
return result2;
|
|
924
1052
|
}
|
|
925
1053
|
log.info(`Downloading URL to temp file for processing...`);
|
|
926
1054
|
const outDir2 = splitOutputDir || path3.join(os.tmpdir(), `tranz-${Date.now()}`);
|
|
@@ -933,6 +1061,7 @@ function createMistralTranscriber(config) {
|
|
|
933
1061
|
model,
|
|
934
1062
|
diarize,
|
|
935
1063
|
timestamps,
|
|
1064
|
+
contextBias,
|
|
936
1065
|
autoSplit: true,
|
|
937
1066
|
splitOutputDir: outDir2,
|
|
938
1067
|
logger: customLogger,
|
|
@@ -959,9 +1088,10 @@ function createMistralTranscriber(config) {
|
|
|
959
1088
|
model,
|
|
960
1089
|
language,
|
|
961
1090
|
diarize,
|
|
962
|
-
timestampGranularity: timestamps
|
|
1091
|
+
timestampGranularity: timestamps,
|
|
1092
|
+
contextBias
|
|
963
1093
|
});
|
|
964
|
-
return
|
|
1094
|
+
return result;
|
|
965
1095
|
}
|
|
966
1096
|
log.info(`Duration ${duration.toFixed(1)}s > ${maxDuration}s, splitting audio...`);
|
|
967
1097
|
const outDir = splitOutputDir || path3.join(os.tmpdir(), `tranz-split-${Date.now()}`);
|
|
@@ -980,7 +1110,8 @@ function createMistralTranscriber(config) {
|
|
|
980
1110
|
model,
|
|
981
1111
|
language,
|
|
982
1112
|
diarize,
|
|
983
|
-
timestampGranularity: timestamps
|
|
1113
|
+
timestampGranularity: timestamps,
|
|
1114
|
+
contextBias
|
|
984
1115
|
});
|
|
985
1116
|
results.push(result);
|
|
986
1117
|
}
|
|
@@ -1004,6 +1135,7 @@ export {
|
|
|
1004
1135
|
detectSilenceRegions,
|
|
1005
1136
|
findOptimalSplitPoints,
|
|
1006
1137
|
formatMergedText,
|
|
1138
|
+
formatTranscriptAsMarkdown,
|
|
1007
1139
|
formatTranscriptWithPauses,
|
|
1008
1140
|
getAudioDuration,
|
|
1009
1141
|
mergeTranscriptionResults,
|
package/dist/providers.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
/**
|
|
2
2
|
* @wovin/tranz/providers - Transcription provider implementations
|
|
3
3
|
*/
|
|
4
|
-
export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, } from './utils/transcription/providers.ts';
|
|
4
|
+
export { createProvider, MistralProvider, WhisperProvider, GreenPTProvider, VOXTRAL_LIMITS, type ProviderName, type TranscribeParams, type TranscriptionResult, type TranscriptionProvider, type TranscriptSegment, } from './utils/transcription/providers.ts';
|
|
5
5
|
export { createMistralTranscriber, transcribe, type TranscribeOptions, type MistralTranscriberConfig, } from './utils/transcription/transcribe.ts';
|
|
6
6
|
//# sourceMappingURL=providers.d.ts.map
|
package/dist/providers.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,
|
|
1
|
+
{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../src/providers.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,EACL,cAAc,EACd,eAAe,EACf,eAAe,EACf,eAAe,EACf,cAAc,EACd,KAAK,YAAY,EACjB,KAAK,gBAAgB,EACrB,KAAK,mBAAmB,EACxB,KAAK,qBAAqB,EAC1B,KAAK,iBAAiB,GACvB,MAAM,oCAAoC,CAAA;AAE3C,OAAO,EACL,wBAAwB,EACxB,UAAU,EACV,KAAK,iBAAiB,EACtB,KAAK,wBAAwB,GAC9B,MAAM,qCAAqC,CAAA"}
|