@steipete/summarize-core 0.11.1 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/esm/content/bun.js +21 -0
- package/dist/esm/content/bun.js.map +1 -0
- package/dist/esm/content/direct-media.js +100 -0
- package/dist/esm/content/direct-media.js.map +1 -0
- package/dist/esm/content/index.js +2 -1
- package/dist/esm/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/client.js +6 -0
- package/dist/esm/content/link-preview/client.js.map +1 -1
- package/dist/esm/content/link-preview/content/fetcher.js +19 -2
- package/dist/esm/content/link-preview/content/fetcher.js.map +1 -1
- package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -1
- package/dist/esm/content/link-preview/content/html.js.map +1 -1
- package/dist/esm/content/link-preview/content/index.js +29 -12
- package/dist/esm/content/link-preview/content/index.js.map +1 -1
- package/dist/esm/content/link-preview/content/utils.js.map +1 -1
- package/dist/esm/content/link-preview/content/video.js +1 -1
- package/dist/esm/content/link-preview/content/video.js.map +1 -1
- package/dist/esm/content/local-file.js +58 -0
- package/dist/esm/content/local-file.js.map +1 -0
- package/dist/esm/content/transcript/index.js +2 -0
- package/dist/esm/content/transcript/index.js.map +1 -1
- package/dist/esm/content/transcript/providers/generic-direct-media.js +47 -0
- package/dist/esm/content/transcript/providers/generic-direct-media.js.map +1 -0
- package/dist/esm/content/transcript/providers/generic-embedded.js +126 -0
- package/dist/esm/content/transcript/providers/generic-embedded.js.map +1 -0
- package/dist/esm/content/transcript/providers/generic-twitter.js +78 -0
- package/dist/esm/content/transcript/providers/generic-twitter.js.map +1 -0
- package/dist/esm/content/transcript/providers/generic.js +12 -248
- package/dist/esm/content/transcript/providers/generic.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/media.js +9 -1
- package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/provider-flow.js +157 -0
- package/dist/esm/content/transcript/providers/podcast/provider-flow.js.map +1 -0
- package/dist/esm/content/transcript/providers/podcast/rss-feed.js +123 -0
- package/dist/esm/content/transcript/providers/podcast/rss-feed.js.map +1 -0
- package/dist/esm/content/transcript/providers/podcast/rss-transcript.js +113 -0
- package/dist/esm/content/transcript/providers/podcast/rss-transcript.js.map +1 -0
- package/dist/esm/content/transcript/providers/podcast/rss.js +2 -226
- package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -1
- package/dist/esm/content/transcript/providers/podcast.js +26 -155
- package/dist/esm/content/transcript/providers/podcast.js.map +1 -1
- package/dist/esm/content/transcript/providers/transcription-capability.js +22 -0
- package/dist/esm/content/transcript/providers/transcription-capability.js.map +1 -0
- package/dist/esm/content/transcript/providers/transcription-start.js +43 -32
- package/dist/esm/content/transcript/providers/transcription-start.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/api.js +3 -2
- package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/captions-player.js +173 -0
- package/dist/esm/content/transcript/providers/youtube/captions-player.js.map +1 -0
- package/dist/esm/content/transcript/providers/youtube/captions-shared.js +8 -0
- package/dist/esm/content/transcript/providers/youtube/captions-shared.js.map +1 -0
- package/dist/esm/content/transcript/providers/youtube/captions-transcript.js +361 -0
- package/dist/esm/content/transcript/providers/youtube/captions-transcript.js.map +1 -0
- package/dist/esm/content/transcript/providers/youtube/captions.js +2 -557
- package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube/provider-flow.js +217 -0
- package/dist/esm/content/transcript/providers/youtube/provider-flow.js.map +1 -0
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +33 -9
- package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -1
- package/dist/esm/content/transcript/providers/youtube.js +42 -194
- package/dist/esm/content/transcript/providers/youtube.js.map +1 -1
- package/dist/esm/content/transcript/transcription-config.js +24 -4
- package/dist/esm/content/transcript/transcription-config.js.map +1 -1
- package/dist/esm/content/url.js +5 -33
- package/dist/esm/content/url.js.map +1 -1
- package/dist/esm/processes.js.map +1 -1
- package/dist/esm/prompts/format.js +6 -0
- package/dist/esm/prompts/format.js.map +1 -1
- package/dist/esm/prompts/link-summary.js +27 -3
- package/dist/esm/prompts/link-summary.js.map +1 -1
- package/dist/esm/transcription/onnx-cli.js.map +1 -1
- package/dist/esm/transcription/whisper/assemblyai.js +132 -0
- package/dist/esm/transcription/whisper/assemblyai.js.map +1 -0
- package/dist/esm/transcription/whisper/chunking.js +64 -0
- package/dist/esm/transcription/whisper/chunking.js.map +1 -0
- package/dist/esm/transcription/whisper/cloud-providers.js +69 -0
- package/dist/esm/transcription/whisper/cloud-providers.js.map +1 -0
- package/dist/esm/transcription/whisper/core.js +320 -390
- package/dist/esm/transcription/whisper/core.js.map +1 -1
- package/dist/esm/transcription/whisper/gemini.js +324 -0
- package/dist/esm/transcription/whisper/gemini.js.map +1 -0
- package/dist/esm/transcription/whisper/groq.js +62 -1
- package/dist/esm/transcription/whisper/groq.js.map +1 -1
- package/dist/esm/transcription/whisper/preferences.js +16 -0
- package/dist/esm/transcription/whisper/preferences.js.map +1 -0
- package/dist/esm/transcription/whisper/provider-setup.js +62 -0
- package/dist/esm/transcription/whisper/provider-setup.js.map +1 -0
- package/dist/esm/transcription/whisper/remote-provider-attempts.js +189 -0
- package/dist/esm/transcription/whisper/remote-provider-attempts.js.map +1 -0
- package/dist/esm/transcription/whisper/remote.js +220 -0
- package/dist/esm/transcription/whisper/remote.js.map +1 -0
- package/dist/esm/transcription/whisper/whisper-cpp.js +21 -18
- package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -1
- package/dist/types/content/bun.d.ts +6 -0
- package/dist/types/content/direct-media.d.ts +9 -0
- package/dist/types/content/index.d.ts +2 -1
- package/dist/types/content/link-preview/client.d.ts +3 -1
- package/dist/types/content/link-preview/content/fetcher.d.ts +1 -1
- package/dist/types/content/link-preview/content/html.d.ts +1 -1
- package/dist/types/content/link-preview/deps.d.ts +8 -2
- package/dist/types/content/link-preview/types.d.ts +1 -1
- package/dist/types/content/local-file.d.ts +16 -0
- package/dist/types/content/transcript/providers/generic-direct-media.d.ts +11 -0
- package/dist/types/content/transcript/providers/generic-embedded.d.ts +16 -0
- package/dist/types/content/transcript/providers/generic-twitter.d.ts +11 -0
- package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +3 -0
- package/dist/types/content/transcript/providers/podcast/media.d.ts +4 -2
- package/dist/types/content/transcript/providers/podcast/provider-flow.d.ts +7 -0
- package/dist/types/content/transcript/providers/podcast/rss-feed.d.ts +15 -0
- package/dist/types/content/transcript/providers/podcast/rss-transcript.d.ts +12 -0
- package/dist/types/content/transcript/providers/podcast/rss.d.ts +2 -24
- package/dist/types/content/transcript/providers/transcription-capability.d.ts +18 -0
- package/dist/types/content/transcript/providers/transcription-start.d.ts +11 -3
- package/dist/types/content/transcript/providers/youtube/captions-player.d.ts +12 -0
- package/dist/types/content/transcript/providers/youtube/captions-shared.d.ts +42 -0
- package/dist/types/content/transcript/providers/youtube/captions-transcript.d.ts +4 -0
- package/dist/types/content/transcript/providers/youtube/captions.d.ts +2 -19
- package/dist/types/content/transcript/providers/youtube/provider-flow.d.ts +34 -0
- package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +4 -2
- package/dist/types/content/transcript/transcription-config.d.ts +6 -0
- package/dist/types/content/transcript/types.d.ts +1 -0
- package/dist/types/content/url.d.ts +2 -3
- package/dist/types/prompts/format.d.ts +1 -0
- package/dist/types/prompts/link-summary.d.ts +2 -1
- package/dist/types/transcription/whisper/assemblyai.d.ts +17 -0
- package/dist/types/transcription/whisper/chunking.d.ts +11 -0
- package/dist/types/transcription/whisper/cloud-providers.d.ts +22 -0
- package/dist/types/transcription/whisper/core.d.ts +12 -14
- package/dist/types/transcription/whisper/gemini.d.ts +14 -0
- package/dist/types/transcription/whisper/preferences.d.ts +4 -0
- package/dist/types/transcription/whisper/provider-setup.d.ts +30 -0
- package/dist/types/transcription/whisper/remote-provider-attempts.d.ts +51 -0
- package/dist/types/transcription/whisper/remote.d.ts +51 -0
- package/dist/types/transcription/whisper/types.d.ts +1 -1
- package/dist/types/transcription/whisper/whisper-cpp.d.ts +4 -3
- package/package.json +14 -10
|
@@ -2,459 +2,389 @@ import { randomUUID } from "node:crypto";
|
|
|
2
2
|
import { promises as fs } from "node:fs";
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { basename, join } from "node:path";
|
|
5
|
-
import {
|
|
5
|
+
import { transcribeWithOnnxCli, transcribeWithOnnxCliFile } from "../onnx-cli.js";
|
|
6
|
+
import { transcribeChunkedFile } from "./chunking.js";
|
|
6
7
|
import { DEFAULT_SEGMENT_SECONDS, MAX_OPENAI_UPLOAD_BYTES } from "./constants.js";
|
|
7
|
-
import {
|
|
8
|
-
import { isFfmpegAvailable, runFfmpegSegment, transcodeBytesToMp3 } from "./ffmpeg.js";
|
|
8
|
+
import { isFfmpegAvailable, transcodeBytesToMp3 } from "./ffmpeg.js";
|
|
9
9
|
import { shouldRetryGroqViaFfmpeg, transcribeWithGroq } from "./groq.js";
|
|
10
|
-
import {
|
|
11
|
-
import {
|
|
10
|
+
import { resolveOnnxModelPreference } from "./preferences.js";
|
|
11
|
+
import { transcribeBytesWithRemoteFallbacks, transcribeFileWithRemoteFallbacks, transcribeOversizedBytesViaTempFile, } from "./remote.js";
|
|
12
|
+
import { ensureWhisperFilenameExtension, formatBytes, wrapError } from "./utils.js";
|
|
12
13
|
import { isWhisperCppReady, transcribeWithWhisperCppFile } from "./whisper-cpp.js";
|
|
13
|
-
function
|
|
14
|
-
const raw = env.SUMMARIZE_TRANSCRIBER?.trim().toLowerCase();
|
|
15
|
-
if (raw === "auto" || raw === "whisper" || raw === "parakeet" || raw === "canary")
|
|
16
|
-
return raw;
|
|
17
|
-
return "auto";
|
|
18
|
-
}
|
|
19
|
-
function resolveOnnxModelPreference(env) {
|
|
20
|
-
const preference = resolveTranscriberPreference(env);
|
|
21
|
-
if (preference === "parakeet" || preference === "canary")
|
|
22
|
-
return preference;
|
|
23
|
-
if (preference === "auto")
|
|
24
|
-
return resolvePreferredOnnxModel(env);
|
|
25
|
-
return null;
|
|
26
|
-
}
|
|
27
|
-
export async function transcribeMediaWithWhisper({ bytes, mediaType, filename, groqApiKey, skipGroq = false, openaiApiKey, falApiKey, totalDurationSeconds = null, onProgress, env = process.env, }) {
|
|
14
|
+
export async function transcribeMediaWithWhisper({ bytes, mediaType, filename, groqApiKey, skipGroq = false, assemblyaiApiKey = null, geminiApiKey = null, openaiApiKey, falApiKey, totalDurationSeconds = null, onProgress, env = process.env, }) {
|
|
28
15
|
const notes = [];
|
|
29
|
-
// 1. Groq (cloud, free, fastest)
|
|
30
16
|
let groqError = null;
|
|
31
17
|
if (groqApiKey && !skipGroq) {
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
const canTranscode = await isFfmpegAvailable();
|
|
45
|
-
if (canTranscode) {
|
|
46
|
-
try {
|
|
47
|
-
notes.push("Groq could not decode media; transcoding via ffmpeg and retrying");
|
|
48
|
-
const mp3Bytes = await transcodeBytesToMp3(bytes);
|
|
49
|
-
const retried = await transcribeWithGroq(mp3Bytes, "audio/mpeg", "audio.mp3", groqApiKey);
|
|
50
|
-
if (retried) {
|
|
51
|
-
return { text: retried, provider: "groq", error: null, notes };
|
|
52
|
-
}
|
|
53
|
-
groqError = new Error("Groq transcription returned empty text after ffmpeg transcode");
|
|
54
|
-
bytes = mp3Bytes;
|
|
55
|
-
mediaType = "audio/mpeg";
|
|
56
|
-
filename = "audio.mp3";
|
|
57
|
-
}
|
|
58
|
-
catch (error) {
|
|
59
|
-
notes.push(`ffmpeg transcode failed; cannot retry Groq decode error: ${error instanceof Error ? error.message : String(error)}`);
|
|
60
|
-
}
|
|
61
|
-
}
|
|
62
|
-
else {
|
|
63
|
-
notes.push("Groq could not decode media; install ffmpeg to enable transcoding retry");
|
|
18
|
+
const groqResult = await transcribeWithGroqFirst({
|
|
19
|
+
bytes,
|
|
20
|
+
mediaType,
|
|
21
|
+
filename,
|
|
22
|
+
groqApiKey,
|
|
23
|
+
notes,
|
|
24
|
+
});
|
|
25
|
+
bytes = groqResult.bytes;
|
|
26
|
+
mediaType = groqResult.mediaType;
|
|
27
|
+
filename = groqResult.filename;
|
|
28
|
+
if (groqResult.text) {
|
|
29
|
+
return { text: groqResult.text, provider: "groq", error: null, notes };
|
|
64
30
|
}
|
|
31
|
+
groqError = groqResult.error;
|
|
65
32
|
}
|
|
66
33
|
if (groqError) {
|
|
67
|
-
notes.push(`Groq transcription failed; falling back to local/OpenAI: ${groqError.message}`);
|
|
34
|
+
notes.push(`Groq transcription failed; falling back to local/AssemblyAI/Gemini/OpenAI: ${groqError.message}`);
|
|
68
35
|
}
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
36
|
+
const onnx = await transcribeWithLocalOnnx({
|
|
37
|
+
bytes,
|
|
38
|
+
mediaType,
|
|
39
|
+
filename,
|
|
40
|
+
totalDurationSeconds,
|
|
41
|
+
onProgress,
|
|
42
|
+
env,
|
|
43
|
+
notes,
|
|
44
|
+
});
|
|
45
|
+
if (onnx)
|
|
46
|
+
return onnx;
|
|
47
|
+
const local = await transcribeWithLocalWhisperBytes({
|
|
48
|
+
bytes,
|
|
49
|
+
mediaType,
|
|
50
|
+
filename,
|
|
51
|
+
totalDurationSeconds,
|
|
52
|
+
onProgress,
|
|
53
|
+
env,
|
|
54
|
+
notes,
|
|
55
|
+
});
|
|
56
|
+
if (local)
|
|
57
|
+
return local;
|
|
58
|
+
return await transcribeBytesWithRemoteFallbacks({
|
|
59
|
+
bytes,
|
|
60
|
+
mediaType,
|
|
61
|
+
filename,
|
|
62
|
+
notes,
|
|
63
|
+
groqApiKey,
|
|
64
|
+
groqError,
|
|
65
|
+
assemblyaiApiKey,
|
|
66
|
+
geminiApiKey,
|
|
67
|
+
openaiApiKey,
|
|
68
|
+
falApiKey,
|
|
69
|
+
env,
|
|
70
|
+
onProgress,
|
|
71
|
+
transcribeOversizedBytesWithChunking: ({ bytes, mediaType, filename, onProgress }) => transcribeOversizedBytesViaTempFile({
|
|
74
72
|
bytes,
|
|
75
73
|
mediaType,
|
|
76
74
|
filename,
|
|
75
|
+
onProgress,
|
|
76
|
+
transcribeFile: ({ filePath, mediaType, filename, onProgress }) => transcribeMediaFileWithWhisper({
|
|
77
|
+
filePath,
|
|
78
|
+
mediaType,
|
|
79
|
+
filename,
|
|
80
|
+
groqApiKey,
|
|
81
|
+
assemblyaiApiKey,
|
|
82
|
+
geminiApiKey,
|
|
83
|
+
openaiApiKey,
|
|
84
|
+
falApiKey,
|
|
85
|
+
segmentSeconds: DEFAULT_SEGMENT_SECONDS,
|
|
86
|
+
onProgress,
|
|
87
|
+
env,
|
|
88
|
+
}),
|
|
89
|
+
}),
|
|
90
|
+
});
|
|
91
|
+
}
|
|
92
|
+
export async function transcribeMediaFileWithWhisper({ filePath, mediaType, filename, groqApiKey, assemblyaiApiKey = null, geminiApiKey = null, openaiApiKey, falApiKey, segmentSeconds = DEFAULT_SEGMENT_SECONDS, totalDurationSeconds = null, onProgress = null, env = process.env, }) {
|
|
93
|
+
const notes = [];
|
|
94
|
+
let skipGroqInNestedCalls = false;
|
|
95
|
+
let groqError = null;
|
|
96
|
+
if (groqApiKey) {
|
|
97
|
+
skipGroqInNestedCalls = true;
|
|
98
|
+
const groqResult = await transcribeGroqFileFirst({
|
|
99
|
+
filePath,
|
|
100
|
+
mediaType,
|
|
101
|
+
filename,
|
|
102
|
+
groqApiKey,
|
|
103
|
+
assemblyaiApiKey,
|
|
104
|
+
geminiApiKey,
|
|
105
|
+
openaiApiKey,
|
|
106
|
+
falApiKey,
|
|
107
|
+
segmentSeconds,
|
|
77
108
|
totalDurationSeconds,
|
|
78
109
|
onProgress,
|
|
79
110
|
env,
|
|
80
|
-
});
|
|
81
|
-
if (onnx.text) {
|
|
82
|
-
if (onnx.notes.length > 0)
|
|
83
|
-
notes.push(...onnx.notes);
|
|
84
|
-
return { ...onnx, notes };
|
|
85
|
-
}
|
|
86
|
-
if (onnx.notes.length > 0)
|
|
87
|
-
notes.push(...onnx.notes);
|
|
88
|
-
if (onnx.error) {
|
|
89
|
-
notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
|
|
90
|
-
}
|
|
91
|
-
}
|
|
92
|
-
// 3. whisper.cpp (local)
|
|
93
|
-
const localReady = await isWhisperCppReady();
|
|
94
|
-
let local = null;
|
|
95
|
-
if (localReady) {
|
|
96
|
-
const nameHint = filename?.trim() ? basename(filename.trim()) : "media";
|
|
97
|
-
const tempFile = join(tmpdir(), `summarize-whisper-local-${randomUUID()}-${ensureWhisperFilenameExtension(nameHint, mediaType)}`);
|
|
98
|
-
try {
|
|
99
|
-
await fs.writeFile(tempFile, bytes);
|
|
100
|
-
try {
|
|
101
|
-
local = await transcribeWithWhisperCppFile({
|
|
102
|
-
filePath: tempFile,
|
|
103
|
-
mediaType,
|
|
104
|
-
totalDurationSeconds,
|
|
105
|
-
onProgress,
|
|
106
|
-
});
|
|
107
|
-
}
|
|
108
|
-
catch (error) {
|
|
109
|
-
local = {
|
|
110
|
-
text: null,
|
|
111
|
-
provider: "whisper.cpp",
|
|
112
|
-
error: wrapError("whisper.cpp failed", error),
|
|
113
|
-
notes: [],
|
|
114
|
-
};
|
|
115
|
-
}
|
|
116
|
-
if (local.text) {
|
|
117
|
-
if (local.notes.length > 0)
|
|
118
|
-
notes.push(...local.notes);
|
|
119
|
-
return { ...local, notes };
|
|
120
|
-
}
|
|
121
|
-
if (local.notes.length > 0)
|
|
122
|
-
notes.push(...local.notes);
|
|
123
|
-
if (local.error) {
|
|
124
|
-
notes.push(`whisper.cpp failed; falling back to remote Whisper: ${local.error.message}`);
|
|
125
|
-
}
|
|
126
|
-
}
|
|
127
|
-
finally {
|
|
128
|
-
await fs.unlink(tempFile).catch(() => { });
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
// 4. OpenAI / FAL (cloud fallbacks)
|
|
132
|
-
if (!groqApiKey && !openaiApiKey && !falApiKey) {
|
|
133
|
-
return {
|
|
134
|
-
text: null,
|
|
135
|
-
provider: null,
|
|
136
|
-
error: new Error("No transcription providers available (install whisper-cpp or set GROQ_API_KEY, OPENAI_API_KEY, or FAL_KEY)"),
|
|
137
111
|
notes,
|
|
138
|
-
};
|
|
112
|
+
});
|
|
113
|
+
if (groqResult.text)
|
|
114
|
+
return groqResult;
|
|
115
|
+
groqError = groqResult.error;
|
|
139
116
|
}
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
117
|
+
const onnx = await transcribeWithLocalOnnxFile({
|
|
118
|
+
filePath,
|
|
119
|
+
mediaType,
|
|
120
|
+
totalDurationSeconds,
|
|
121
|
+
onProgress,
|
|
122
|
+
env,
|
|
123
|
+
notes,
|
|
124
|
+
});
|
|
125
|
+
if (onnx)
|
|
126
|
+
return onnx;
|
|
127
|
+
const local = await transcribeWithLocalWhisperFile({
|
|
128
|
+
filePath,
|
|
129
|
+
mediaType,
|
|
130
|
+
totalDurationSeconds,
|
|
131
|
+
onProgress,
|
|
132
|
+
env,
|
|
133
|
+
notes,
|
|
134
|
+
});
|
|
135
|
+
if (local)
|
|
136
|
+
return local;
|
|
137
|
+
return await transcribeFileWithRemoteFallbacks({
|
|
138
|
+
filePath,
|
|
139
|
+
mediaType,
|
|
140
|
+
filename,
|
|
141
|
+
notes,
|
|
142
|
+
groqApiKey,
|
|
143
|
+
groqError,
|
|
144
|
+
assemblyaiApiKey,
|
|
145
|
+
geminiApiKey,
|
|
146
|
+
openaiApiKey,
|
|
147
|
+
falApiKey,
|
|
148
|
+
env,
|
|
149
|
+
totalDurationSeconds,
|
|
150
|
+
onProgress,
|
|
151
|
+
transcribeChunkedFile: ({ filePath, segmentSeconds, totalDurationSeconds, onProgress }) => transcribeChunkedFile({
|
|
152
|
+
filePath,
|
|
153
|
+
segmentSeconds,
|
|
154
|
+
totalDurationSeconds,
|
|
155
|
+
onProgress,
|
|
156
|
+
transcribeSegment: ({ bytes, filename }) => transcribeMediaWithWhisper({
|
|
157
|
+
bytes,
|
|
158
|
+
mediaType: "audio/mpeg",
|
|
159
|
+
filename,
|
|
160
|
+
groqApiKey,
|
|
161
|
+
skipGroq: skipGroqInNestedCalls,
|
|
162
|
+
assemblyaiApiKey,
|
|
163
|
+
geminiApiKey,
|
|
164
|
+
openaiApiKey,
|
|
165
|
+
falApiKey,
|
|
166
|
+
env,
|
|
167
|
+
}),
|
|
168
|
+
}),
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
async function transcribeWithGroqFirst({ bytes, mediaType, filename, groqApiKey, notes, }) {
|
|
172
|
+
let groqError = null;
|
|
173
|
+
try {
|
|
174
|
+
const text = await transcribeWithGroq(bytes, mediaType, filename, groqApiKey);
|
|
175
|
+
if (text)
|
|
176
|
+
return { text, error: null, bytes, mediaType, filename };
|
|
177
|
+
groqError = new Error("Groq transcription returned empty text");
|
|
165
178
|
}
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
try {
|
|
169
|
-
const text = await transcribeWithOpenAi(bytes, mediaType, filename, openaiApiKey, { env });
|
|
170
|
-
if (text) {
|
|
171
|
-
return { text, provider: "openai", error: null, notes };
|
|
172
|
-
}
|
|
173
|
-
openaiError = new Error("OpenAI transcription returned empty text");
|
|
174
|
-
}
|
|
175
|
-
catch (error) {
|
|
176
|
-
openaiError = wrapError("OpenAI transcription failed", error);
|
|
177
|
-
}
|
|
179
|
+
catch (error) {
|
|
180
|
+
groqError = wrapError("Groq transcription failed", error);
|
|
178
181
|
}
|
|
179
|
-
if (
|
|
182
|
+
if (groqError && shouldRetryGroqViaFfmpeg(groqError)) {
|
|
180
183
|
const canTranscode = await isFfmpegAvailable();
|
|
181
184
|
if (canTranscode) {
|
|
182
185
|
try {
|
|
183
|
-
|
|
184
|
-
// is the most reliable cross-format fallback (and also reduces upload size).
|
|
185
|
-
notes.push("OpenAI could not decode media; transcoding via ffmpeg and retrying");
|
|
186
|
+
notes.push("Groq could not decode media; transcoding via ffmpeg and retrying");
|
|
186
187
|
const mp3Bytes = await transcodeBytesToMp3(bytes);
|
|
187
|
-
const retried = await
|
|
188
|
+
const retried = await transcribeWithGroq(mp3Bytes, "audio/mpeg", "audio.mp3", groqApiKey);
|
|
188
189
|
if (retried) {
|
|
189
|
-
return {
|
|
190
|
+
return {
|
|
191
|
+
text: retried,
|
|
192
|
+
error: null,
|
|
193
|
+
bytes: mp3Bytes,
|
|
194
|
+
mediaType: "audio/mpeg",
|
|
195
|
+
filename: "audio.mp3",
|
|
196
|
+
};
|
|
190
197
|
}
|
|
191
|
-
|
|
198
|
+
groqError = new Error("Groq transcription returned empty text after ffmpeg transcode");
|
|
192
199
|
bytes = mp3Bytes;
|
|
193
200
|
mediaType = "audio/mpeg";
|
|
194
201
|
filename = "audio.mp3";
|
|
195
202
|
}
|
|
196
203
|
catch (error) {
|
|
197
|
-
notes.push(`ffmpeg transcode failed; cannot retry
|
|
204
|
+
notes.push(`ffmpeg transcode failed; cannot retry Groq decode error: ${error instanceof Error ? error.message : String(error)}`);
|
|
198
205
|
}
|
|
199
206
|
}
|
|
200
207
|
else {
|
|
201
|
-
notes.push("
|
|
208
|
+
notes.push("Groq could not decode media; install ffmpeg to enable transcoding retry");
|
|
202
209
|
}
|
|
203
210
|
}
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
if (
|
|
209
|
-
|
|
210
|
-
}
|
|
211
|
-
if (falApiKey && canUseFal) {
|
|
211
|
+
return { text: null, error: groqError, bytes, mediaType, filename };
|
|
212
|
+
}
|
|
213
|
+
async function transcribeGroqFileFirst({ filePath, mediaType, filename, groqApiKey, assemblyaiApiKey, geminiApiKey, openaiApiKey, falApiKey, segmentSeconds, totalDurationSeconds, onProgress, env, notes, }) {
|
|
214
|
+
const stat = await fs.stat(filePath);
|
|
215
|
+
if (stat.size <= MAX_OPENAI_UPLOAD_BYTES) {
|
|
216
|
+
const fileBytes = new Uint8Array(await fs.readFile(filePath));
|
|
212
217
|
try {
|
|
213
|
-
const text = await
|
|
214
|
-
if (text)
|
|
215
|
-
return { text, provider: "
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
provider: "fal",
|
|
220
|
-
error: new Error("FAL transcription returned empty text"),
|
|
221
|
-
notes,
|
|
222
|
-
};
|
|
218
|
+
const text = await transcribeWithGroq(fileBytes, mediaType, filename, groqApiKey);
|
|
219
|
+
if (text)
|
|
220
|
+
return { text, provider: "groq", error: null, notes };
|
|
221
|
+
const error = new Error("Groq transcription returned empty text");
|
|
222
|
+
notes.push("Groq transcription returned empty text; falling back to local/AssemblyAI/Gemini/OpenAI");
|
|
223
|
+
return { text: null, provider: "groq", error, notes };
|
|
223
224
|
}
|
|
224
225
|
catch (error) {
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
error: wrapError("FAL transcription failed", error),
|
|
229
|
-
notes,
|
|
230
|
-
};
|
|
226
|
+
const wrapped = wrapError("Groq transcription failed", error);
|
|
227
|
+
notes.push(`Groq transcription failed; falling back to local/AssemblyAI/Gemini/OpenAI: ${error instanceof Error ? error.message : String(error)}`);
|
|
228
|
+
return { text: null, provider: "groq", error: wrapped, notes };
|
|
231
229
|
}
|
|
232
230
|
}
|
|
233
|
-
const
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
231
|
+
const canChunk = await isFfmpegAvailable();
|
|
232
|
+
if (!canChunk) {
|
|
233
|
+
const error = new Error(`File too large for Groq upload (${formatBytes(stat.size)}); trying local providers`);
|
|
234
|
+
notes.push(error.message);
|
|
235
|
+
return { text: null, provider: "groq", error, notes };
|
|
236
|
+
}
|
|
237
|
+
const chunked = await transcribeChunkedFile({
|
|
238
|
+
filePath,
|
|
239
|
+
segmentSeconds,
|
|
240
|
+
totalDurationSeconds,
|
|
241
|
+
onProgress,
|
|
242
|
+
transcribeSegment: ({ bytes, filename }) => transcribeMediaWithWhisper({
|
|
243
|
+
bytes,
|
|
244
|
+
mediaType: "audio/mpeg",
|
|
245
|
+
filename,
|
|
246
|
+
groqApiKey,
|
|
247
|
+
assemblyaiApiKey,
|
|
248
|
+
geminiApiKey,
|
|
249
|
+
openaiApiKey,
|
|
250
|
+
falApiKey,
|
|
251
|
+
env,
|
|
252
|
+
}),
|
|
253
|
+
});
|
|
254
|
+
if (chunked.notes.length > 0)
|
|
255
|
+
notes.push(...chunked.notes);
|
|
256
|
+
if (chunked.text)
|
|
257
|
+
return { ...chunked, notes };
|
|
258
|
+
const error = chunked.error ?? new Error("Groq chunked transcription failed");
|
|
259
|
+
notes.push(`Groq chunked transcription failed; falling back to local/AssemblyAI/Gemini/OpenAI: ${error.message}`);
|
|
260
|
+
return { text: null, provider: "groq", error, notes };
|
|
242
261
|
}
|
|
243
|
-
|
|
244
|
-
const
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
}
|
|
266
|
-
else {
|
|
267
|
-
groqError = new Error(`File too large for Groq upload (${formatBytes(stat.size)}); trying local providers`);
|
|
268
|
-
notes.push(groqError.message);
|
|
269
|
-
}
|
|
262
|
+
async function transcribeWithLocalOnnx({ bytes, mediaType, filename, totalDurationSeconds, onProgress, env, notes, }) {
|
|
263
|
+
const onnxPreference = resolveOnnxModelPreference(env);
|
|
264
|
+
if (!onnxPreference)
|
|
265
|
+
return null;
|
|
266
|
+
const onnx = await transcribeWithOnnxCli({
|
|
267
|
+
model: onnxPreference,
|
|
268
|
+
bytes,
|
|
269
|
+
mediaType,
|
|
270
|
+
filename,
|
|
271
|
+
totalDurationSeconds,
|
|
272
|
+
onProgress,
|
|
273
|
+
env,
|
|
274
|
+
});
|
|
275
|
+
if (onnx.text) {
|
|
276
|
+
if (onnx.notes.length > 0)
|
|
277
|
+
notes.push(...onnx.notes);
|
|
278
|
+
return { ...onnx, notes };
|
|
279
|
+
}
|
|
280
|
+
if (onnx.notes.length > 0)
|
|
281
|
+
notes.push(...onnx.notes);
|
|
282
|
+
if (onnx.error) {
|
|
283
|
+
notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
|
|
270
284
|
}
|
|
271
|
-
|
|
285
|
+
return null;
|
|
286
|
+
}
|
|
287
|
+
async function transcribeWithLocalOnnxFile({ filePath, mediaType, totalDurationSeconds, onProgress, env, notes, }) {
|
|
272
288
|
const onnxPreference = resolveOnnxModelPreference(env);
|
|
273
|
-
if (onnxPreference)
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
notes.push(...onnx.notes);
|
|
291
|
-
return { ...onnx, notes };
|
|
292
|
-
}
|
|
289
|
+
if (!onnxPreference)
|
|
290
|
+
return null;
|
|
291
|
+
onProgress?.({
|
|
292
|
+
partIndex: null,
|
|
293
|
+
parts: null,
|
|
294
|
+
processedDurationSeconds: null,
|
|
295
|
+
totalDurationSeconds,
|
|
296
|
+
});
|
|
297
|
+
const onnx = await transcribeWithOnnxCliFile({
|
|
298
|
+
model: onnxPreference,
|
|
299
|
+
filePath,
|
|
300
|
+
mediaType,
|
|
301
|
+
totalDurationSeconds,
|
|
302
|
+
onProgress,
|
|
303
|
+
env,
|
|
304
|
+
});
|
|
305
|
+
if (onnx.text) {
|
|
293
306
|
if (onnx.notes.length > 0)
|
|
294
307
|
notes.push(...onnx.notes);
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
308
|
+
return { ...onnx, notes };
|
|
309
|
+
}
|
|
310
|
+
if (onnx.notes.length > 0)
|
|
311
|
+
notes.push(...onnx.notes);
|
|
312
|
+
if (onnx.error) {
|
|
313
|
+
notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
|
|
298
314
|
}
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
315
|
+
return null;
|
|
316
|
+
}
|
|
317
|
+
async function transcribeWithLocalWhisperBytes({ bytes, mediaType, filename, totalDurationSeconds, onProgress, env, notes, }) {
|
|
318
|
+
const localReady = await isWhisperCppReady(env);
|
|
319
|
+
if (!localReady)
|
|
320
|
+
return null;
|
|
321
|
+
const nameHint = filename?.trim() ? basename(filename.trim()) : "media";
|
|
322
|
+
const tempFile = join(tmpdir(), `summarize-whisper-local-${randomUUID()}-${ensureWhisperFilenameExtension(nameHint, mediaType)}`);
|
|
323
|
+
try {
|
|
324
|
+
await fs.writeFile(tempFile, bytes);
|
|
325
|
+
const result = await safeTranscribeWithWhisperCppFile({
|
|
326
|
+
filePath: tempFile,
|
|
327
|
+
mediaType,
|
|
307
328
|
totalDurationSeconds,
|
|
329
|
+
onProgress,
|
|
330
|
+
env,
|
|
308
331
|
});
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
totalDurationSeconds,
|
|
314
|
-
onProgress,
|
|
315
|
-
});
|
|
316
|
-
}
|
|
317
|
-
catch (error) {
|
|
318
|
-
local = {
|
|
319
|
-
text: null,
|
|
320
|
-
provider: "whisper.cpp",
|
|
321
|
-
error: wrapError("whisper.cpp failed", error),
|
|
322
|
-
notes: [],
|
|
323
|
-
};
|
|
332
|
+
if (result.text) {
|
|
333
|
+
if (result.notes.length > 0)
|
|
334
|
+
notes.push(...result.notes);
|
|
335
|
+
return { ...result, notes };
|
|
324
336
|
}
|
|
325
|
-
if (
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
337
|
+
if (result.notes.length > 0)
|
|
338
|
+
notes.push(...result.notes);
|
|
339
|
+
if (result.error) {
|
|
340
|
+
notes.push(`whisper.cpp failed; falling back to remote Whisper: ${result.error.message}`);
|
|
329
341
|
}
|
|
330
|
-
|
|
331
|
-
notes.push(...local.notes);
|
|
332
|
-
if (local.error) {
|
|
333
|
-
notes.push(`whisper.cpp failed; falling back to remote Whisper: ${local.error.message}`);
|
|
334
|
-
}
|
|
335
|
-
}
|
|
336
|
-
// 4. OpenAI / FAL (cloud fallbacks)
|
|
337
|
-
if (!openaiApiKey && !falApiKey) {
|
|
338
|
-
if (groqError) {
|
|
339
|
-
return {
|
|
340
|
-
text: null,
|
|
341
|
-
provider: "groq",
|
|
342
|
-
error: groqError,
|
|
343
|
-
notes,
|
|
344
|
-
};
|
|
345
|
-
}
|
|
346
|
-
return {
|
|
347
|
-
text: null,
|
|
348
|
-
provider: null,
|
|
349
|
-
error: new Error("No transcription providers available (install whisper-cpp or set GROQ_API_KEY, OPENAI_API_KEY, or FAL_KEY)"),
|
|
350
|
-
notes,
|
|
351
|
-
};
|
|
342
|
+
return null;
|
|
352
343
|
}
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
if (!canChunk) {
|
|
356
|
-
notes.push(`Media too large for Whisper upload (${formatBytes(stat.size)}); install ffmpeg to enable chunked transcription`);
|
|
357
|
-
const head = await readFirstBytes(filePath, MAX_OPENAI_UPLOAD_BYTES);
|
|
358
|
-
const partial = await transcribeMediaWithWhisper({
|
|
359
|
-
bytes: head,
|
|
360
|
-
mediaType,
|
|
361
|
-
filename,
|
|
362
|
-
groqApiKey,
|
|
363
|
-
skipGroq: skipGroqInNestedCalls,
|
|
364
|
-
openaiApiKey,
|
|
365
|
-
falApiKey,
|
|
366
|
-
env,
|
|
367
|
-
});
|
|
368
|
-
if (partial.notes.length > 0)
|
|
369
|
-
notes.push(...partial.notes);
|
|
370
|
-
return { ...partial, notes };
|
|
371
|
-
}
|
|
372
|
-
const dir = await fs.mkdtemp(join(tmpdir(), "summarize-whisper-segments-"));
|
|
373
|
-
try {
|
|
374
|
-
const pattern = join(dir, "part-%03d.mp3");
|
|
375
|
-
await runFfmpegSegment({
|
|
376
|
-
inputPath: filePath,
|
|
377
|
-
outputPattern: pattern,
|
|
378
|
-
segmentSeconds,
|
|
379
|
-
});
|
|
380
|
-
const files = (await fs.readdir(dir))
|
|
381
|
-
.filter((name) => name.startsWith("part-") && name.endsWith(".mp3"))
|
|
382
|
-
.sort((a, b) => a.localeCompare(b));
|
|
383
|
-
if (files.length === 0) {
|
|
384
|
-
return {
|
|
385
|
-
text: null,
|
|
386
|
-
provider: null,
|
|
387
|
-
error: new Error("ffmpeg produced no audio segments"),
|
|
388
|
-
notes,
|
|
389
|
-
};
|
|
390
|
-
}
|
|
391
|
-
notes.push(`ffmpeg chunked media into ${files.length} parts (${segmentSeconds}s each)`);
|
|
392
|
-
onProgress?.({
|
|
393
|
-
partIndex: null,
|
|
394
|
-
parts: files.length,
|
|
395
|
-
processedDurationSeconds: null,
|
|
396
|
-
totalDurationSeconds,
|
|
397
|
-
});
|
|
398
|
-
const parts = [];
|
|
399
|
-
let usedProvider = null;
|
|
400
|
-
for (const [index, name] of files.entries()) {
|
|
401
|
-
const segmentPath = join(dir, name);
|
|
402
|
-
const segmentBytes = new Uint8Array(await fs.readFile(segmentPath));
|
|
403
|
-
const result = await transcribeMediaWithWhisper({
|
|
404
|
-
bytes: segmentBytes,
|
|
405
|
-
mediaType: "audio/mpeg",
|
|
406
|
-
filename: name,
|
|
407
|
-
groqApiKey,
|
|
408
|
-
skipGroq: skipGroqInNestedCalls,
|
|
409
|
-
openaiApiKey,
|
|
410
|
-
falApiKey,
|
|
411
|
-
onProgress: null,
|
|
412
|
-
env,
|
|
413
|
-
});
|
|
414
|
-
if (!usedProvider && result.provider)
|
|
415
|
-
usedProvider = result.provider;
|
|
416
|
-
if (result.error && !result.text) {
|
|
417
|
-
return { text: null, provider: usedProvider, error: result.error, notes };
|
|
418
|
-
}
|
|
419
|
-
if (result.text)
|
|
420
|
-
parts.push(result.text);
|
|
421
|
-
// Coarse but useful: update based on part boundaries. Duration is best-effort (RSS hints or
|
|
422
|
-
// ffprobe); the per-part time is stable enough to make the spinner feel alive.
|
|
423
|
-
const processedSeconds = Math.max(0, (index + 1) * segmentSeconds);
|
|
424
|
-
onProgress?.({
|
|
425
|
-
partIndex: index + 1,
|
|
426
|
-
parts: files.length,
|
|
427
|
-
processedDurationSeconds: typeof totalDurationSeconds === "number" && totalDurationSeconds > 0
|
|
428
|
-
? Math.min(processedSeconds, totalDurationSeconds)
|
|
429
|
-
: null,
|
|
430
|
-
totalDurationSeconds,
|
|
431
|
-
});
|
|
432
|
-
}
|
|
433
|
-
return { text: parts.join("\n\n"), provider: usedProvider, error: null, notes };
|
|
434
|
-
}
|
|
435
|
-
finally {
|
|
436
|
-
await fs.rm(dir, { recursive: true, force: true }).catch(() => { });
|
|
437
|
-
}
|
|
344
|
+
finally {
|
|
345
|
+
await fs.unlink(tempFile).catch(() => { });
|
|
438
346
|
}
|
|
439
|
-
|
|
347
|
+
}
|
|
348
|
+
async function transcribeWithLocalWhisperFile({ filePath, mediaType, totalDurationSeconds, onProgress, env, notes, }) {
|
|
349
|
+
const localReady = await isWhisperCppReady(env);
|
|
350
|
+
if (!localReady)
|
|
351
|
+
return null;
|
|
440
352
|
onProgress?.({
|
|
441
353
|
partIndex: null,
|
|
442
354
|
parts: null,
|
|
443
355
|
processedDurationSeconds: null,
|
|
444
356
|
totalDurationSeconds,
|
|
445
357
|
});
|
|
446
|
-
const result = await
|
|
447
|
-
|
|
358
|
+
const result = await safeTranscribeWithWhisperCppFile({
|
|
359
|
+
filePath,
|
|
448
360
|
mediaType,
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
skipGroq: skipGroqInNestedCalls,
|
|
452
|
-
openaiApiKey,
|
|
453
|
-
falApiKey,
|
|
361
|
+
totalDurationSeconds,
|
|
362
|
+
onProgress,
|
|
454
363
|
env,
|
|
455
364
|
});
|
|
365
|
+
if (result.text) {
|
|
366
|
+
if (result.notes.length > 0)
|
|
367
|
+
notes.push(...result.notes);
|
|
368
|
+
return { ...result, notes };
|
|
369
|
+
}
|
|
456
370
|
if (result.notes.length > 0)
|
|
457
371
|
notes.push(...result.notes);
|
|
458
|
-
|
|
372
|
+
if (result.error) {
|
|
373
|
+
notes.push(`whisper.cpp failed; falling back to remote Whisper: ${result.error.message}`);
|
|
374
|
+
}
|
|
375
|
+
return null;
|
|
376
|
+
}
|
|
377
|
+
async function safeTranscribeWithWhisperCppFile(args) {
|
|
378
|
+
try {
|
|
379
|
+
return await transcribeWithWhisperCppFile(args);
|
|
380
|
+
}
|
|
381
|
+
catch (error) {
|
|
382
|
+
return {
|
|
383
|
+
text: null,
|
|
384
|
+
provider: "whisper.cpp",
|
|
385
|
+
error: wrapError("whisper.cpp failed", error),
|
|
386
|
+
notes: [],
|
|
387
|
+
};
|
|
388
|
+
}
|
|
459
389
|
}
|
|
460
390
|
//# sourceMappingURL=core.js.map
|