@steipete/summarize-core 0.11.1 → 0.13.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/dist/esm/content/bun.js +21 -0
  2. package/dist/esm/content/bun.js.map +1 -0
  3. package/dist/esm/content/direct-media.js +100 -0
  4. package/dist/esm/content/direct-media.js.map +1 -0
  5. package/dist/esm/content/index.js +2 -1
  6. package/dist/esm/content/index.js.map +1 -1
  7. package/dist/esm/content/link-preview/client.js +6 -0
  8. package/dist/esm/content/link-preview/client.js.map +1 -1
  9. package/dist/esm/content/link-preview/content/fetcher.js +19 -2
  10. package/dist/esm/content/link-preview/content/fetcher.js.map +1 -1
  11. package/dist/esm/content/link-preview/content/firecrawl.js.map +1 -1
  12. package/dist/esm/content/link-preview/content/html.js.map +1 -1
  13. package/dist/esm/content/link-preview/content/index.js +29 -12
  14. package/dist/esm/content/link-preview/content/index.js.map +1 -1
  15. package/dist/esm/content/link-preview/content/utils.js.map +1 -1
  16. package/dist/esm/content/link-preview/content/video.js +1 -1
  17. package/dist/esm/content/link-preview/content/video.js.map +1 -1
  18. package/dist/esm/content/local-file.js +58 -0
  19. package/dist/esm/content/local-file.js.map +1 -0
  20. package/dist/esm/content/transcript/index.js +2 -0
  21. package/dist/esm/content/transcript/index.js.map +1 -1
  22. package/dist/esm/content/transcript/providers/generic-direct-media.js +47 -0
  23. package/dist/esm/content/transcript/providers/generic-direct-media.js.map +1 -0
  24. package/dist/esm/content/transcript/providers/generic-embedded.js +126 -0
  25. package/dist/esm/content/transcript/providers/generic-embedded.js.map +1 -0
  26. package/dist/esm/content/transcript/providers/generic-twitter.js +78 -0
  27. package/dist/esm/content/transcript/providers/generic-twitter.js.map +1 -0
  28. package/dist/esm/content/transcript/providers/generic.js +12 -248
  29. package/dist/esm/content/transcript/providers/generic.js.map +1 -1
  30. package/dist/esm/content/transcript/providers/podcast/apple-flow.js.map +1 -1
  31. package/dist/esm/content/transcript/providers/podcast/media.js +9 -1
  32. package/dist/esm/content/transcript/providers/podcast/media.js.map +1 -1
  33. package/dist/esm/content/transcript/providers/podcast/provider-flow.js +157 -0
  34. package/dist/esm/content/transcript/providers/podcast/provider-flow.js.map +1 -0
  35. package/dist/esm/content/transcript/providers/podcast/rss-feed.js +123 -0
  36. package/dist/esm/content/transcript/providers/podcast/rss-feed.js.map +1 -0
  37. package/dist/esm/content/transcript/providers/podcast/rss-transcript.js +113 -0
  38. package/dist/esm/content/transcript/providers/podcast/rss-transcript.js.map +1 -0
  39. package/dist/esm/content/transcript/providers/podcast/rss.js +2 -226
  40. package/dist/esm/content/transcript/providers/podcast/rss.js.map +1 -1
  41. package/dist/esm/content/transcript/providers/podcast/spotify-flow.js.map +1 -1
  42. package/dist/esm/content/transcript/providers/podcast.js +26 -155
  43. package/dist/esm/content/transcript/providers/podcast.js.map +1 -1
  44. package/dist/esm/content/transcript/providers/transcription-capability.js +22 -0
  45. package/dist/esm/content/transcript/providers/transcription-capability.js.map +1 -0
  46. package/dist/esm/content/transcript/providers/transcription-start.js +43 -32
  47. package/dist/esm/content/transcript/providers/transcription-start.js.map +1 -1
  48. package/dist/esm/content/transcript/providers/youtube/api.js +3 -2
  49. package/dist/esm/content/transcript/providers/youtube/api.js.map +1 -1
  50. package/dist/esm/content/transcript/providers/youtube/captions-player.js +173 -0
  51. package/dist/esm/content/transcript/providers/youtube/captions-player.js.map +1 -0
  52. package/dist/esm/content/transcript/providers/youtube/captions-shared.js +8 -0
  53. package/dist/esm/content/transcript/providers/youtube/captions-shared.js.map +1 -0
  54. package/dist/esm/content/transcript/providers/youtube/captions-transcript.js +361 -0
  55. package/dist/esm/content/transcript/providers/youtube/captions-transcript.js.map +1 -0
  56. package/dist/esm/content/transcript/providers/youtube/captions.js +2 -557
  57. package/dist/esm/content/transcript/providers/youtube/captions.js.map +1 -1
  58. package/dist/esm/content/transcript/providers/youtube/provider-flow.js +217 -0
  59. package/dist/esm/content/transcript/providers/youtube/provider-flow.js.map +1 -0
  60. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js +33 -9
  61. package/dist/esm/content/transcript/providers/youtube/yt-dlp.js.map +1 -1
  62. package/dist/esm/content/transcript/providers/youtube.js +42 -194
  63. package/dist/esm/content/transcript/providers/youtube.js.map +1 -1
  64. package/dist/esm/content/transcript/transcription-config.js +24 -4
  65. package/dist/esm/content/transcript/transcription-config.js.map +1 -1
  66. package/dist/esm/content/url.js +5 -33
  67. package/dist/esm/content/url.js.map +1 -1
  68. package/dist/esm/processes.js.map +1 -1
  69. package/dist/esm/prompts/format.js +6 -0
  70. package/dist/esm/prompts/format.js.map +1 -1
  71. package/dist/esm/prompts/link-summary.js +27 -3
  72. package/dist/esm/prompts/link-summary.js.map +1 -1
  73. package/dist/esm/transcription/onnx-cli.js.map +1 -1
  74. package/dist/esm/transcription/whisper/assemblyai.js +132 -0
  75. package/dist/esm/transcription/whisper/assemblyai.js.map +1 -0
  76. package/dist/esm/transcription/whisper/chunking.js +64 -0
  77. package/dist/esm/transcription/whisper/chunking.js.map +1 -0
  78. package/dist/esm/transcription/whisper/cloud-providers.js +69 -0
  79. package/dist/esm/transcription/whisper/cloud-providers.js.map +1 -0
  80. package/dist/esm/transcription/whisper/core.js +320 -390
  81. package/dist/esm/transcription/whisper/core.js.map +1 -1
  82. package/dist/esm/transcription/whisper/gemini.js +324 -0
  83. package/dist/esm/transcription/whisper/gemini.js.map +1 -0
  84. package/dist/esm/transcription/whisper/groq.js +62 -1
  85. package/dist/esm/transcription/whisper/groq.js.map +1 -1
  86. package/dist/esm/transcription/whisper/preferences.js +16 -0
  87. package/dist/esm/transcription/whisper/preferences.js.map +1 -0
  88. package/dist/esm/transcription/whisper/provider-setup.js +62 -0
  89. package/dist/esm/transcription/whisper/provider-setup.js.map +1 -0
  90. package/dist/esm/transcription/whisper/remote-provider-attempts.js +189 -0
  91. package/dist/esm/transcription/whisper/remote-provider-attempts.js.map +1 -0
  92. package/dist/esm/transcription/whisper/remote.js +220 -0
  93. package/dist/esm/transcription/whisper/remote.js.map +1 -0
  94. package/dist/esm/transcription/whisper/whisper-cpp.js +21 -18
  95. package/dist/esm/transcription/whisper/whisper-cpp.js.map +1 -1
  96. package/dist/types/content/bun.d.ts +6 -0
  97. package/dist/types/content/direct-media.d.ts +9 -0
  98. package/dist/types/content/index.d.ts +2 -1
  99. package/dist/types/content/link-preview/client.d.ts +3 -1
  100. package/dist/types/content/link-preview/content/fetcher.d.ts +1 -1
  101. package/dist/types/content/link-preview/content/html.d.ts +1 -1
  102. package/dist/types/content/link-preview/deps.d.ts +8 -2
  103. package/dist/types/content/link-preview/types.d.ts +1 -1
  104. package/dist/types/content/local-file.d.ts +16 -0
  105. package/dist/types/content/transcript/providers/generic-direct-media.d.ts +11 -0
  106. package/dist/types/content/transcript/providers/generic-embedded.d.ts +16 -0
  107. package/dist/types/content/transcript/providers/generic-twitter.d.ts +11 -0
  108. package/dist/types/content/transcript/providers/podcast/flow-context.d.ts +3 -0
  109. package/dist/types/content/transcript/providers/podcast/media.d.ts +4 -2
  110. package/dist/types/content/transcript/providers/podcast/provider-flow.d.ts +7 -0
  111. package/dist/types/content/transcript/providers/podcast/rss-feed.d.ts +15 -0
  112. package/dist/types/content/transcript/providers/podcast/rss-transcript.d.ts +12 -0
  113. package/dist/types/content/transcript/providers/podcast/rss.d.ts +2 -24
  114. package/dist/types/content/transcript/providers/transcription-capability.d.ts +18 -0
  115. package/dist/types/content/transcript/providers/transcription-start.d.ts +11 -3
  116. package/dist/types/content/transcript/providers/youtube/captions-player.d.ts +12 -0
  117. package/dist/types/content/transcript/providers/youtube/captions-shared.d.ts +42 -0
  118. package/dist/types/content/transcript/providers/youtube/captions-transcript.d.ts +4 -0
  119. package/dist/types/content/transcript/providers/youtube/captions.d.ts +2 -19
  120. package/dist/types/content/transcript/providers/youtube/provider-flow.d.ts +34 -0
  121. package/dist/types/content/transcript/providers/youtube/yt-dlp.d.ts +4 -2
  122. package/dist/types/content/transcript/transcription-config.d.ts +6 -0
  123. package/dist/types/content/transcript/types.d.ts +1 -0
  124. package/dist/types/content/url.d.ts +2 -3
  125. package/dist/types/prompts/format.d.ts +1 -0
  126. package/dist/types/prompts/link-summary.d.ts +2 -1
  127. package/dist/types/transcription/whisper/assemblyai.d.ts +17 -0
  128. package/dist/types/transcription/whisper/chunking.d.ts +11 -0
  129. package/dist/types/transcription/whisper/cloud-providers.d.ts +22 -0
  130. package/dist/types/transcription/whisper/core.d.ts +12 -14
  131. package/dist/types/transcription/whisper/gemini.d.ts +14 -0
  132. package/dist/types/transcription/whisper/preferences.d.ts +4 -0
  133. package/dist/types/transcription/whisper/provider-setup.d.ts +30 -0
  134. package/dist/types/transcription/whisper/remote-provider-attempts.d.ts +51 -0
  135. package/dist/types/transcription/whisper/remote.d.ts +51 -0
  136. package/dist/types/transcription/whisper/types.d.ts +1 -1
  137. package/dist/types/transcription/whisper/whisper-cpp.d.ts +4 -3
  138. package/package.json +14 -10
@@ -2,459 +2,389 @@ import { randomUUID } from "node:crypto";
2
2
  import { promises as fs } from "node:fs";
3
3
  import { tmpdir } from "node:os";
4
4
  import { basename, join } from "node:path";
5
- import { resolvePreferredOnnxModel, transcribeWithOnnxCli, transcribeWithOnnxCliFile, } from "../onnx-cli.js";
5
+ import { transcribeWithOnnxCli, transcribeWithOnnxCliFile } from "../onnx-cli.js";
6
+ import { transcribeChunkedFile } from "./chunking.js";
6
7
  import { DEFAULT_SEGMENT_SECONDS, MAX_OPENAI_UPLOAD_BYTES } from "./constants.js";
7
- import { transcribeWithFal } from "./fal.js";
8
- import { isFfmpegAvailable, runFfmpegSegment, transcodeBytesToMp3 } from "./ffmpeg.js";
8
+ import { isFfmpegAvailable, transcodeBytesToMp3 } from "./ffmpeg.js";
9
9
  import { shouldRetryGroqViaFfmpeg, transcribeWithGroq } from "./groq.js";
10
- import { shouldRetryOpenAiViaFfmpeg, transcribeWithOpenAi } from "./openai.js";
11
- import { ensureWhisperFilenameExtension, formatBytes, readFirstBytes, wrapError } from "./utils.js";
10
+ import { resolveOnnxModelPreference } from "./preferences.js";
11
+ import { transcribeBytesWithRemoteFallbacks, transcribeFileWithRemoteFallbacks, transcribeOversizedBytesViaTempFile, } from "./remote.js";
12
+ import { ensureWhisperFilenameExtension, formatBytes, wrapError } from "./utils.js";
12
13
  import { isWhisperCppReady, transcribeWithWhisperCppFile } from "./whisper-cpp.js";
13
- function resolveTranscriberPreference(env) {
14
- const raw = env.SUMMARIZE_TRANSCRIBER?.trim().toLowerCase();
15
- if (raw === "auto" || raw === "whisper" || raw === "parakeet" || raw === "canary")
16
- return raw;
17
- return "auto";
18
- }
19
- function resolveOnnxModelPreference(env) {
20
- const preference = resolveTranscriberPreference(env);
21
- if (preference === "parakeet" || preference === "canary")
22
- return preference;
23
- if (preference === "auto")
24
- return resolvePreferredOnnxModel(env);
25
- return null;
26
- }
27
- export async function transcribeMediaWithWhisper({ bytes, mediaType, filename, groqApiKey, skipGroq = false, openaiApiKey, falApiKey, totalDurationSeconds = null, onProgress, env = process.env, }) {
14
+ export async function transcribeMediaWithWhisper({ bytes, mediaType, filename, groqApiKey, skipGroq = false, assemblyaiApiKey = null, geminiApiKey = null, openaiApiKey, falApiKey, totalDurationSeconds = null, onProgress, env = process.env, }) {
28
15
  const notes = [];
29
- // 1. Groq (cloud, free, fastest)
30
16
  let groqError = null;
31
17
  if (groqApiKey && !skipGroq) {
32
- try {
33
- const text = await transcribeWithGroq(bytes, mediaType, filename, groqApiKey);
34
- if (text) {
35
- return { text, provider: "groq", error: null, notes };
36
- }
37
- groqError = new Error("Groq transcription returned empty text");
38
- }
39
- catch (error) {
40
- groqError = wrapError("Groq transcription failed", error);
41
- }
42
- }
43
- if (!skipGroq && groqApiKey && groqError && shouldRetryGroqViaFfmpeg(groqError)) {
44
- const canTranscode = await isFfmpegAvailable();
45
- if (canTranscode) {
46
- try {
47
- notes.push("Groq could not decode media; transcoding via ffmpeg and retrying");
48
- const mp3Bytes = await transcodeBytesToMp3(bytes);
49
- const retried = await transcribeWithGroq(mp3Bytes, "audio/mpeg", "audio.mp3", groqApiKey);
50
- if (retried) {
51
- return { text: retried, provider: "groq", error: null, notes };
52
- }
53
- groqError = new Error("Groq transcription returned empty text after ffmpeg transcode");
54
- bytes = mp3Bytes;
55
- mediaType = "audio/mpeg";
56
- filename = "audio.mp3";
57
- }
58
- catch (error) {
59
- notes.push(`ffmpeg transcode failed; cannot retry Groq decode error: ${error instanceof Error ? error.message : String(error)}`);
60
- }
61
- }
62
- else {
63
- notes.push("Groq could not decode media; install ffmpeg to enable transcoding retry");
18
+ const groqResult = await transcribeWithGroqFirst({
19
+ bytes,
20
+ mediaType,
21
+ filename,
22
+ groqApiKey,
23
+ notes,
24
+ });
25
+ bytes = groqResult.bytes;
26
+ mediaType = groqResult.mediaType;
27
+ filename = groqResult.filename;
28
+ if (groqResult.text) {
29
+ return { text: groqResult.text, provider: "groq", error: null, notes };
64
30
  }
31
+ groqError = groqResult.error;
65
32
  }
66
33
  if (groqError) {
67
- notes.push(`Groq transcription failed; falling back to local/OpenAI: ${groqError.message}`);
34
+ notes.push(`Groq transcription failed; falling back to local/AssemblyAI/Gemini/OpenAI: ${groqError.message}`);
68
35
  }
69
- // 2. ONNX (local)
70
- const onnxPreference = resolveOnnxModelPreference(env);
71
- if (onnxPreference) {
72
- const onnx = await transcribeWithOnnxCli({
73
- model: onnxPreference,
36
+ const onnx = await transcribeWithLocalOnnx({
37
+ bytes,
38
+ mediaType,
39
+ filename,
40
+ totalDurationSeconds,
41
+ onProgress,
42
+ env,
43
+ notes,
44
+ });
45
+ if (onnx)
46
+ return onnx;
47
+ const local = await transcribeWithLocalWhisperBytes({
48
+ bytes,
49
+ mediaType,
50
+ filename,
51
+ totalDurationSeconds,
52
+ onProgress,
53
+ env,
54
+ notes,
55
+ });
56
+ if (local)
57
+ return local;
58
+ return await transcribeBytesWithRemoteFallbacks({
59
+ bytes,
60
+ mediaType,
61
+ filename,
62
+ notes,
63
+ groqApiKey,
64
+ groqError,
65
+ assemblyaiApiKey,
66
+ geminiApiKey,
67
+ openaiApiKey,
68
+ falApiKey,
69
+ env,
70
+ onProgress,
71
+ transcribeOversizedBytesWithChunking: ({ bytes, mediaType, filename, onProgress }) => transcribeOversizedBytesViaTempFile({
74
72
  bytes,
75
73
  mediaType,
76
74
  filename,
75
+ onProgress,
76
+ transcribeFile: ({ filePath, mediaType, filename, onProgress }) => transcribeMediaFileWithWhisper({
77
+ filePath,
78
+ mediaType,
79
+ filename,
80
+ groqApiKey,
81
+ assemblyaiApiKey,
82
+ geminiApiKey,
83
+ openaiApiKey,
84
+ falApiKey,
85
+ segmentSeconds: DEFAULT_SEGMENT_SECONDS,
86
+ onProgress,
87
+ env,
88
+ }),
89
+ }),
90
+ });
91
+ }
92
+ export async function transcribeMediaFileWithWhisper({ filePath, mediaType, filename, groqApiKey, assemblyaiApiKey = null, geminiApiKey = null, openaiApiKey, falApiKey, segmentSeconds = DEFAULT_SEGMENT_SECONDS, totalDurationSeconds = null, onProgress = null, env = process.env, }) {
93
+ const notes = [];
94
+ let skipGroqInNestedCalls = false;
95
+ let groqError = null;
96
+ if (groqApiKey) {
97
+ skipGroqInNestedCalls = true;
98
+ const groqResult = await transcribeGroqFileFirst({
99
+ filePath,
100
+ mediaType,
101
+ filename,
102
+ groqApiKey,
103
+ assemblyaiApiKey,
104
+ geminiApiKey,
105
+ openaiApiKey,
106
+ falApiKey,
107
+ segmentSeconds,
77
108
  totalDurationSeconds,
78
109
  onProgress,
79
110
  env,
80
- });
81
- if (onnx.text) {
82
- if (onnx.notes.length > 0)
83
- notes.push(...onnx.notes);
84
- return { ...onnx, notes };
85
- }
86
- if (onnx.notes.length > 0)
87
- notes.push(...onnx.notes);
88
- if (onnx.error) {
89
- notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
90
- }
91
- }
92
- // 3. whisper.cpp (local)
93
- const localReady = await isWhisperCppReady();
94
- let local = null;
95
- if (localReady) {
96
- const nameHint = filename?.trim() ? basename(filename.trim()) : "media";
97
- const tempFile = join(tmpdir(), `summarize-whisper-local-${randomUUID()}-${ensureWhisperFilenameExtension(nameHint, mediaType)}`);
98
- try {
99
- await fs.writeFile(tempFile, bytes);
100
- try {
101
- local = await transcribeWithWhisperCppFile({
102
- filePath: tempFile,
103
- mediaType,
104
- totalDurationSeconds,
105
- onProgress,
106
- });
107
- }
108
- catch (error) {
109
- local = {
110
- text: null,
111
- provider: "whisper.cpp",
112
- error: wrapError("whisper.cpp failed", error),
113
- notes: [],
114
- };
115
- }
116
- if (local.text) {
117
- if (local.notes.length > 0)
118
- notes.push(...local.notes);
119
- return { ...local, notes };
120
- }
121
- if (local.notes.length > 0)
122
- notes.push(...local.notes);
123
- if (local.error) {
124
- notes.push(`whisper.cpp failed; falling back to remote Whisper: ${local.error.message}`);
125
- }
126
- }
127
- finally {
128
- await fs.unlink(tempFile).catch(() => { });
129
- }
130
- }
131
- // 4. OpenAI / FAL (cloud fallbacks)
132
- if (!groqApiKey && !openaiApiKey && !falApiKey) {
133
- return {
134
- text: null,
135
- provider: null,
136
- error: new Error("No transcription providers available (install whisper-cpp or set GROQ_API_KEY, OPENAI_API_KEY, or FAL_KEY)"),
137
111
  notes,
138
- };
112
+ });
113
+ if (groqResult.text)
114
+ return groqResult;
115
+ groqError = groqResult.error;
139
116
  }
140
- if (openaiApiKey && bytes.byteLength > MAX_OPENAI_UPLOAD_BYTES) {
141
- const canChunk = await isFfmpegAvailable();
142
- if (canChunk) {
143
- const tempFile = join(tmpdir(), `summarize-whisper-${randomUUID()}`);
144
- try {
145
- await fs.writeFile(tempFile, bytes);
146
- const chunked = await transcribeMediaFileWithWhisper({
147
- filePath: tempFile,
148
- mediaType,
149
- filename,
150
- groqApiKey,
151
- openaiApiKey,
152
- falApiKey,
153
- segmentSeconds: DEFAULT_SEGMENT_SECONDS,
154
- onProgress,
155
- env,
156
- });
157
- return chunked;
158
- }
159
- finally {
160
- await fs.unlink(tempFile).catch(() => { });
161
- }
162
- }
163
- notes.push(`Media too large for Whisper upload (${formatBytes(bytes.byteLength)}); transcribing first ${formatBytes(MAX_OPENAI_UPLOAD_BYTES)} only (install ffmpeg for full transcription)`);
164
- bytes = bytes.slice(0, MAX_OPENAI_UPLOAD_BYTES);
117
+ const onnx = await transcribeWithLocalOnnxFile({
118
+ filePath,
119
+ mediaType,
120
+ totalDurationSeconds,
121
+ onProgress,
122
+ env,
123
+ notes,
124
+ });
125
+ if (onnx)
126
+ return onnx;
127
+ const local = await transcribeWithLocalWhisperFile({
128
+ filePath,
129
+ mediaType,
130
+ totalDurationSeconds,
131
+ onProgress,
132
+ env,
133
+ notes,
134
+ });
135
+ if (local)
136
+ return local;
137
+ return await transcribeFileWithRemoteFallbacks({
138
+ filePath,
139
+ mediaType,
140
+ filename,
141
+ notes,
142
+ groqApiKey,
143
+ groqError,
144
+ assemblyaiApiKey,
145
+ geminiApiKey,
146
+ openaiApiKey,
147
+ falApiKey,
148
+ env,
149
+ totalDurationSeconds,
150
+ onProgress,
151
+ transcribeChunkedFile: ({ filePath, segmentSeconds, totalDurationSeconds, onProgress }) => transcribeChunkedFile({
152
+ filePath,
153
+ segmentSeconds,
154
+ totalDurationSeconds,
155
+ onProgress,
156
+ transcribeSegment: ({ bytes, filename }) => transcribeMediaWithWhisper({
157
+ bytes,
158
+ mediaType: "audio/mpeg",
159
+ filename,
160
+ groqApiKey,
161
+ skipGroq: skipGroqInNestedCalls,
162
+ assemblyaiApiKey,
163
+ geminiApiKey,
164
+ openaiApiKey,
165
+ falApiKey,
166
+ env,
167
+ }),
168
+ }),
169
+ });
170
+ }
171
+ async function transcribeWithGroqFirst({ bytes, mediaType, filename, groqApiKey, notes, }) {
172
+ let groqError = null;
173
+ try {
174
+ const text = await transcribeWithGroq(bytes, mediaType, filename, groqApiKey);
175
+ if (text)
176
+ return { text, error: null, bytes, mediaType, filename };
177
+ groqError = new Error("Groq transcription returned empty text");
165
178
  }
166
- let openaiError = null;
167
- if (openaiApiKey) {
168
- try {
169
- const text = await transcribeWithOpenAi(bytes, mediaType, filename, openaiApiKey, { env });
170
- if (text) {
171
- return { text, provider: "openai", error: null, notes };
172
- }
173
- openaiError = new Error("OpenAI transcription returned empty text");
174
- }
175
- catch (error) {
176
- openaiError = wrapError("OpenAI transcription failed", error);
177
- }
179
+ catch (error) {
180
+ groqError = wrapError("Groq transcription failed", error);
178
181
  }
179
- if (openaiApiKey && openaiError && shouldRetryOpenAiViaFfmpeg(openaiError)) {
182
+ if (groqError && shouldRetryGroqViaFfmpeg(groqError)) {
180
183
  const canTranscode = await isFfmpegAvailable();
181
184
  if (canTranscode) {
182
185
  try {
183
- // Some providers hand out containers/codecs Whisper rejects. Transcoding to a small mono MP3
184
- // is the most reliable cross-format fallback (and also reduces upload size).
185
- notes.push("OpenAI could not decode media; transcoding via ffmpeg and retrying");
186
+ notes.push("Groq could not decode media; transcoding via ffmpeg and retrying");
186
187
  const mp3Bytes = await transcodeBytesToMp3(bytes);
187
- const retried = await transcribeWithOpenAi(mp3Bytes, "audio/mpeg", "audio.mp3", openaiApiKey, { env });
188
+ const retried = await transcribeWithGroq(mp3Bytes, "audio/mpeg", "audio.mp3", groqApiKey);
188
189
  if (retried) {
189
- return { text: retried, provider: "openai", error: null, notes };
190
+ return {
191
+ text: retried,
192
+ error: null,
193
+ bytes: mp3Bytes,
194
+ mediaType: "audio/mpeg",
195
+ filename: "audio.mp3",
196
+ };
190
197
  }
191
- openaiError = new Error("OpenAI transcription returned empty text after ffmpeg transcode");
198
+ groqError = new Error("Groq transcription returned empty text after ffmpeg transcode");
192
199
  bytes = mp3Bytes;
193
200
  mediaType = "audio/mpeg";
194
201
  filename = "audio.mp3";
195
202
  }
196
203
  catch (error) {
197
- notes.push(`ffmpeg transcode failed; cannot retry OpenAI decode error: ${error instanceof Error ? error.message : String(error)}`);
204
+ notes.push(`ffmpeg transcode failed; cannot retry Groq decode error: ${error instanceof Error ? error.message : String(error)}`);
198
205
  }
199
206
  }
200
207
  else {
201
- notes.push("OpenAI could not decode media; install ffmpeg to enable transcoding retry");
208
+ notes.push("Groq could not decode media; install ffmpeg to enable transcoding retry");
202
209
  }
203
210
  }
204
- const canUseFal = Boolean(falApiKey) && mediaType.toLowerCase().startsWith("audio/");
205
- if (openaiError && canUseFal) {
206
- notes.push(`OpenAI transcription failed; falling back to FAL: ${openaiError.message}`);
207
- }
208
- if (falApiKey && !canUseFal) {
209
- notes.push(`Skipping FAL transcription: unsupported mediaType ${mediaType}`);
210
- }
211
- if (falApiKey && canUseFal) {
211
+ return { text: null, error: groqError, bytes, mediaType, filename };
212
+ }
213
+ async function transcribeGroqFileFirst({ filePath, mediaType, filename, groqApiKey, assemblyaiApiKey, geminiApiKey, openaiApiKey, falApiKey, segmentSeconds, totalDurationSeconds, onProgress, env, notes, }) {
214
+ const stat = await fs.stat(filePath);
215
+ if (stat.size <= MAX_OPENAI_UPLOAD_BYTES) {
216
+ const fileBytes = new Uint8Array(await fs.readFile(filePath));
212
217
  try {
213
- const text = await transcribeWithFal(bytes, mediaType, falApiKey);
214
- if (text) {
215
- return { text, provider: "fal", error: null, notes };
216
- }
217
- return {
218
- text: null,
219
- provider: "fal",
220
- error: new Error("FAL transcription returned empty text"),
221
- notes,
222
- };
218
+ const text = await transcribeWithGroq(fileBytes, mediaType, filename, groqApiKey);
219
+ if (text)
220
+ return { text, provider: "groq", error: null, notes };
221
+ const error = new Error("Groq transcription returned empty text");
222
+ notes.push("Groq transcription returned empty text; falling back to local/AssemblyAI/Gemini/OpenAI");
223
+ return { text: null, provider: "groq", error, notes };
223
224
  }
224
225
  catch (error) {
225
- return {
226
- text: null,
227
- provider: "fal",
228
- error: wrapError("FAL transcription failed", error),
229
- notes,
230
- };
226
+ const wrapped = wrapError("Groq transcription failed", error);
227
+ notes.push(`Groq transcription failed; falling back to local/AssemblyAI/Gemini/OpenAI: ${error instanceof Error ? error.message : String(error)}`);
228
+ return { text: null, provider: "groq", error: wrapped, notes };
231
229
  }
232
230
  }
233
- const terminalError = openaiError ?? groqError ?? new Error("No transcription providers available");
234
- const terminalProvider = openaiError
235
- ? "openai"
236
- : groqError
237
- ? "groq"
238
- : openaiApiKey
239
- ? "openai"
240
- : null;
241
- return { text: null, provider: terminalProvider, error: terminalError, notes };
231
+ const canChunk = await isFfmpegAvailable();
232
+ if (!canChunk) {
233
+ const error = new Error(`File too large for Groq upload (${formatBytes(stat.size)}); trying local providers`);
234
+ notes.push(error.message);
235
+ return { text: null, provider: "groq", error, notes };
236
+ }
237
+ const chunked = await transcribeChunkedFile({
238
+ filePath,
239
+ segmentSeconds,
240
+ totalDurationSeconds,
241
+ onProgress,
242
+ transcribeSegment: ({ bytes, filename }) => transcribeMediaWithWhisper({
243
+ bytes,
244
+ mediaType: "audio/mpeg",
245
+ filename,
246
+ groqApiKey,
247
+ assemblyaiApiKey,
248
+ geminiApiKey,
249
+ openaiApiKey,
250
+ falApiKey,
251
+ env,
252
+ }),
253
+ });
254
+ if (chunked.notes.length > 0)
255
+ notes.push(...chunked.notes);
256
+ if (chunked.text)
257
+ return { ...chunked, notes };
258
+ const error = chunked.error ?? new Error("Groq chunked transcription failed");
259
+ notes.push(`Groq chunked transcription failed; falling back to local/AssemblyAI/Gemini/OpenAI: ${error.message}`);
260
+ return { text: null, provider: "groq", error, notes };
242
261
  }
243
- export async function transcribeMediaFileWithWhisper({ filePath, mediaType, filename, groqApiKey, openaiApiKey, falApiKey, segmentSeconds = DEFAULT_SEGMENT_SECONDS, totalDurationSeconds = null, onProgress = null, env = process.env, }) {
244
- const notes = [];
245
- const stat = await fs.stat(filePath);
246
- let skipGroqInNestedCalls = false;
247
- let groqError = null;
248
- // 1. Groq (cloud, free, fastest) — try first even for file-based transcription
249
- if (groqApiKey) {
250
- skipGroqInNestedCalls = true;
251
- if (stat.size <= MAX_OPENAI_UPLOAD_BYTES) {
252
- const fileBytes = new Uint8Array(await fs.readFile(filePath));
253
- try {
254
- const text = await transcribeWithGroq(fileBytes, mediaType, filename, groqApiKey);
255
- if (text) {
256
- return { text, provider: "groq", error: null, notes };
257
- }
258
- groqError = new Error("Groq transcription returned empty text");
259
- notes.push("Groq transcription returned empty text; falling back to local/OpenAI");
260
- }
261
- catch (error) {
262
- groqError = wrapError("Groq transcription failed", error);
263
- notes.push(`Groq transcription failed; falling back to local/OpenAI: ${error instanceof Error ? error.message : String(error)}`);
264
- }
265
- }
266
- else {
267
- groqError = new Error(`File too large for Groq upload (${formatBytes(stat.size)}); trying local providers`);
268
- notes.push(groqError.message);
269
- }
262
+ async function transcribeWithLocalOnnx({ bytes, mediaType, filename, totalDurationSeconds, onProgress, env, notes, }) {
263
+ const onnxPreference = resolveOnnxModelPreference(env);
264
+ if (!onnxPreference)
265
+ return null;
266
+ const onnx = await transcribeWithOnnxCli({
267
+ model: onnxPreference,
268
+ bytes,
269
+ mediaType,
270
+ filename,
271
+ totalDurationSeconds,
272
+ onProgress,
273
+ env,
274
+ });
275
+ if (onnx.text) {
276
+ if (onnx.notes.length > 0)
277
+ notes.push(...onnx.notes);
278
+ return { ...onnx, notes };
279
+ }
280
+ if (onnx.notes.length > 0)
281
+ notes.push(...onnx.notes);
282
+ if (onnx.error) {
283
+ notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
270
284
  }
271
- // 2. ONNX (local)
285
+ return null;
286
+ }
287
+ async function transcribeWithLocalOnnxFile({ filePath, mediaType, totalDurationSeconds, onProgress, env, notes, }) {
272
288
  const onnxPreference = resolveOnnxModelPreference(env);
273
- if (onnxPreference) {
274
- onProgress?.({
275
- partIndex: null,
276
- parts: null,
277
- processedDurationSeconds: null,
278
- totalDurationSeconds,
279
- });
280
- const onnx = await transcribeWithOnnxCliFile({
281
- model: onnxPreference,
282
- filePath,
283
- mediaType,
284
- totalDurationSeconds,
285
- onProgress,
286
- env,
287
- });
288
- if (onnx.text) {
289
- if (onnx.notes.length > 0)
290
- notes.push(...onnx.notes);
291
- return { ...onnx, notes };
292
- }
289
+ if (!onnxPreference)
290
+ return null;
291
+ onProgress?.({
292
+ partIndex: null,
293
+ parts: null,
294
+ processedDurationSeconds: null,
295
+ totalDurationSeconds,
296
+ });
297
+ const onnx = await transcribeWithOnnxCliFile({
298
+ model: onnxPreference,
299
+ filePath,
300
+ mediaType,
301
+ totalDurationSeconds,
302
+ onProgress,
303
+ env,
304
+ });
305
+ if (onnx.text) {
293
306
  if (onnx.notes.length > 0)
294
307
  notes.push(...onnx.notes);
295
- if (onnx.error) {
296
- notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
297
- }
308
+ return { ...onnx, notes };
309
+ }
310
+ if (onnx.notes.length > 0)
311
+ notes.push(...onnx.notes);
312
+ if (onnx.error) {
313
+ notes.push(`${onnx.provider ?? "onnx"} failed; falling back to Whisper: ${onnx.error.message}`);
298
314
  }
299
- // 3. whisper.cpp (local)
300
- const localReady = await isWhisperCppReady();
301
- let local = null;
302
- if (localReady) {
303
- onProgress?.({
304
- partIndex: null,
305
- parts: null,
306
- processedDurationSeconds: null,
315
+ return null;
316
+ }
317
+ async function transcribeWithLocalWhisperBytes({ bytes, mediaType, filename, totalDurationSeconds, onProgress, env, notes, }) {
318
+ const localReady = await isWhisperCppReady(env);
319
+ if (!localReady)
320
+ return null;
321
+ const nameHint = filename?.trim() ? basename(filename.trim()) : "media";
322
+ const tempFile = join(tmpdir(), `summarize-whisper-local-${randomUUID()}-${ensureWhisperFilenameExtension(nameHint, mediaType)}`);
323
+ try {
324
+ await fs.writeFile(tempFile, bytes);
325
+ const result = await safeTranscribeWithWhisperCppFile({
326
+ filePath: tempFile,
327
+ mediaType,
307
328
  totalDurationSeconds,
329
+ onProgress,
330
+ env,
308
331
  });
309
- try {
310
- local = await transcribeWithWhisperCppFile({
311
- filePath,
312
- mediaType,
313
- totalDurationSeconds,
314
- onProgress,
315
- });
316
- }
317
- catch (error) {
318
- local = {
319
- text: null,
320
- provider: "whisper.cpp",
321
- error: wrapError("whisper.cpp failed", error),
322
- notes: [],
323
- };
332
+ if (result.text) {
333
+ if (result.notes.length > 0)
334
+ notes.push(...result.notes);
335
+ return { ...result, notes };
324
336
  }
325
- if (local.text) {
326
- if (local.notes.length > 0)
327
- notes.push(...local.notes);
328
- return { ...local, notes };
337
+ if (result.notes.length > 0)
338
+ notes.push(...result.notes);
339
+ if (result.error) {
340
+ notes.push(`whisper.cpp failed; falling back to remote Whisper: ${result.error.message}`);
329
341
  }
330
- if (local.notes.length > 0)
331
- notes.push(...local.notes);
332
- if (local.error) {
333
- notes.push(`whisper.cpp failed; falling back to remote Whisper: ${local.error.message}`);
334
- }
335
- }
336
- // 4. OpenAI / FAL (cloud fallbacks)
337
- if (!openaiApiKey && !falApiKey) {
338
- if (groqError) {
339
- return {
340
- text: null,
341
- provider: "groq",
342
- error: groqError,
343
- notes,
344
- };
345
- }
346
- return {
347
- text: null,
348
- provider: null,
349
- error: new Error("No transcription providers available (install whisper-cpp or set GROQ_API_KEY, OPENAI_API_KEY, or FAL_KEY)"),
350
- notes,
351
- };
342
+ return null;
352
343
  }
353
- if (openaiApiKey && stat.size > MAX_OPENAI_UPLOAD_BYTES) {
354
- const canChunk = await isFfmpegAvailable();
355
- if (!canChunk) {
356
- notes.push(`Media too large for Whisper upload (${formatBytes(stat.size)}); install ffmpeg to enable chunked transcription`);
357
- const head = await readFirstBytes(filePath, MAX_OPENAI_UPLOAD_BYTES);
358
- const partial = await transcribeMediaWithWhisper({
359
- bytes: head,
360
- mediaType,
361
- filename,
362
- groqApiKey,
363
- skipGroq: skipGroqInNestedCalls,
364
- openaiApiKey,
365
- falApiKey,
366
- env,
367
- });
368
- if (partial.notes.length > 0)
369
- notes.push(...partial.notes);
370
- return { ...partial, notes };
371
- }
372
- const dir = await fs.mkdtemp(join(tmpdir(), "summarize-whisper-segments-"));
373
- try {
374
- const pattern = join(dir, "part-%03d.mp3");
375
- await runFfmpegSegment({
376
- inputPath: filePath,
377
- outputPattern: pattern,
378
- segmentSeconds,
379
- });
380
- const files = (await fs.readdir(dir))
381
- .filter((name) => name.startsWith("part-") && name.endsWith(".mp3"))
382
- .sort((a, b) => a.localeCompare(b));
383
- if (files.length === 0) {
384
- return {
385
- text: null,
386
- provider: null,
387
- error: new Error("ffmpeg produced no audio segments"),
388
- notes,
389
- };
390
- }
391
- notes.push(`ffmpeg chunked media into ${files.length} parts (${segmentSeconds}s each)`);
392
- onProgress?.({
393
- partIndex: null,
394
- parts: files.length,
395
- processedDurationSeconds: null,
396
- totalDurationSeconds,
397
- });
398
- const parts = [];
399
- let usedProvider = null;
400
- for (const [index, name] of files.entries()) {
401
- const segmentPath = join(dir, name);
402
- const segmentBytes = new Uint8Array(await fs.readFile(segmentPath));
403
- const result = await transcribeMediaWithWhisper({
404
- bytes: segmentBytes,
405
- mediaType: "audio/mpeg",
406
- filename: name,
407
- groqApiKey,
408
- skipGroq: skipGroqInNestedCalls,
409
- openaiApiKey,
410
- falApiKey,
411
- onProgress: null,
412
- env,
413
- });
414
- if (!usedProvider && result.provider)
415
- usedProvider = result.provider;
416
- if (result.error && !result.text) {
417
- return { text: null, provider: usedProvider, error: result.error, notes };
418
- }
419
- if (result.text)
420
- parts.push(result.text);
421
- // Coarse but useful: update based on part boundaries. Duration is best-effort (RSS hints or
422
- // ffprobe); the per-part time is stable enough to make the spinner feel alive.
423
- const processedSeconds = Math.max(0, (index + 1) * segmentSeconds);
424
- onProgress?.({
425
- partIndex: index + 1,
426
- parts: files.length,
427
- processedDurationSeconds: typeof totalDurationSeconds === "number" && totalDurationSeconds > 0
428
- ? Math.min(processedSeconds, totalDurationSeconds)
429
- : null,
430
- totalDurationSeconds,
431
- });
432
- }
433
- return { text: parts.join("\n\n"), provider: usedProvider, error: null, notes };
434
- }
435
- finally {
436
- await fs.rm(dir, { recursive: true, force: true }).catch(() => { });
437
- }
344
+ finally {
345
+ await fs.unlink(tempFile).catch(() => { });
438
346
  }
439
- const bytes = new Uint8Array(await fs.readFile(filePath));
347
+ }
348
+ async function transcribeWithLocalWhisperFile({ filePath, mediaType, totalDurationSeconds, onProgress, env, notes, }) {
349
+ const localReady = await isWhisperCppReady(env);
350
+ if (!localReady)
351
+ return null;
440
352
  onProgress?.({
441
353
  partIndex: null,
442
354
  parts: null,
443
355
  processedDurationSeconds: null,
444
356
  totalDurationSeconds,
445
357
  });
446
- const result = await transcribeMediaWithWhisper({
447
- bytes,
358
+ const result = await safeTranscribeWithWhisperCppFile({
359
+ filePath,
448
360
  mediaType,
449
- filename,
450
- groqApiKey,
451
- skipGroq: skipGroqInNestedCalls,
452
- openaiApiKey,
453
- falApiKey,
361
+ totalDurationSeconds,
362
+ onProgress,
454
363
  env,
455
364
  });
365
+ if (result.text) {
366
+ if (result.notes.length > 0)
367
+ notes.push(...result.notes);
368
+ return { ...result, notes };
369
+ }
456
370
  if (result.notes.length > 0)
457
371
  notes.push(...result.notes);
458
- return { ...result, notes };
372
+ if (result.error) {
373
+ notes.push(`whisper.cpp failed; falling back to remote Whisper: ${result.error.message}`);
374
+ }
375
+ return null;
376
+ }
377
+ async function safeTranscribeWithWhisperCppFile(args) {
378
+ try {
379
+ return await transcribeWithWhisperCppFile(args);
380
+ }
381
+ catch (error) {
382
+ return {
383
+ text: null,
384
+ provider: "whisper.cpp",
385
+ error: wrapError("whisper.cpp failed", error),
386
+ notes: [],
387
+ };
388
+ }
459
389
  }
460
390
  //# sourceMappingURL=core.js.map