@aj-archipelago/cortex 1.3.21 → 1.3.23
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +64 -0
- package/config.js +26 -1
- package/helper-apps/cortex-realtime-voice-server/src/cortex/memory.ts +2 -2
- package/helper-apps/cortex-realtime-voice-server/src/realtime/client.ts +9 -4
- package/helper-apps/cortex-realtime-voice-server/src/realtime/realtimeTypes.ts +1 -0
- package/lib/util.js +5 -25
- package/package.json +5 -2
- package/pathways/system/entity/memory/shared/sys_memory_helpers.js +228 -0
- package/pathways/system/entity/memory/sys_memory_format.js +30 -0
- package/pathways/system/entity/memory/sys_memory_manager.js +85 -27
- package/pathways/system/entity/memory/sys_memory_process.js +154 -0
- package/pathways/system/entity/memory/sys_memory_required.js +4 -2
- package/pathways/system/entity/memory/sys_memory_topic.js +22 -0
- package/pathways/system/entity/memory/sys_memory_update.js +50 -150
- package/pathways/system/entity/memory/sys_read_memory.js +67 -69
- package/pathways/system/entity/memory/sys_save_memory.js +1 -1
- package/pathways/system/entity/memory/sys_search_memory.js +1 -1
- package/pathways/system/entity/sys_entity_start.js +9 -6
- package/pathways/system/entity/sys_generator_image.js +5 -41
- package/pathways/system/entity/sys_generator_memory.js +3 -1
- package/pathways/system/entity/sys_generator_reasoning.js +1 -1
- package/pathways/system/entity/sys_router_tool.js +3 -4
- package/pathways/system/rest_streaming/sys_claude_35_sonnet.js +1 -1
- package/pathways/system/rest_streaming/sys_claude_3_haiku.js +1 -1
- package/pathways/system/rest_streaming/sys_google_gemini_chat.js +1 -1
- package/pathways/system/rest_streaming/sys_ollama_chat.js +21 -0
- package/pathways/system/rest_streaming/sys_ollama_completion.js +14 -0
- package/pathways/system/rest_streaming/sys_openai_chat_o1.js +1 -1
- package/pathways/system/rest_streaming/sys_openai_chat_o3_mini.js +1 -1
- package/pathways/transcribe_gemini.js +525 -0
- package/server/modelExecutor.js +8 -0
- package/server/pathwayResolver.js +13 -8
- package/server/plugins/claude3VertexPlugin.js +150 -18
- package/server/plugins/gemini15ChatPlugin.js +90 -1
- package/server/plugins/gemini15VisionPlugin.js +16 -3
- package/server/plugins/modelPlugin.js +12 -9
- package/server/plugins/ollamaChatPlugin.js +158 -0
- package/server/plugins/ollamaCompletionPlugin.js +147 -0
- package/server/rest.js +70 -8
- package/tests/claude3VertexToolConversion.test.js +411 -0
- package/tests/memoryfunction.test.js +560 -46
- package/tests/multimodal_conversion.test.js +169 -0
- package/tests/openai_api.test.js +332 -0
- package/tests/transcribe_gemini.test.js +217 -0
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
import logger from "../lib/logger.js";
|
|
2
|
+
import { publishRequestProgress } from "../lib/redisSubscription.js";
|
|
3
|
+
import { alignSubtitles, getMediaChunks } from "../lib/util.js";
|
|
4
|
+
import { Prompt } from "../server/prompt.js";
|
|
5
|
+
|
|
6
|
+
const OFFSET_CHUNK = 500; // seconds between successive chunk start times; fallback used only when the media helper does not supply a per-chunk offset
|
|
7
|
+
|
|
8
|
+
// Timestamp patterns shared by both conversion paths: full HH:MM:SS, short
// MM:SS, and ultra-short SS forms, accepting either comma (SRT) or dot (VTT)
// as the millisecond separator.
const FULL_TIME_REGEX = /^(\d{2}):(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2})[,.](\d{3})$/;
const SHORT_TIME_REGEX = /^(\d{2}):(\d{2})[,.](\d{3})\s*-->\s*(\d{2}):(\d{2})[,.](\d{3})$/;
const ULTRA_SHORT_TIME_REGEX = /^(\d{1,2})[.](\d{3})\s*-->\s*(\d{1,2})[.](\d{3})$/;

/**
 * Normalize a single cue-timing line to full VTT form
 * ("HH:MM:SS.mmm --> HH:MM:SS.mmm"), padding missing hour/minute fields
 * with zeros. Returns null when the line is not a recognizable timestamp.
 */
function normalizeTimestampLine(line) {
    const fullMatch = line.match(FULL_TIME_REGEX);
    if (fullMatch) {
        // Already full form; just convert SRT commas to VTT dots.
        return line.replace(/,/g, '.');
    }
    const shortMatch = line.match(SHORT_TIME_REGEX);
    if (shortMatch) {
        // MM:SS -> HH:MM:SS
        return `00:${shortMatch[1]}:${shortMatch[2]}.${shortMatch[3]} --> 00:${shortMatch[4]}:${shortMatch[5]}.${shortMatch[6]}`;
    }
    const ultraShortMatch = line.match(ULTRA_SHORT_TIME_REGEX);
    if (ultraShortMatch) {
        // SS -> HH:MM:SS
        return `00:00:${ultraShortMatch[1].padStart(2, '0')}.${ultraShortMatch[2]} --> 00:00:${ultraShortMatch[3].padStart(2, '0')}.${ultraShortMatch[4]}`;
    }
    return null;
}

/**
 * Convert SRT (or loosely-formatted VTT) subtitle text into well-formed
 * WEBVTT output: a "WEBVTT" header, blank-line-separated cues, dot-separated
 * milliseconds, and timestamps zero-padded to full HH:MM:SS.mmm form.
 *
 * @param {string} data - raw subtitle text (SRT or VTT, any of the
 *   timestamp variants above); empty/blank input yields a bare header.
 * @returns {string} normalized WEBVTT text ending with a newline.
 */
export function convertSrtToVtt(data) {
    if (!data || !data.trim()) {
        return "WEBVTT\n\n";
    }

    // If it's already VTT format and has a header, re-emit it cue by cue,
    // normalizing timestamps and inter-cue spacing.
    if (data.trim().startsWith("WEBVTT")) {
        const lines = data.split("\n");
        const result = ["WEBVTT", ""]; // Start with header and blank line
        let currentCue = [];

        for (let i = 0; i < lines.length; i++) {
            const line = lines[i].trim();

            // Skip empty lines and the WEBVTT header
            if (!line || line === "WEBVTT") {
                continue;
            }

            // A number by itself is a cue identifier; flush the previous cue.
            if (/^\d+$/.test(line)) {
                if (currentCue.length > 0) {
                    result.push(currentCue.join("\n"));
                    result.push(""); // Add blank line between cues
                    currentCue = [];
                }
                currentCue.push(line);
                continue;
            }

            const convertedTime = normalizeTimestampLine(line);
            if (convertedTime !== null) {
                currentCue.push(convertedTime);
            } else if (!line.includes('-->')) {
                // Not a timestamp and not arrow-like garbage: subtitle text.
                currentCue.push(line);
            }
            // Lines containing '-->' that fail to parse are dropped.
        }

        // Add the last cue if there is one
        if (currentCue.length > 0) {
            result.push(currentCue.join("\n"));
            result.push(""); // Add final blank line
        }

        return result.join("\n") + "\n";
    }

    // SRT path: remove DOS newlines, trim, and split into cues.
    const srt = data.replace(/\r+/g, "").replace(/^\s+|\s+$/g, "");
    const cuelist = srt.split("\n\n").filter(cue => cue.trim());

    // Always start with the WEBVTT header.
    let result = "WEBVTT\n\n";

    for (const cue of cuelist) {
        const lines = cue.split("\n").map(line => line.trim()).filter(line => line);
        if (lines.length < 2) continue; // need at least a timestamp and text

        const output = [];

        // Optional numeric cue identifier.
        if (/^\d+$/.test(lines[0])) {
            output.push(lines[0]);
            lines.shift();
        }

        // The next line must be a recognizable timestamp, else skip the cue.
        const convertedTime = normalizeTimestampLine(lines[0]);
        if (convertedTime === null) {
            continue; // Invalid timestamp format
        }
        output.push(convertedTime);

        // Remaining lines are subtitle text.
        output.push(...lines.slice(1));

        result += output.join("\n") + "\n\n";
    }

    return result;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Convert a single SRT caption block into one VTT cue string.
 *
 * Accepts an optional purely-numeric identifier line, a timestamp line
 * (full "HH:MM:SS[,.]mmm --> ..." or short "MM:SS.mmm --> ..." form), and
 * trailing text. Returns "" for blank, malformed, or comment-like input.
 * Multi-line text beyond the third line is folded into the text portion.
 */
function convertSrtCue(caption) {
    if (!caption || !caption.trim()) {
        return "";
    }

    const parts = caption.split(/\n/);

    // Collapse any lines past the third into the third line, so the cue is
    // at most [identifier, timestamp, text] where text may span newlines.
    if (parts.length > 3) {
        parts[2] = parts.slice(2).join("\n");
        parts.length = 3;
    }

    let cursor = 0;
    let cue = "";

    // A leading identifier is only honored when the line itself is not a
    // timestamp but the following line is.
    if (
        parts[0] &&
        parts[1] &&
        !parts[0].match(/\d+:\d+:\d+/) &&
        parts[1].match(/\d+:\d+:\d+/)
    ) {
        // Only accept the identifier when the entire line is a number.
        if (/^\d+$/.test(parts[0])) {
            cue += parts[0] + "\n";
            cursor = 1;
        }
    }

    const timeLine = parts[cursor];
    if (!timeLine || !timeLine.match(/\d+:\d+:\d+/)) {
        // File format error or comment line where a timestamp was expected.
        return "";
    }

    // Prefer the full HH:MM:SS form; normalize the separator to a dot.
    const full = timeLine.match(
        /(\d{2}):(\d{2}):(\d{2})[,.](\d{3})\s*--?>\s*(\d{2}):(\d{2}):(\d{2})[,.](\d{3})/,
    );
    if (full) {
        cue += `${full[1]}:${full[2]}:${full[3]}.${full[4]} --> ${full[5]}:${full[6]}:${full[7]}.${full[8]}\n`;
    } else {
        // Fall back to the short MM:SS.mmm form, padding the hour field.
        const short = timeLine.match(
            /(\d{2}):(\d{2})\.(\d{3})\s*--?>\s*(\d{2}):(\d{2})\.(\d{3})/,
        );
        if (!short) {
            // Unrecognized time string.
            return "";
        }
        cue += `00:${short[1]}:${short[2]}.${short[3]} --> 00:${short[4]}:${short[5]}.${short[6]}\n`;
    }
    cursor += 1;

    // Cue text, when present, terminates the cue with a blank line.
    if (parts[cursor]) {
        cue += parts[cursor] + "\n\n";
    }

    return cue;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Heuristically detect the subtitle format of a block of text.
 *
 * Returns "vtt" when the text carries a WEBVTT header or VTT-style
 * (MM:SS.mmm) timestamps, "srt" when it has full HH:MM:SS timestamps
 * together with sequential cue numbering, and null when neither pattern
 * is found in the sampled lines.
 */
export function detectSubtitleFormat(text) {
    // Strip DOS carriage returns and outer whitespace before inspecting.
    const lines = text.replace(/\r+/g, "").trim().split("\n");

    // An explicit WEBVTT header settles the question immediately.
    if (lines[0]?.trim() === "WEBVTT") {
        return "vtt";
    }

    const srtTimeRegex =
        /(\d{2}:\d{2}:\d{2})[,.]\d{3}\s*-->\s*(\d{2}:\d{2}:\d{2})[,.]\d{3}/;
    const vttTimeRegex =
        /(?:\d{2}:)?(\d{1,2})[.]\d{3}\s*-->\s*(?:\d{2}:)?(\d{1,2})[.]\d{3}/;

    let sawSrtTime = false;
    let sawVttTime = false;
    let sawSequence = false;
    let previousNumber = 0;

    // Sample up to the first 12 lines for timestamp and numbering patterns.
    const sampleSize = Math.min(lines.length, 12);
    for (let i = 0; i < sampleSize; i++) {
        const line = lines[i]?.trim();
        if (!line) continue;

        sawSrtTime = sawSrtTime || srtTimeRegex.test(line);
        sawVttTime = sawVttTime || vttTimeRegex.test(line);

        // Track standalone cue numbers that increase by one.
        const digitsOnly = line.match(/^(\d+)$/);
        if (digitsOnly) {
            const value = parseInt(digitsOnly[1], 10);
            if (previousNumber === 0 || value === previousNumber + 1) {
                sawSequence = true;
                previousNumber = value;
            }
        }
    }

    // SRT-style timestamps (HH:MM:SS) plus sequential numbering means SRT.
    if (sawSrtTime && sawSequence) {
        return "srt";
    }

    // VTT-style timestamps (MM:SS) without a header still count as VTT.
    if (sawVttTime) {
        return "vtt";
    }

    return null;
}
|
|
284
|
+
|
|
285
|
+
// Pathway definition: transcribe audio/video media with a Gemini vision model.
// Media is split into chunks (via the helper API when the input is neither a
// gs:// URI nor a YouTube URL), each chunk is transcribed in parallel, and for
// subtitle formats the per-chunk outputs are normalized to VTT and re-aligned
// by their time offsets.
export default {
    prompt:
        [
            new Prompt({ messages: [
                "{{messages}}",
            ]}),
        ],
    model: 'gemini-flash-20-vision',
    inputParameters: {
        file: ``,                // media URL, gs:// URI, or YouTube link to transcribe
        language: ``,            // target language (not referenced inside executePathway below)
        responseFormat: `text`,  // 'text', 'srt', or 'vtt'
        wordTimestamped: false,  // word-level (vs phrase-level) timestamps
        highlightWords: false,   // NOTE(review): not referenced in executePathway — presumably consumed elsewhere; verify
        maxLineWidth: 0,         // max characters per subtitle line; 0 = no limit
        maxLineCount: 0,         // NOTE(review): not referenced in executePathway
        maxWordsPerLine: 0,      // NOTE(review): not referenced in executePathway
        contextId: ``,
    },
    timeout: 3600, // in seconds
    enableDuplicateRequests: false,

    executePathway: async ({args, runAllPrompts, resolver}) => {
        let intervalId;
        const { requestId } = resolver;

        try{
            let totalCount = 11; //init max chunk value
            let completedCount = 0;
            let partialCount = 0;
            let partialRatio = 0;

            // Publish a progress estimate for this request. Completed chunks
            // count fully toward progress; between completions, repeated
            // "partial" ticks add a logarithmically diminishing fraction of
            // one chunk, capped at 0.99 so a chunk never appears done early.
            const sendProgress = (partial=false, resetCount=false) => {
                partialCount = resetCount ? 0 : partialCount;

                if(partial){
                    partialCount++;
                    const increment = 0.02 / Math.log2(partialCount + 1); // logarithmic diminishing increment
                    partialRatio = Math.min(partialRatio + increment, 0.99); // limit to 0.99
                }else{
                    // A real completion resets the partial estimate.
                    partialCount = 0;
                    partialRatio = 0;
                    completedCount++;
                }
                // Never publish 100% from here; the caller signals completion.
                if(completedCount >= totalCount) return;

                const progress = (completedCount + partialRatio) / totalCount;
                logger.info(`Progress for ${requestId}: ${progress}`);

                publishRequestProgress({
                    requestId,
                    progress,
                    data: null,
                });
            }
            sendProgress(true);
            // Heartbeat so the client sees movement during long model calls.
            intervalId = setInterval(() => sendProgress(true), 3000);

            const { file, responseFormat, wordTimestamped, maxLineWidth } = args;
            if(!file) {
                throw new Error("Please provide a file to transcribe.");
            }


            //check if fils is a gcs file or youtube
            const isGcs = file.startsWith('gs://');
            const isYoutube = file.match(/^(http(s)?:\/\/)?((w){3}.)?youtu(be|.be)?(\.com)?\/.+/);

            // Default: treat the file as a single chunk starting at t=0.
            let chunks = [{
                url: file,
                gcs: file,
                offset: 0,
            }];
            if(!isGcs && !isYoutube) {
                //get chunks from helper api if not gcs or youtube
                // NOTE(review): helper chunks are later read as chunk.gcs ||
                // chunk.uri, while the default above uses `url` — confirm the
                // helper's chunk shape provides `gcs` or `uri`.
                chunks = await getMediaChunks(file, requestId);
            }
            // +1 leaves headroom so progress stays below 1.0 until the end.
            totalCount = chunks.length+1;
            logger.info(`Processing chunks: ${JSON.stringify(chunks)}`);

            sendProgress(true);

            // Optional constraint appended to the user prompt when the caller
            // limits subtitle line width.
            let respectLimitsPrompt = " ";
            if (maxLineWidth) {

                const possiblePlacement = maxLineWidth <= 25
                    ? "vertical" : maxLineWidth <= 35 ? "horizontal" : "";

                respectLimitsPrompt += `The output lines must not exceed ${maxLineWidth} characters, so make sure your transcription lines and timestamps are perfectly aligned. `;

                if(possiblePlacement){
                    respectLimitsPrompt+= `This limit a must as user will be using the output for ${possiblePlacement} display.`
                }
            }

            const transcriptionLevel = wordTimestamped ? "word" : "phrase";

            // Build the system + user message pair for one media chunk.
            // NOTE(review): the `responseFormat` local shadows the outer one —
            // any non-'text' format is requested from the model as SRT and
            // converted/aligned downstream.
            function getMessages(file, format) {

                const responseFormat = format!== 'text' ? 'SRT' : 'text';

                const messages = [
                    {"role": "system", "content": `Instructions:\nYou are an AI entity with expertise of transcription. Your response only contains the transcription, no comments or additonal stuff.

Your output must be in the format asked, and must be strictly following the formats and parseble by auto parsers.

Word-level transcriptions must be per word timestamped, and phrase-level transcriptions are per phrase.

Each transcription timestamp must precisely match the corresponding audio/video segment.
Each timestamp must correspond to actual spoken content.
End time cannot exceed total media duration. Especially when transcribing word-level double check your timestamps, never exceed the total duration.

You must follow 1, 2, 3, ... numbering for each transcription segment without any missing numbers.
Never put newlines or spaces in the middle of a timestamp.
Never put multiple lines for a single timestamp.

Example responses:

- If asked SRT format, e.g.:
1
00:00:00,498 --> 00:00:02,827
Hello World!

2
00:00:02,827 --> 00:00:06,383
Being AI is fun!

- If asked VTT format, e.g.:
WEBVTT

1
00:00:00.000 --> 00:00:02.944
Hello World2!

2
00:00:05.344 --> 00:00:08.809
Being AI is also great!

- If asked text format, e.g.:
Hello World!!! Being AI is being great yet again!

Word-level output e.g.:

WEBVTT

1
00:00:00.000 --> 00:00:01.944
Hello

2
00:00:01.964 --> 00:00:02.383
World!


You must follow spacing, punctuation, and timestamps as shown in the examples otherwise your response will not be accepted.
Never output multiple lines for a single timestamp.
Even a single newline or space can cause the response to be rejected. You must follow the format strictly. You must place newlines and timestamps exactly as shown in the examples.

`},
                    // NOTE(review): the text part is a plain template string
                    // shaped like an object literal while the media part is
                    // JSON.stringify'd — confirm the model plugin parses both
                    // conventions before changing either.
                    {"role": "user", "content": [
                        `{ type: 'text', text: 'Transcribe the media ${transcriptionLevel}-level in ${responseFormat} format.${respectLimitsPrompt}' }`,
                        JSON.stringify({
                            type: 'image_url',
                            url: file,
                            gcs: file
                        })
                    ]},
                ]

                return messages;
            }


            // Run all chunks through the model in parallel, publishing a
            // completed-progress tick as each one settles, and return the
            // results in original chunk order.
            const processChunksParallel = async (chunks, args) => {
                try {
                    const chunkPromises = chunks.map(async (chunk, index) => ({
                        index,
                        result: await runAllPrompts({
                            ...args,
                            messages: getMessages(chunk.gcs || chunk.uri, responseFormat),
                            requestId: `${requestId}-${index}`
                        })
                    }));

                    const results = await Promise.all(
                        chunkPromises.map(promise =>
                            promise.then(result => {
                                sendProgress();
                                return result;
                            })
                        ));

                    return results
                        .sort((a, b) => a.index - b.index)
                        .map(item => item.result);
                } catch (error) {
                    logger.error('Error processing chunks:', error);
                    throw error;
                }
            };

            // serial processing of chunks
            // const result = [];
            // for(const chunk of chunks) {
            //     const chunkResult = await runAllPrompts({ ...args, messages: getMessages(chunk.gcs || chunk.uri) });
            //     result.push(chunkResult);
            // }

            const result = await processChunksParallel(chunks, args);

            // publishRequestProgress({
            //     requestId: this.rootRequestId || this.requestId,
            //     progress: 1,
            //     data: "a",
            // });

            if (['srt','vtt'].includes(responseFormat) || wordTimestamped) { // align subtitles for formats



                // convert as gemini output is unstable
                for(let i = 0; i < result.length; i++) {
                    try{
                        result[i] = convertSrtToVtt(result[i]);
                    }catch(error){
                        // Best-effort: keep the raw chunk text if conversion fails.
                        logger.error(`Error converting to vtt: ${error}`);
                    }
                }

                // NOTE(review): `chunk?.offset || ...` treats an explicit
                // offset of 0 as missing and falls back to index*OFFSET_CHUNK
                // (harmless for index 0, but a later chunk with offset 0
                // would be shifted) — confirm intended.
                const offsets = chunks.map((chunk, index) => chunk?.offset || index * OFFSET_CHUNK);
                return alignSubtitles(result, responseFormat, offsets);
            }
            // Plain-text output: join chunk transcripts with a space.
            return result.join(` `);
        }catch(error){
            logger.error(`Error in transcribing: ${error}`);
            throw error;
        }finally{
            // Always stop the progress heartbeat.
            intervalId && clearInterval(intervalId);
        }
    }
};
|
package/server/modelExecutor.js
CHANGED
|
@@ -28,6 +28,8 @@ import NeuralSpacePlugin from './plugins/neuralSpacePlugin.js';
|
|
|
28
28
|
import RunwareAiPlugin from './plugins/runwareAiPlugin.js';
|
|
29
29
|
import ReplicateApiPlugin from './plugins/replicateApiPlugin.js';
|
|
30
30
|
import AzureVideoTranslatePlugin from './plugins/azureVideoTranslatePlugin.js';
|
|
31
|
+
import OllamaChatPlugin from './plugins/ollamaChatPlugin.js';
|
|
32
|
+
import OllamaCompletionPlugin from './plugins/ollamaCompletionPlugin.js';
|
|
31
33
|
|
|
32
34
|
class ModelExecutor {
|
|
33
35
|
constructor(pathway, model) {
|
|
@@ -116,6 +118,12 @@ class ModelExecutor {
|
|
|
116
118
|
case 'AZURE-VIDEO-TRANSLATE':
|
|
117
119
|
plugin = new AzureVideoTranslatePlugin(pathway, model);
|
|
118
120
|
break;
|
|
121
|
+
case 'OLLAMA-CHAT':
|
|
122
|
+
plugin = new OllamaChatPlugin(pathway, model);
|
|
123
|
+
break;
|
|
124
|
+
case 'OLLAMA-COMPLETION':
|
|
125
|
+
plugin = new OllamaCompletionPlugin(pathway, model);
|
|
126
|
+
break;
|
|
119
127
|
default:
|
|
120
128
|
throw new Error(`Unsupported model type: ${model.type}`);
|
|
121
129
|
}
|
|
@@ -89,8 +89,13 @@ class PathwayResolver {
|
|
|
89
89
|
progress: 1,
|
|
90
90
|
data: '[DONE]',
|
|
91
91
|
});
|
|
92
|
+
} else {
|
|
93
|
+
publishRequestProgress({
|
|
94
|
+
requestId: this.rootRequestId || this.requestId,
|
|
95
|
+
progress: 1,
|
|
96
|
+
data: error.message || error.toString(),
|
|
97
|
+
});
|
|
92
98
|
}
|
|
93
|
-
return;
|
|
94
99
|
}
|
|
95
100
|
|
|
96
101
|
// If the response is a string, it's a regular long running response
|
|
@@ -102,7 +107,7 @@ class PathwayResolver {
|
|
|
102
107
|
if (!modelTypesExcludedFromProgressUpdates.includes(this.model.type)) {
|
|
103
108
|
await publishRequestProgress({
|
|
104
109
|
requestId: this.rootRequestId || this.requestId,
|
|
105
|
-
progress: completedCount / totalCount,
|
|
110
|
+
progress: Math.min(completedCount,totalCount) / totalCount,
|
|
106
111
|
data: JSON.stringify(responseData),
|
|
107
112
|
});
|
|
108
113
|
}
|
|
@@ -227,10 +232,10 @@ class PathwayResolver {
|
|
|
227
232
|
// Load saved context and core memory if it exists
|
|
228
233
|
const [savedContext, memorySelf, memoryDirectives, memoryTopics, memoryUser, memoryContext] = await Promise.all([
|
|
229
234
|
(getv && await getv(this.savedContextId)) || {},
|
|
230
|
-
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memorySelf', priority: 1}),
|
|
231
|
-
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memoryDirectives', priority: 1 }),
|
|
235
|
+
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memorySelf', priority: 1, stripMetadata: true }),
|
|
236
|
+
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memoryDirectives', priority: 1, stripMetadata: true }),
|
|
232
237
|
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memoryTopics', priority: 0, numResults: 10 }),
|
|
233
|
-
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memoryUser', priority: 1 }),
|
|
238
|
+
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memoryUser', priority: 1, stripMetadata: true }),
|
|
234
239
|
callPathway('sys_read_memory', { contextId: this.savedContextId, section: 'memoryContext', priority: 0 }),
|
|
235
240
|
]).catch(error => {
|
|
236
241
|
this.logError(`Failed to load memory: ${error.message}`);
|
|
@@ -315,12 +320,12 @@ class PathwayResolver {
|
|
|
315
320
|
processInputText(text) {
|
|
316
321
|
let chunkTokenLength = 0;
|
|
317
322
|
if (this.pathway.inputChunkSize) {
|
|
318
|
-
chunkTokenLength =
|
|
323
|
+
chunkTokenLength = this.pathway.inputChunkSize;
|
|
319
324
|
} else {
|
|
320
325
|
chunkTokenLength = this.chunkMaxTokenLength;
|
|
321
326
|
}
|
|
322
327
|
const encoded = text ? encode(text) : [];
|
|
323
|
-
if (!this.useInputChunking
|
|
328
|
+
if (!this.useInputChunking) { // no chunking, return as is
|
|
324
329
|
if (encoded.length > 0 && encoded.length >= chunkTokenLength) {
|
|
325
330
|
const warnText = `Truncating long input text. Text length: ${text.length}`;
|
|
326
331
|
this.logWarning(warnText);
|
|
@@ -375,7 +380,7 @@ class PathwayResolver {
|
|
|
375
380
|
// Process the request and return the result
|
|
376
381
|
async processRequest({ text, ...parameters }) {
|
|
377
382
|
text = await this.summarizeIfEnabled({ text, ...parameters }); // summarize if flag enabled
|
|
378
|
-
const chunks = this.processInputText(text);
|
|
383
|
+
const chunks = text && this.processInputText(text) || [text];
|
|
379
384
|
|
|
380
385
|
let anticipatedRequestCount = chunks.length * this.prompts.length
|
|
381
386
|
|