@aj-archipelago/cortex 1.3.30 → 1.3.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pathways/system/entity/sys_entity_start.js +11 -5
- package/pathways/system/entity/sys_generator_ack.js +1 -1
- package/pathways/system/entity/sys_router_tool.js +1 -1
- package/pathways/transcribe_gemini.js +88 -37
- package/server/graphql.js +1 -0
- package/server/pathwayResolver.js +6 -3
- package/server/plugins/azureVideoTranslatePlugin.js +8 -8
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@aj-archipelago/cortex",
-  "version": "1.3.30",
+  "version": "1.3.31",
   "description": "Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.",
   "private": false,
   "repository": {

package/pathways/system/entity/sys_entity_start.js
CHANGED
@@ -87,15 +87,21 @@ export default {
         args.model = pathwayResolver.modelName;
     }
 
-    //
+    // Save a copy of the chat history before the memory context is added
     const chatHistoryBeforeMemory = [...args.chatHistory];
 
-
-    if (
-    const
-
+    // Add the memory context to the chat history if applicable
+    if (args.chatHistory.length > 1) {
+        const memoryContext = await callPathway('sys_read_memory', { ...args, section: 'memoryContext', priority: 0, recentHours: 0, stream: false }, pathwayResolver);
+        if (memoryContext) {
+            const lastMessage = args.chatHistory.length > 0 ? args.chatHistory.pop() : null;
+            const { toolCallId } = addToolCalls(args.chatHistory, "search memory for relevant information", "memory_lookup");
+            addToolResults(args.chatHistory, memoryContext, toolCallId);
+            args.chatHistory.push(lastMessage);
+        }
     }
 
+    // If we're using voice, get a quick response to say
     let ackResponse = null;
    if (args.voiceResponse) {
        ackResponse = await callPathway('sys_generator_ack', { ...args, stream: false });
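
The new memory block rewrites the tail of the chat history: it pops the latest user message, inserts a synthetic memory_lookup tool-call exchange carrying the memory context, then pushes the user message back. A minimal sketch of that pattern, assuming (the diff does not show their implementations) that addToolCalls returns a generated toolCallId and that both helpers append OpenAI-style messages:

// Sketch of the history rewrite in sys_entity_start.js; the helper
// bodies below are assumptions for illustration, not the package's code.
function addToolCalls(history, query, toolName) {
    const toolCallId = `call_${Date.now()}`; // hypothetical ID scheme
    history.push({
        role: "assistant",
        content: "",
        tool_calls: [{
            id: toolCallId,
            type: "function",
            function: { name: toolName, arguments: JSON.stringify({ query }) }
        }]
    });
    return { toolCallId };
}

function addToolResults(history, result, toolCallId) {
    history.push({ role: "tool", tool_call_id: toolCallId, content: result });
}

const chatHistory = [
    { role: "user", content: "Hi" },
    { role: "assistant", content: "Hello!" },
    { role: "user", content: "What did I say my name was?" }
];

const memoryContext = "User's name is Sam."; // stand-in for sys_read_memory output
const lastMessage = chatHistory.pop();
const { toolCallId } = addToolCalls(chatHistory, "search memory for relevant information", "memory_lookup");
addToolResults(chatHistory, memoryContext, toolCallId);
chatHistory.push(lastMessage);
// The model now sees the memory lookup as a completed tool exchange
// immediately before the user's latest message.

Framing memory as a completed tool exchange keeps the injected context adjacent to the latest user turn while chatHistoryBeforeMemory, saved just above, preserves the unmodified history.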

package/pathways/system/entity/sys_generator_ack.js
CHANGED
@@ -4,7 +4,7 @@ export default {
     prompt:
         [
             new Prompt({ messages: [
-                {"role": "system", "content": `{{renderTemplate AI_CONVERSATION_HISTORY}}\nYou are a part of an AI system named {{aiName}}. Your job is to acknowledge the user's request and provide a very brief voice filler response that is conversational and natural. The purpose of the response is just to let the user know that you have heard them and are processing a response.\nResponse Guidelines:\n- it should just be a normal 1-2 sentence vocalization (at least 10 words) that will take at most about 3-4 seconds to read and is easy for a text to speech engine to read\n- it should be the beginning of an appropriate response to the last user message in the conversation history\n- it should be an appropriate lead-in for the full response that will follow later\n- it should not directly ask for follow up or be a question\n- it must match the tone and verbal style of the rest of your responses in the conversation history\n- it should not be repetitive - don't always open with the same word, etc.\n- if the user has asked a binary question (yes or no, true or false, etc.) or a filler response is not appropriate, you should
+                {"role": "system", "content": `{{renderTemplate AI_CONVERSATION_HISTORY}}\nYou are a part of an AI system named {{aiName}}. Your job is to acknowledge the user's request and provide a very brief voice filler response that is conversational and natural. The purpose of the response is just to let the user know that you have heard them and are processing a response.\nResponse Guidelines:\n- it should just be a normal 1-2 sentence vocalization (at least 10 words) that will take at most about 3-4 seconds to read and is easy for a text to speech engine to read\n- it should be the beginning of an appropriate response to the last user message in the conversation history\n- it should be an appropriate lead-in for the full response that will follow later\n- it should not directly ask for follow up or be a question\n- it must match the tone and verbal style of the rest of your responses in the conversation history\n- it should not be repetitive - don't always open with the same word, etc.\n- if the user has asked a binary question (yes or no, true or false, etc.) or a filler response is not appropriate, you should respond with the string \"none\"\n\n{{renderTemplate AI_DATETIME}}`},
                 {"role": "user", "content": "Please generate a quick response to the user's last message in the conversation history that can be read verbatim to the user or \"none\" if a filler response is not appropriate."}
             ]}),
         ],

package/pathways/system/entity/sys_router_tool.js
CHANGED
@@ -57,7 +57,7 @@ If you decide to use a tool, return a JSON object in this format:
 toolMessage Guidelines:
 - The message should be consistent in style and tone with the rest of your responses in the conversation history.
 - The message should be brief and conversational and flow naturally with the conversation history.
-- The message should
+- The message should be something a human would say to the user to stall for time while you're working on the task.
 
 If no tool is required, return:
 {"toolRequired": false, "toolReason": "explanation of why no tool was necessary"}

package/pathways/transcribe_gemini.js
CHANGED
@@ -5,6 +5,39 @@ import { Prompt } from "../server/prompt.js";
 
 const OFFSET_CHUNK = 500; //seconds of each chunk offset, only used if helper does not provide
 
+// Function to properly detect YouTube URLs
+function isYoutubeUrl(url) {
+    try {
+        const urlObj = new URL(url);
+
+        // Check for standard youtube.com domains
+        if (
+            urlObj.hostname === "youtube.com" ||
+            urlObj.hostname === "www.youtube.com"
+        ) {
+            // For standard watch URLs, verify they have a video ID
+            if (urlObj.pathname === "/watch") {
+                return !!urlObj.searchParams.get("v");
+            }
+            // For embed URLs, verify they have a video ID in the path
+            if (urlObj.pathname.startsWith("/embed/")) {
+                return urlObj.pathname.length > 7; // '/embed/' is 7 chars
+            }
+            return false;
+        }
+
+        // Check for shortened youtu.be domain
+        if (urlObj.hostname === "youtu.be") {
+            // Verify there's a video ID in the path
+            return urlObj.pathname.length > 1; // '/' is 1 char
+        }
+
+        return false;
+    } catch (err) {
+        return false;
+    }
+}
+
 export default {
     prompt:
         [
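
For reference, how the new validator treats common inputs (illustrative calls, not part of the diff):

isYoutubeUrl("https://www.youtube.com/watch?v=dQw4w9WgXcQ"); // true: watch URL with a v param
isYoutubeUrl("https://youtube.com/watch");                   // false: watch URL without a video ID
isYoutubeUrl("https://www.youtube.com/embed/dQw4w9WgXcQ");   // true: /embed/ plus an ID
isYoutubeUrl("https://youtu.be/dQw4w9WgXcQ");                // true: ID in the short-link path
isYoutubeUrl("https://youtu.be/");                           // false: bare path, no ID
isYoutubeUrl("not a url");                                   // false: new URL() throws, caught

Note that the hostname checks are exact string comparisons, so variants such as m.youtube.com fall through to false under this logic.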

@@ -71,7 +104,7 @@ export default {
 
         //check if fils is a gcs file or youtube
         const isGcs = file.startsWith('gs://');
-        const isYoutube = file
+        const isYoutube = isYoutubeUrl(file);
 
         let chunks = [{
             url: file,

@@ -87,43 +120,43 @@ export default {
 
         sendProgress(true);
 
-        let respectLimitsPrompt = "
+        let respectLimitsPrompt = "";
         if (maxLineWidth) {
 
             const possiblePlacement = maxLineWidth <= 25
                 ? "vertical" : maxLineWidth <= 35 ? "horizontal" : "";
 
-            respectLimitsPrompt += `
-
-            if(possiblePlacement){
-                respectLimitsPrompt+= `This limit a must as user will be using the output for ${possiblePlacement} display.`
-            }
+            respectLimitsPrompt += ` These subtitles will be shown in a ${possiblePlacement} formatted video player. Each subtitle line should not exceed ${maxLineWidth} characters to fit the player.`;
         }
 
-        const transcriptionLevel = wordTimestamped ? "word" : "phrase";
-
         function getMessages(file, format) {
 
-            const responseFormat = format!== 'text' ? 'VTT' : 'text';
+            const responseFormat = format !== 'text' ? 'VTT' : 'text';
+
+            // Base system content that's always included
+            let systemContent = `Instructions:
+You are a transcription assistant. Your job is to transcribe the audio/video content accurately.
 
-
-            {"role": "system", "content": `Instructions:\nYou are an AI entity with expertise of transcription. Your response only contains the transcription, no comments or additonal stuff.
-
-            Your output must be in the format asked, and must be strictly following the formats and parseble by auto parsers.
+IMPORTANT: Only provide the transcription in your response - no explanations, comments, or additional text.
 
-
+Format your response in ${responseFormat} format.`;
 
-
-
-
+            // Only include timestamp instructions if we're not using plain text format
+            if (responseFormat !== 'text') {
+                systemContent += `
 
-
-
-
+CRITICAL TIMESTAMP INSTRUCTIONS:
+- Timestamps MUST match the actual timing in the media
+- For each new segment, look at the media time directly
+- Start times should precisely match when spoken words begin
+- Consecutive segments should have matching end/start times (no gaps or overlaps)`;
+            }
 
-
+            systemContent += `
 
-
+Examples:
+
+SRT format:
 1
 00:00:00,498 --> 00:00:02,827
 Hello World!

@@ -132,21 +165,24 @@ Hello World!
 00:00:02,827 --> 00:00:06,383
 Being AI is fun!
 
-
+VTT format:
 WEBVTT
 
 1
 00:00:00.000 --> 00:00:02.944
-Hello
+Hello World!
 
 2
-00:00:
-Being AI is
+00:00:02.944 --> 00:00:08.809
+Being AI is great!
 
-
-Hello World
+Text format:
+Hello World! Being AI is great!`;
 
-
+            if (wordTimestamped) {
+                systemContent += `
+
+For word-level transcription, timestamp each word:
 
 WEBVTT
 
@@ -155,17 +191,32 @@ WEBVTT
 Hello
 
 2
-00:00:01.
+00:00:01.944 --> 00:00:02.383
 World!
+`;
+            }
 
+            // Only include anti-drift procedure and timestamp reminders for non-text formats
+            if (responseFormat !== 'text') {
+                systemContent += `
+
+ANTI-DRIFT PROCEDURE:
+1. For EVERY new segment, check the actual media time directly
+2. After every 5 segments, verify your timestamps against the video/audio
+3. Never calculate timestamps based on previous segments
+4. Always match the end time of one segment with the start time of the next
+
+REMEMBER:
+- Transcription accuracy is your primary goal
+- Timestamp accuracy is equally important
+- Timestamp drift is the most common error - actively prevent it
+- When in doubt, check the media time directly`;
+            }
 
-
-
-Even a single newline or space can cause the response to be rejected. You must follow the format strictly. You must place newlines and timestamps exactly as shown in the examples.
-
-`},
+            const messages = [
+                {"role": "system", "content": systemContent},
                 {"role": "user", "content": [
-                    `{ type: 'text', text: 'Transcribe 
+                    `{ type: 'text', text: 'Transcribe this file in ${responseFormat} format.${respectLimitsPrompt}' }`,
                     JSON.stringify({
                         type: 'image_url',
                         url: file,

package/server/graphql.js
CHANGED

package/server/pathwayResolver.js
CHANGED
@@ -94,8 +94,10 @@ class PathwayResolver {
             requestId: this.rootRequestId || this.requestId,
             progress: 1,
             data: '',
-            info: '
+            info: '',
+            error: error.message || error.toString()
         });
+        return;
     }
 
     // If the response is a stream, handle it as streaming response
@@ -165,7 +167,8 @@ class PathwayResolver {
                 requestId: this.requestId,
                 progress: 1,
                 data: '',
-                info: '
+                info: '',
+                error: 'Stream read failed'
             });
         } else {
             return;
@@ -180,7 +183,7 @@ class PathwayResolver {
             requestId: this.rootRequestId || this.requestId,
             progress: Math.min(completedCount, totalCount) / totalCount,
             // Clients expect these to be strings
-            data: JSON.stringify(responseData),
+            data: JSON.stringify(responseData || ''),
             info: this.tool || ''
         });
     }
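
These hunks move failures out of the info field and into a dedicated error field, and mark progress as 1 so clients stop waiting. A sketch of what a subscriber now receives and can do with it (field names come from the diff; the requestId value and handler are illustrative):

// Shape of the progress message published on failure:
const failurePayload = {
    requestId: 'req-123',        // hypothetical ID
    progress: 1,                 // marked complete so clients stop polling
    data: '',
    info: '',
    error: 'Stream read failed'  // new field: surfaces the failure reason
};

// A subscriber can now distinguish a failure from an empty successful result:
function onProgress(msg) {
    if (msg.error) {
        console.error(`Request ${msg.requestId} failed: ${msg.error}`);
        return;
    }
    if (msg.progress === 1) {
        console.log('Done:', msg.data);
    }
}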

package/server/plugins/azureVideoTranslatePlugin.js
CHANGED
@@ -121,8 +121,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         const operationUrl = response.headers['operation-location'];
         return { translation: response.data, operationUrl };
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to create translation: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to create translation: ${errorText}`);
     }
 }
 
@@ -151,8 +151,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         });
         return response.data;
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to get iteration status: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to get iteration status: ${errorText}`);
     }
 }
 
@@ -165,8 +165,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         });
         return response.data;
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to poll operation: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to poll operation: ${errorText}`);
     }
 }
 
@@ -360,8 +360,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         const output = await this.getTranslationOutput(translationId, iteration.id);
         return JSON.stringify(output);
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to create iteration: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to create iteration: ${errorText}`);
     }
 } catch (error) {
     logger.error(`Error in video translation: ${error.message}`);
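
All four catch blocks now drill into the nested error envelope instead of passing the whole response body into the message. A sketch of the difference, assuming an axios-style error whose body shape is inferred from the optional chaining in the diff:

// Illustrative axios-style error from a failed Azure Video Translation call;
// the response body shape is an assumption based on the diff, not Azure docs.
const error = {
    message: 'Request failed with status code 400',
    response: {
        data: {
            error: {
                code: 'InvalidRequest',
                innererror: { code: 'InvalidVideoUrl', message: 'The video URL is not accessible.' }
            }
        }
    }
};

// Before: errorText could be an object, which stringifies as "[object Object]".
const before = error.response?.data || error.message;
// After: the human-readable inner message, falling back to the transport error.
const after = error.response?.data?.error?.innererror?.message || error.message;
console.log(`Failed to create translation: ${after}`);
// -> Failed to create translation: The video URL is not accessible.

The optional chaining means any missing level of the envelope falls back cleanly to error.message rather than throwing.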