@aj-archipelago/cortex 1.3.30 → 1.3.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pathways/system/entity/sys_entity_start.js +11 -5
- package/pathways/system/entity/sys_generator_ack.js +1 -1
- package/pathways/system/entity/sys_router_tool.js +1 -1
- package/pathways/transcribe_gemini.js +88 -37
- package/server/graphql.js +1 -0
- package/server/pathwayResolver.js +6 -3
- package/server/plugins/azureVideoTranslatePlugin.js +8 -8
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@aj-archipelago/cortex",
-  "version": "1.3.30",
+  "version": "1.3.31",
   "description": "Cortex is a GraphQL API for AI. It provides a simple, extensible interface for using AI services from OpenAI, Azure and others.",
   "private": false,
   "repository": {

package/pathways/system/entity/sys_entity_start.js
CHANGED
@@ -87,15 +87,21 @@ export default {
         args.model = pathwayResolver.modelName;
     }
 
-    //
+    // Save a copy of the chat history before the memory context is added
     const chatHistoryBeforeMemory = [...args.chatHistory];
 
-
-    if (
-    const
-
+    // Add the memory context to the chat history if applicable
+    if (args.chatHistory.length > 1) {
+        const memoryContext = await callPathway('sys_read_memory', { ...args, section: 'memoryContext', priority: 0, recentHours: 0, stream: false }, pathwayResolver);
+        if (memoryContext) {
+            const lastMessage = args.chatHistory.length > 0 ? args.chatHistory.pop() : null;
+            const { toolCallId } = addToolCalls(args.chatHistory, "search memory for relevant information", "memory_lookup");
+            addToolResults(args.chatHistory, memoryContext, toolCallId);
+            args.chatHistory.push(lastMessage);
+        }
     }
 
+    // If we're using voice, get a quick response to say
     let ackResponse = null;
    if (args.voiceResponse) {
        ackResponse = await callPathway('sys_generator_ack', { ...args, stream: false });
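
The new memory block rewrites the tail of the chat history: it pops the latest user message, inserts a synthetic memory_lookup tool-call exchange carrying the memory context, then pushes the user message back. A minimal sketch of that pattern, assuming (the diff does not show their implementations) that addToolCalls returns a generated toolCallId and that both helpers append OpenAI-style messages:

// Sketch of the history rewrite in sys_entity_start.js; the helper
// bodies below are assumptions for illustration, not the package's code.
function addToolCalls(history, query, toolName) {
    const toolCallId = `call_${Date.now()}`; // hypothetical ID scheme
    history.push({
        role: "assistant",
        content: "",
        tool_calls: [{
            id: toolCallId,
            type: "function",
            function: { name: toolName, arguments: JSON.stringify({ query }) }
        }]
    });
    return { toolCallId };
}

function addToolResults(history, result, toolCallId) {
    history.push({ role: "tool", tool_call_id: toolCallId, content: result });
}

const chatHistory = [
    { role: "user", content: "Hi" },
    { role: "assistant", content: "Hello!" },
    { role: "user", content: "What did I say my name was?" }
];

const memoryContext = "User's name is Sam."; // stand-in for sys_read_memory output
const lastMessage = chatHistory.pop();
const { toolCallId } = addToolCalls(chatHistory, "search memory for relevant information", "memory_lookup");
addToolResults(chatHistory, memoryContext, toolCallId);
chatHistory.push(lastMessage);
// The model now sees the memory lookup as a completed tool exchange
// immediately before the user's latest message.

Framing memory as a completed tool exchange keeps the injected context adjacent to the latest user turn while chatHistoryBeforeMemory, saved just above, preserves the unmodified history.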

package/pathways/system/entity/sys_generator_ack.js
CHANGED
@@ -4,7 +4,7 @@ export default {
     prompt:
         [
             new Prompt({ messages: [
-                {"role": "system", "content": `{{renderTemplate AI_CONVERSATION_HISTORY}}\nYou are a part of an AI system named {{aiName}}. Your job is to acknowledge the user's request and provide a very brief voice filler response that is conversational and natural. The purpose of the response is just to let the user know that you have heard them and are processing a response.\nResponse Guidelines:\n- it should just be a normal 1-2 sentence vocalization (at least 10 words) that will take at most about 3-4 seconds to read and is easy for a text to speech engine to read\n- it should be the beginning of an appropriate response to the last user message in the conversation history\n- it should be an appropriate lead-in for the full response that will follow later\n- it should not directly ask for follow up or be a question\n- it must match the tone and verbal style of the rest of your responses in the conversation history\n- it should not be repetitive - don't always open with the same word, etc.\n- if the user has asked a binary question (yes or no, true or false, etc.) or a filler response is not appropriate, you should
+                {"role": "system", "content": `{{renderTemplate AI_CONVERSATION_HISTORY}}\nYou are a part of an AI system named {{aiName}}. Your job is to acknowledge the user's request and provide a very brief voice filler response that is conversational and natural. The purpose of the response is just to let the user know that you have heard them and are processing a response.\nResponse Guidelines:\n- it should just be a normal 1-2 sentence vocalization (at least 10 words) that will take at most about 3-4 seconds to read and is easy for a text to speech engine to read\n- it should be the beginning of an appropriate response to the last user message in the conversation history\n- it should be an appropriate lead-in for the full response that will follow later\n- it should not directly ask for follow up or be a question\n- it must match the tone and verbal style of the rest of your responses in the conversation history\n- it should not be repetitive - don't always open with the same word, etc.\n- if the user has asked a binary question (yes or no, true or false, etc.) or a filler response is not appropriate, you should respond with the string \"none\"\n\n{{renderTemplate AI_DATETIME}}`},
                 {"role": "user", "content": "Please generate a quick response to the user's last message in the conversation history that can be read verbatim to the user or \"none\" if a filler response is not appropriate."}
             ]}),
         ],

package/pathways/system/entity/sys_router_tool.js
CHANGED
@@ -57,7 +57,7 @@ If you decide to use a tool, return a JSON object in this format:
 toolMessage Guidelines:
 - The message should be consistent in style and tone with the rest of your responses in the conversation history.
 - The message should be brief and conversational and flow naturally with the conversation history.
-- The message should
+- The message should be something a human would say to the user to stall for time while you're working on the task.
 
 If no tool is required, return:
 {"toolRequired": false, "toolReason": "explanation of why no tool was necessary"}

package/pathways/transcribe_gemini.js
CHANGED
@@ -5,6 +5,39 @@ import { Prompt } from "../server/prompt.js";
 
 const OFFSET_CHUNK = 500; //seconds of each chunk offset, only used if helper does not provide
 
+// Function to properly detect YouTube URLs
+function isYoutubeUrl(url) {
+    try {
+        const urlObj = new URL(url);
+
+        // Check for standard youtube.com domains
+        if (
+            urlObj.hostname === "youtube.com" ||
+            urlObj.hostname === "www.youtube.com"
+        ) {
+            // For standard watch URLs, verify they have a video ID
+            if (urlObj.pathname === "/watch") {
+                return !!urlObj.searchParams.get("v");
+            }
+            // For embed URLs, verify they have a video ID in the path
+            if (urlObj.pathname.startsWith("/embed/")) {
+                return urlObj.pathname.length > 7; // '/embed/' is 7 chars
+            }
+            return false;
+        }
+
+        // Check for shortened youtu.be domain
+        if (urlObj.hostname === "youtu.be") {
+            // Verify there's a video ID in the path
+            return urlObj.pathname.length > 1; // '/' is 1 char
+        }
+
+        return false;
+    } catch (err) {
+        return false;
+    }
+}
+
 export default {
     prompt:
         [
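
For reference, how the new validator treats common inputs (illustrative calls, not part of the diff):

isYoutubeUrl("https://www.youtube.com/watch?v=dQw4w9WgXcQ"); // true: watch URL with a v param
isYoutubeUrl("https://youtube.com/watch");                   // false: watch URL without a video ID
isYoutubeUrl("https://www.youtube.com/embed/dQw4w9WgXcQ");   // true: /embed/ plus an ID
isYoutubeUrl("https://youtu.be/dQw4w9WgXcQ");                // true: ID in the short-link path
isYoutubeUrl("https://youtu.be/");                           // false: bare path, no ID
isYoutubeUrl("not a url");                                   // false: new URL() throws, caught

Note that the hostname checks are exact string comparisons, so variants such as m.youtube.com fall through to false under this logic.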

@@ -71,7 +104,7 @@ export default {
 
         //check if fils is a gcs file or youtube
         const isGcs = file.startsWith('gs://');
-        const isYoutube = file
+        const isYoutube = isYoutubeUrl(file);
 
         let chunks = [{
             url: file,

@@ -87,43 +120,43 @@ export default {
 
         sendProgress(true);
 
-        let respectLimitsPrompt = "
+        let respectLimitsPrompt = "";
         if (maxLineWidth) {
 
             const possiblePlacement = maxLineWidth <= 25
                 ? "vertical" : maxLineWidth <= 35 ? "horizontal" : "";
 
-            respectLimitsPrompt += `
-
-            if(possiblePlacement){
-                respectLimitsPrompt+= `This limit a must as user will be using the output for ${possiblePlacement} display.`
-            }
+            respectLimitsPrompt += ` These subtitles will be shown in a ${possiblePlacement} formatted video player. Each subtitle line should not exceed ${maxLineWidth} characters to fit the player.`;
         }
 
-        const transcriptionLevel = wordTimestamped ? "word" : "phrase";
-
         function getMessages(file, format) {
 
-            const responseFormat = format!== 'text' ? 'VTT' : 'text';
+            const responseFormat = format !== 'text' ? 'VTT' : 'text';
+
+            // Base system content that's always included
+            let systemContent = `Instructions:
+You are a transcription assistant. Your job is to transcribe the audio/video content accurately.
 
-
-            {"role": "system", "content": `Instructions:\nYou are an AI entity with expertise of transcription. Your response only contains the transcription, no comments or additonal stuff.
-
-            Your output must be in the format asked, and must be strictly following the formats and parseble by auto parsers.
+IMPORTANT: Only provide the transcription in your response - no explanations, comments, or additional text.
 
-
+Format your response in ${responseFormat} format.`;
 
-
-
-
+            // Only include timestamp instructions if we're not using plain text format
+            if (responseFormat !== 'text') {
+                systemContent += `
 
-
-
-
+CRITICAL TIMESTAMP INSTRUCTIONS:
+- Timestamps MUST match the actual timing in the media
+- For each new segment, look at the media time directly
+- Start times should precisely match when spoken words begin
+- Consecutive segments should have matching end/start times (no gaps or overlaps)`;
+            }
 
-
+            systemContent += `
 
-
+Examples:
+
+SRT format:
 1
 00:00:00,498 --> 00:00:02,827
 Hello World!

@@ -132,21 +165,24 @@ Hello World!
 00:00:02,827 --> 00:00:06,383
 Being AI is fun!
 
-
+VTT format:
 WEBVTT
 
 1
 00:00:00.000 --> 00:00:02.944
-Hello
+Hello World!
 
 2
-00:00:
-Being AI is
+00:00:02.944 --> 00:00:08.809
+Being AI is great!
 
-
-Hello World
+Text format:
+Hello World! Being AI is great!`;
 
-
+            if (wordTimestamped) {
+                systemContent += `
+
+For word-level transcription, timestamp each word:
 
 WEBVTT
 
@@ -155,17 +191,32 @@ WEBVTT
 Hello
 
 2
-00:00:01.
+00:00:01.944 --> 00:00:02.383
 World!
+`;
+            }
 
+            // Only include anti-drift procedure and timestamp reminders for non-text formats
+            if (responseFormat !== 'text') {
+                systemContent += `
+
+ANTI-DRIFT PROCEDURE:
+1. For EVERY new segment, check the actual media time directly
+2. After every 5 segments, verify your timestamps against the video/audio
+3. Never calculate timestamps based on previous segments
+4. Always match the end time of one segment with the start time of the next
+
+REMEMBER:
+- Transcription accuracy is your primary goal
+- Timestamp accuracy is equally important
+- Timestamp drift is the most common error - actively prevent it
+- When in doubt, check the media time directly`;
+            }
 
-
-
-Even a single newline or space can cause the response to be rejected. You must follow the format strictly. You must place newlines and timestamps exactly as shown in the examples.
-
-`},
+            const messages = [
+                {"role": "system", "content": systemContent},
                 {"role": "user", "content": [
-                    `{ type: 'text', text: 'Transcribe 
+                    `{ type: 'text', text: 'Transcribe this file in ${responseFormat} format.${respectLimitsPrompt}' }`,
                     JSON.stringify({
                         type: 'image_url',
                         url: file,

package/server/graphql.js
CHANGED

package/server/pathwayResolver.js
CHANGED
@@ -94,8 +94,10 @@ class PathwayResolver {
             requestId: this.rootRequestId || this.requestId,
             progress: 1,
             data: '',
-            info: '
+            info: '',
+            error: error.message || error.toString()
         });
+        return;
     }
 
     // If the response is a stream, handle it as streaming response
@@ -165,7 +167,8 @@ class PathwayResolver {
                 requestId: this.requestId,
                 progress: 1,
                 data: '',
-                info: '
+                info: '',
+                error: 'Stream read failed'
             });
         } else {
             return;
@@ -180,7 +183,7 @@ class PathwayResolver {
             requestId: this.rootRequestId || this.requestId,
             progress: Math.min(completedCount, totalCount) / totalCount,
             // Clients expect these to be strings
-            data: JSON.stringify(responseData),
+            data: JSON.stringify(responseData || ''),
             info: this.tool || ''
         });
     }
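
These hunks move failures out of the info field and into a dedicated error field, and mark progress as 1 so clients stop waiting. A sketch of what a subscriber now receives and can do with it (field names come from the diff; the requestId value and handler are illustrative):

// Shape of the progress message published on failure:
const failurePayload = {
    requestId: 'req-123',        // hypothetical ID
    progress: 1,                 // marked complete so clients stop polling
    data: '',
    info: '',
    error: 'Stream read failed'  // new field: surfaces the failure reason
};

// A subscriber can now distinguish a failure from an empty successful result:
function onProgress(msg) {
    if (msg.error) {
        console.error(`Request ${msg.requestId} failed: ${msg.error}`);
        return;
    }
    if (msg.progress === 1) {
        console.log('Done:', msg.data);
    }
}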

package/server/plugins/azureVideoTranslatePlugin.js
CHANGED
@@ -121,8 +121,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         const operationUrl = response.headers['operation-location'];
         return { translation: response.data, operationUrl };
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to create translation: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to create translation: ${errorText}`);
     }
 }
 
@@ -151,8 +151,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         });
         return response.data;
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to get iteration status: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to get iteration status: ${errorText}`);
     }
 }
 
@@ -165,8 +165,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         });
         return response.data;
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to poll operation: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to poll operation: ${errorText}`);
     }
 }
 
@@ -360,8 +360,8 @@ class AzureVideoTranslatePlugin extends ModelPlugin {
         const output = await this.getTranslationOutput(translationId, iteration.id);
         return JSON.stringify(output);
     } catch (error) {
-        const errorText = error.response?.data || error.message;
-        throw new Error(`Failed to create iteration: ${
+        const errorText = error.response?.data?.error?.innererror?.message || error.message;
+        throw new Error(`Failed to create iteration: ${errorText}`);
     }
 } catch (error) {
     logger.error(`Error in video translation: ${error.message}`);
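
All four catch blocks now drill into the nested error envelope instead of passing the whole response body into the message. A sketch of the difference, assuming an axios-style error whose body shape is inferred from the optional chaining in the diff:

// Illustrative axios-style error from a failed Azure Video Translation call;
// the response body shape is an assumption based on the diff, not Azure docs.
const error = {
    message: 'Request failed with status code 400',
    response: {
        data: {
            error: {
                code: 'InvalidRequest',
                innererror: { code: 'InvalidVideoUrl', message: 'The video URL is not accessible.' }
            }
        }
    }
};

// Before: errorText could be an object, which stringifies as "[object Object]".
const before = error.response?.data || error.message;
// After: the human-readable inner message, falling back to the transport error.
const after = error.response?.data?.error?.innererror?.message || error.message;
console.log(`Failed to create translation: ${after}`);
// -> Failed to create translation: The video URL is not accessible.

The optional chaining means any missing level of the envelope falls back cleanly to error.message rather than throwing.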