npm - @aj-archipelago/cortex - Versions diffs - 1.3.22 → 1.3.23 - Mend

@aj-archipelago/cortex 1.3.22 → 1.3.23

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

package/README.md +64 -0
package/config.js +26 -1
package/helper-apps/cortex-realtime-voice-server/src/realtime/client.ts +9 -4
package/helper-apps/cortex-realtime-voice-server/src/realtime/realtimeTypes.ts +1 -0
package/lib/util.js +4 -24
package/package.json +5 -2
package/pathways/system/rest_streaming/sys_ollama_chat.js +21 -0
package/pathways/system/rest_streaming/sys_ollama_completion.js +14 -0
package/pathways/transcribe_gemini.js +181 -53
package/server/modelExecutor.js +8 -0
package/server/pathwayResolver.js +6 -1
package/server/plugins/claude3VertexPlugin.js +41 -15
package/server/plugins/gemini15ChatPlugin.js +90 -1
package/server/plugins/gemini15VisionPlugin.js +9 -3
package/server/plugins/modelPlugin.js +11 -8
package/server/plugins/ollamaChatPlugin.js +158 -0
package/server/plugins/ollamaCompletionPlugin.js +147 -0
package/server/rest.js +46 -5
package/tests/multimodal_conversion.test.js +169 -0
package/tests/transcribe_gemini.test.js +217 -0

package/tests/multimodal_conversion.test.js — CHANGED

@@ -271,12 +271,18 @@ test('Pathological cases', async (t) => {
     t.is(geminiSystem15.parts[0].text, 'You are a helpful assistant.');
     t.is(geminiSystem15.parts[1].text, 'You are also very knowledgeable.');
     t.is(geminiMessages15.length, 3);
+    // First user message combines "Hello" and "Another greeting"
     t.is(geminiMessages15[0].role, 'user');
     t.is(geminiMessages15[0].parts[0].text, 'Hello');
     t.is(geminiMessages15[0].parts[1].text, 'Another greeting');
+    // Assistant message "Hi there!"
     t.is(geminiMessages15[1].role, 'assistant');
     t.is(geminiMessages15[1].parts[0].text, 'Hi there!');
+    // Final user message combines "How are you?", image content, and "Another question"
     t.is(geminiMessages15[2].role, 'user');
     t.is(geminiMessages15[2].parts[0].text, 'How are you?');
     t.is(geminiMessages15[2].parts[1].text, 'What\'s this?');
@@ -310,6 +316,79 @@ test('Empty message array', async (t) => {
     t.is(geminiMessages15.length, 0);
 });
+// Test simple string array content
+test('Simple string array content', async (t) => {
+    const { gemini15 } = createPlugins();
+    const messages = [
+        { role: 'user', content: "Initial message" },
+        { role: 'assistant', content: [
+            "\"Searchin' for my lost shaker of salt...\"\n",
+        ]},
+        { role: 'user', content: [
+            "Here's another simple string in an array",
+        ]}
+    ];
+    const { modifiedMessages } = gemini15.convertMessagesToGemini(messages);
+    t.is(modifiedMessages.length, 3);
+    t.is(modifiedMessages[0].role, 'user');
+    t.is(modifiedMessages[0].parts.length, 1);
+    t.is(modifiedMessages[0].parts[0].text, "Initial message");
+    t.is(modifiedMessages[1].role, 'assistant');
+    t.is(modifiedMessages[1].parts.length, 1);
+    t.is(modifiedMessages[1].parts[0].text, "\"Searchin' for my lost shaker of salt...\"\n");
+    t.is(modifiedMessages[2].role, 'user');
+    t.is(modifiedMessages[2].parts.length, 1);
+    t.is(modifiedMessages[2].parts[0].text, "Here's another simple string in an array");
+});
+// Test string-encoded multimodal content
+test('String-encoded multimodal content', async (t) => {
+    const { gemini15 } = createPlugins();
+    const messages = [
+        { role: 'user', content: [
+            JSON.stringify({
+                type: 'text',
+                text: 'What is in this image?'
+            }),
+            JSON.stringify({
+                type: 'image_url',
+                image_url: { url: 'gs://my-bucket/image.jpg' }
+            })
+        ]},
+        { role: 'assistant', content: [
+            JSON.stringify({
+                type: 'text',
+                text: 'I see a cat.'
+            })
+        ]},
+        { role: 'user', content: [
+            JSON.stringify({
+                type: 'text',
+                text: 'Is it a big cat?'
+            })
+        ]}
+    ];
+    const { modifiedMessages } = gemini15.convertMessagesToGemini(messages);
+    t.is(modifiedMessages.length, 3);
+    t.is(modifiedMessages[0].role, 'user');
+    t.is(modifiedMessages[0].parts.length, 2);
+    t.is(modifiedMessages[0].parts[0].text, 'What is in this image?');
+    t.true('fileData' in modifiedMessages[0].parts[1]);
+    t.is(modifiedMessages[0].parts[1].fileData.fileUri, 'gs://my-bucket/image.jpg');
+    t.is(modifiedMessages[1].role, 'assistant');
+    t.is(modifiedMessages[1].parts.length, 1);
+    t.is(modifiedMessages[1].parts[0].text, 'I see a cat.');
+    t.is(modifiedMessages[2].role, 'user');
+    t.is(modifiedMessages[2].parts.length, 1);
+    t.is(modifiedMessages[2].parts[0].text, 'Is it a big cat?');
+});
 // Test messages with only system messages
 test('Only system messages', async (t) => {
     const { openai, claude, gemini, gemini15 } = createPlugins();
@@ -417,3 +496,93 @@ test('Gemini 1.5 image URL edge cases', t => {
     // Verify we only have one part (the text)
     t.is(modifiedMessages[0].parts.length, 1, 'Should only have the text part');
 });
+// Test multiple images in single message for Claude
+test('Multiple images in single Claude message', async (t) => {
+    const { claude } = createPlugins();
+    const multiImageMessage = [
+        { role: 'user', content: [
+            { type: 'text', text: 'Compare these images:' },
+            { type: 'image_url', image_url: { url: sampleBase64Image } },
+            { type: 'text', text: 'with this one:' },
+            { type: 'image_url', image_url: { url: sampleBase64Image } },
+            { type: 'image_url', gcs: 'gs://cortex-bucket/image.jpg' }
+        ]}
+    ];
+    const { modifiedMessages } = await claude.convertMessagesToClaudeVertex(multiImageMessage);
+    t.is(modifiedMessages.length, 1);
+    t.is(modifiedMessages[0].role, 'user');
+    t.is(modifiedMessages[0].content.length, 4);
+    t.is(modifiedMessages[0].content[0].text, 'Compare these images:');
+    t.true(modifiedMessages[0].content[1].source.type === 'base64');
+    t.is(modifiedMessages[0].content[2].text, 'with this one:');
+    t.true(modifiedMessages[0].content[3].source.type === 'base64');
+});
+// Test conversation history with mixed image types
+test('Conversation history with mixed image types', async (t) => {
+    const { claude, gemini15 } = createPlugins();
+    const conversationHistory = [
+        { role: 'system', content: 'You are a visual analysis assistant.' },
+        { role: 'user', content: [
+            { type: 'text', text: 'What\'s in this image?' },
+            { type: 'image_url', image_url: { url: sampleBase64Image } }
+        ]},
+        { role: 'assistant', content: 'I see a landscape.' },
+        { role: 'user', content: [
+            { type: 'text', text: 'Compare it with this:' },
+            { type: 'image_url', gcs: 'gs://cortex-bucket/image2.jpg' }
+        ]},
+        { role: 'assistant', content: 'The second image shows a different scene.' },
+        { role: 'user', content: 'Which one do you prefer?' }
+    ];
+    // Test Claude conversion
+    const { system: claudeSystem, modifiedMessages: claudeMessages } = await claude.convertMessagesToClaudeVertex(conversationHistory);
+    t.is(claudeSystem, 'You are a visual analysis assistant.');
+    t.is(claudeMessages.length, 5);
+    t.is(claudeMessages[1].content[0].text, 'I see a landscape.');
+    t.is(claudeMessages[3].content[0].text, 'The second image shows a different scene.');
+    t.is(claudeMessages[4].content[0].text, 'Which one do you prefer?');
+    // Test Gemini 1.5 conversion
+    const { system: geminiSystem15, modifiedMessages: geminiMessages15 } = gemini15.convertMessagesToGemini(conversationHistory);
+    t.is(geminiSystem15.parts[0].text, 'You are a visual analysis assistant.');
+    t.is(geminiMessages15.length, 5);
+    t.true('inlineData' in geminiMessages15[0].parts[1]);
+    t.is(geminiMessages15[1].parts[0].text, 'I see a landscape.');
+    t.true('fileData' in geminiMessages15[2].parts[1]);
+    t.is(geminiMessages15[2].parts[1].fileData.fileUri, 'gs://cortex-bucket/image2.jpg');
+    t.is(geminiMessages15[3].parts[0].text, 'The second image shows a different scene.');
+    t.is(geminiMessages15[4].parts[0].text, 'Which one do you prefer?');
+});
+// Test handling of large images
+test('Large image handling', async (t) => {
+    const { claude, gemini15 } = createPlugins();
+    // Create a large base64 string (>10MB)
+    const largeSampleImage = 'data:image/jpeg;base64,' + 'A'.repeat(10 * 1024 * 1024);
+    const largeImageMessage = [
+        { role: 'user', content: [
+            { type: 'text', text: 'Check this large image:' },
+            { type: 'image_url', image_url: { url: largeSampleImage } }
+        ]}
+    ];
+    // Both Claude and Gemini should handle or reject oversized images gracefully
+    const { modifiedMessages: claudeMessages } = await claude.convertMessagesToClaudeVertex(largeImageMessage);
+    const { modifiedMessages: geminiMessages } = gemini15.convertMessagesToGemini(largeImageMessage);
+    // Verify both models handle the oversized image appropriately
+    // (The exact behavior - rejection vs. compression - should match the model's specifications)
+    t.is(claudeMessages[0].content[0].text, 'Check this large image:');
+    t.is(geminiMessages[0].parts[0].text, 'Check this large image:');
+});

package/tests/transcribe_gemini.test.js — ADDED

@@ -0,0 +1,217 @@
+import test from 'ava';
+import { convertSrtToVtt } from '../pathways/transcribe_gemini.js';
+test('should return empty WebVTT for null or empty input', t => {
+    t.is(convertSrtToVtt(null), "WEBVTT\n\n");
+    t.is(convertSrtToVtt(''), "WEBVTT\n\n");
+    t.is(convertSrtToVtt('   '), "WEBVTT\n\n");
+});
+test('should convert basic SRT to WebVTT format', t => {
+    const srtInput =
+`1
+00:00:01,000 --> 00:00:04,000
+Hello world`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:01.000 --> 00:00:04.000
+Hello world
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should convert multiple subtitle entries', t => {
+    const srtInput =
+`1
+00:00:01,000 --> 00:00:04,000
+First subtitle
+2
+00:00:05,000 --> 00:00:08,000
+Second subtitle`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:01.000 --> 00:00:04.000
+First subtitle
+2
+00:00:05.000 --> 00:00:08.000
+Second subtitle
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle DOS line endings', t => {
+    const srtInput = "1\r\n00:00:01,000 --> 00:00:04,000\r\nHello world\r\n";
+    const expectedOutput = "WEBVTT\n\n1\n00:00:01.000 --> 00:00:04.000\nHello world\n\n";
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle multi-line subtitles', t => {
+    const srtInput =
+`1
+00:00:01,000 --> 00:00:04,000
+First line
+Second line
+Third line
+2
+00:00:05,000 --> 00:00:08,000
+Another subtitle`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:01.000 --> 00:00:04.000
+First line
+Second line
+Third line
+2
+00:00:05.000 --> 00:00:08.000
+Another subtitle
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle invalid timestamp formats', t => {
+    const srtInput =
+`1
+invalid timestamp
+Hello world
+2
+00:00:05,000 --> 00:00:08,000
+Valid subtitle`;
+    const expectedOutput =
+`WEBVTT
+2
+00:00:05.000 --> 00:00:08.000
+Valid subtitle
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should convert comma to dot in timestamps', t => {
+    const srtInput =
+`1
+00:00:01,500 --> 00:00:04,750
+Test subtitle`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:01.500 --> 00:00:04.750
+Test subtitle
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle extra whitespace in input', t => {
+    const srtInput = `
+1
+  00:00:01,000 --> 00:00:04,000
+  Hello world
+`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:01.000 --> 00:00:04.000
+Hello world
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle timestamps with only minutes and seconds', t => {
+    const srtInput =
+`1
+01:30,000 --> 02:45,500
+Short timestamp format`;
+    const expectedOutput =
+`WEBVTT
+1
+00:01:30.000 --> 00:02:45.500
+Short timestamp format
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle ultra-short timestamps (SS.mmm)', t => {
+    const srtInput =
+`1
+03.298 --> 04.578
+First line
+2
+04.578 --> 06.178
+Second line`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:03.298 --> 00:00:04.578
+First line
+2
+00:00:04.578 --> 00:00:06.178
+Second line
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});
+test('should handle mixed timestamp formats', t => {
+    const srtInput =
+`1
+03.298 --> 04.578
+First line
+2
+00:04.578 --> 00:06.178
+Second line
+3
+00:00:06.178 --> 00:00:07.518
+Third line`;
+    const expectedOutput =
+`WEBVTT
+1
+00:00:03.298 --> 00:00:04.578
+First line
+2
+00:00:04.578 --> 00:00:06.178
+Second line
+3
+00:00:06.178 --> 00:00:07.518
+Third line
+`;
+    t.is(convertSrtToVtt(srtInput), expectedOutput);
+});