npm - @aj-archipelago/cortex - Versions diffs - 1.1.37 → 1.2.0 - Mend

@aj-archipelago/cortex 1.1.37 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (19) hide show

package/config.js +60 -0
package/package.json +1 -1
package/pathways/flux_image.js +2 -1
package/pathways/index.js +6 -1
package/pathways/sys_parse_numbered_object_list.js +19 -0
package/pathways/sys_repair_json.js +17 -0
package/server/chunker.js +156 -113
package/server/modelExecutor.js +9 -1
package/server/parser.js +18 -36
package/server/pathwayResolver.js +1 -1
package/server/pathwayResponseParser.js +3 -3
package/server/plugins/azureCognitivePlugin.js +1 -1
package/server/plugins/azureVideoTranslatePlugin.js +163 -0
package/server/plugins/openAiVisionPlugin.js +0 -3
package/server/plugins/{runwareAIPlugin.js → runwareAiPlugin.js} +1 -1
package/tests/chunkfunction.test.js +270 -4
package/tests/main.test.js +0 -55
package/tests/parser.test.js +255 -0
package/tests/translate_srt.test.js +82 -0

package/server/plugins/azureVideoTranslatePlugin.js ADDED Viewed

@@ -0,0 +1,163 @@
+// AzureVideoTranslatePlugin.js
+import ModelPlugin from "./modelPlugin.js";
+import logger from "../../lib/logger.js";
+import axios from "axios";
+import { publishRequestProgress } from "../../lib/redisSubscription.js";
+import { config } from "../../config.js";
+function isValidJSON(str) {
+    try {
+      JSON.parse(str);
+      return true;
+    } catch (e) {
+      return false;
+    }
+}
+class AzureVideoTranslatePlugin extends ModelPlugin {
+    constructor(pathway, model) {
+        super(pathway, model);
+        this.apiUrl = config.get("azureVideoTranslationApiUrl");
+        this.eventSource = null;
+        this.jsonBuffer = '';
+        this.jsonDepth = 0;
+        this.currentStep = 0;
+        this.totalNumOfSteps = 30;
+    }
+    getRequestParameters(_, parameters, __) {
+        const excludedParameters = [
+            'text', 'parameters', 'prompt', 'promptParameters', 'previousResult', 'stream'
+        ];
+        return Object.fromEntries(
+            Object.entries(parameters).filter(([key, value]) =>
+                !excludedParameters.includes(key) &&
+                value !== '' &&
+                typeof value !== 'undefined'
+            )
+        );
+    }
+    handleStream(stream, onData, onEnd, onError) {
+        const timeout = setTimeout(() => {
+            onError(new Error('Stream timeout'));
+        }, 300000); // timeout
+        stream.on('data', (chunk) => {
+            clearTimeout(timeout);
+            const lines = chunk.toString().split('\n\n');
+            lines.forEach(line => {
+                if (line.startsWith('data: ')) {
+                    const eventData = line.slice(6);
+                    try {
+                        this.handleEvent({ data: eventData }, onData);
+                    } catch (error) {
+                        onError(error);
+                    }
+                }
+            });
+        });
+        stream.on('end', () => {
+            clearTimeout(timeout);
+            this.cleanup();
+            onEnd();
+        });
+        stream.on('error', (error) => {
+            clearTimeout(timeout);
+            console.error('Stream error:', error);
+            this.cleanup();
+            onError(error);
+        });
+    }
+    handleEvent(event, onData) {
+        const data = event.data;
+        this.jsonBuffer += data;
+        this.jsonDepth += (data.match(/{/g) || []).length - (data.match(/}/g) || []).length;
+        if (this.jsonDepth === 0 && this.jsonBuffer.trim()) {
+            console.log(this.jsonBuffer);
+            if (this.jsonBuffer.includes('Failed to run with exception')) {
+                this.cleanup();
+                throw new Error(this.jsonBuffer);
+            }
+            onData(this.jsonBuffer);
+            this.jsonBuffer = '';
+            this.jsonDepth = 0;
+        }
+    }
+    async execute(text, parameters, prompt, cortexRequest) {
+        if (!this.apiUrl) {
+            throw new Error("API URL is not set");
+        }
+        this.requestId = cortexRequest.requestId;
+        const requestParameters = this.getRequestParameters(text, parameters, prompt);
+        try {
+            const response = await axios.post(this.apiUrl, requestParameters, {
+                responseType: 'stream',
+                headers: {
+                    'Cache-Control': 'no-cache',
+                    'Pragma': 'no-cache',
+                    'Expires': '0',
+                }
+            });
+            return new Promise((resolve, reject) => {
+                let finalJson = '';
+                this.handleStream(response.data,
+                    (data) => {
+                        this.currentStep++;
+                        publishRequestProgress({
+                            requestId: this.requestId,
+                            progress: this.currentStep / this.totalNumOfSteps,
+                            // data: this.jsonBuffer,
+                            info: data
+                        });
+                        if (isValidJSON(data)) {
+                            finalJson = data;
+                        }
+                    },
+                    () => {
+                        // console.log('Full data:', fullData);
+                        resolve(finalJson)
+                    },
+                    (error) => reject(error)
+                );
+            }).finally(() => this.cleanup());
+        } catch (error) {
+            this.cleanup();
+            return error;
+        }
+    }
+    parseResponse(data) {
+        const response = typeof data === 'object' ? JSON.stringify(data) : data;
+        publishRequestProgress({
+            requestId: this.requestId,
+            progress: 1,
+            data: response,
+        });
+        return response;
+    }
+    logRequestData(data, responseData, prompt) {
+        logger.verbose(`Request: ${JSON.stringify(data)}`);
+        logger.verbose(`Response: ${this.parseResponse(responseData)}`);
+        if (prompt?.debugInfo) {
+            prompt.debugInfo += `\nRequest: ${JSON.stringify(data)}`;
+            prompt.debugInfo += `\nResponse: ${this.parseResponse(responseData)}`;
+        }
+    }
+    cleanup() {
+        if (this.eventSource) {
+            this.eventSource.close();
+            this.eventSource = null;
+        }
+    }
+}
+export default AzureVideoTranslatePlugin;

package/server/plugins/openAiVisionPlugin.js CHANGED Viewed

@@ -22,9 +22,6 @@ class OpenAIVisionPlugin extends OpenAIChatPlugin {
                 if (message.role === "tool") {
                     return message;
                 }
-                if (typeof message.content === 'string') {
-                    message.content = safeJsonParse(message.content);
-                }
                 if (Array.isArray(message.content)) {
                     message.content = message.content.map(item => {
                         const parsedItem = safeJsonParse(item);

package/server/plugins/{runwareAIPlugin.js → runwareAiPlugin.js} RENAMED Viewed

@@ -57,7 +57,7 @@ class RunwareAiPlugin extends ModelPlugin {
     return this.executeRequest(cortexRequest);
   }
-  // Parse the response from the Azure Translate API
+  // Parse the response from the Runware API
   parseResponse(data) {
     if (data.data) {
       return JSON.stringify(data.data);

package/tests/chunkfunction.test.js CHANGED Viewed

@@ -87,9 +87,9 @@ test('should chunk text between html elements if needed', async t => {
     t.is(chunks.length, 4);
     t.is(chunks[0], htmlChunkTwo);
-    t.is(chunks[1], 'Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia curae');
-    t.is(encode(chunks[1]).length, chunkSize);
-    t.is(chunks[2], '; Fusce at dignissim quam.');
+    t.is(chunks[1], 'Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia ');
+    t.true(encode(chunks[1]).length < chunkSize);
+    t.is(chunks[2], 'curae; Fusce at dignissim quam.');
     t.is(chunks[3], htmlChunkTwo);
 });
@@ -221,4 +221,270 @@ test('should correctly split text into single token chunks', t => {
     // Check specific tokens (this may need adjustment based on your tokenizer)
     t.deepEqual(chunks, ['Hello', ',', ' world', '!']);
-});
+});
+test('should respect sentence boundaries when possible', t => {
+    const text = 'First sentence. Second sentence. Third sentence.';
+    const maxChunkToken = encode('First sentence. Second').length;
+    const chunks = getSemanticChunks(text, maxChunkToken);
+    t.is(chunks[0], 'First sentence.');
+    t.is(chunks[1], ' Second sentence.');
+    t.is(chunks[2], ' Third sentence.');
+});
+test('should respect paragraph boundaries', t => {
+    const text = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.';
+    const maxChunkToken = encode('First paragraph.\n\nSecond').length;
+    const chunks = getSemanticChunks(text, maxChunkToken);
+    t.is(chunks[0], 'First paragraph.\n\n');
+    t.is(chunks[1], 'Second paragraph.\n\n');
+    t.is(chunks[2], 'Third paragraph.');
+});
+test('should handle lists appropriately', t => {
+    const text = '1. First item\n2. Second item\n3. Third item';
+    const maxChunkToken = encode('1. First item\n2.').length;
+    const chunks = getSemanticChunks(text, maxChunkToken);
+    t.is(chunks[0], '1. First item\n');
+    t.is(chunks[1], '2. Second item\n');
+    t.is(chunks[2], '3. Third item');
+});
+test('should keep related punctuation together', t => {
+    const text = 'Question? Answer! Ellipsis... Done.';
+    const maxChunkToken = 5; // Small chunk size to force splits
+    const chunks = getSemanticChunks(text, maxChunkToken);
+    // Ensure question mark stays with "Question"
+    t.true(chunks.some(chunk => chunk.includes('Question?')));
+    // Ensure exclamation mark stays with "Answer"
+    t.true(chunks.some(chunk => chunk.includes('Answer!')));
+    // Ensure ellipsis stays together
+    t.true(chunks.some(chunk => chunk.includes('...')));
+});
+test('should handle empty strings appropriately', t => {
+    const chunks = getSemanticChunks('', 100);
+    t.deepEqual(chunks, []);
+});
+test('should handle strings with only whitespace', t => {
+    const text = '    \n\n   \t   \n    ';
+    const chunks = getSemanticChunks(text, 100);
+    t.is(chunks.join(''), text);
+});
+test('should handle special characters and emoji correctly', t => {
+    const text = '👋 Hello! Special chars: §±@#$%^&* and more 🌟';
+    const maxChunkToken = 10;
+    const chunks = getSemanticChunks(text, maxChunkToken);
+    t.true(chunks.length > 0);
+    t.is(chunks.join(''), text);
+});
+test('should handle code-like content appropriately', t => {
+    const text = 'const x = 42;\nfunction test() {\n    return x;\n}';
+    const maxChunkToken = 20;
+    const chunks = getSemanticChunks(text, maxChunkToken);
+    // Code blocks should preferably break at logical points
+    t.true(chunks.some(chunk => chunk.includes('const x = 42;')));
+    t.true(chunks.join('').includes('function test() {'));
+});
+test('should handle extremely large token sizes gracefully', t => {
+    const maxChunkToken = Number.MAX_SAFE_INTEGER;
+    const chunks = getSemanticChunks(testText, maxChunkToken);
+    t.is(chunks.length, 1);
+    t.is(chunks[0], testText);
+});
+test('should throw error for invalid maxChunkToken values', t => {
+    t.throws(() => getSemanticChunks(testText, 0), { message: /invalid/i });
+    t.throws(() => getSemanticChunks(testText, -1), { message: /invalid/i });
+    t.throws(() => getSemanticChunks(testText, NaN), { message: /invalid/i });
+});
+test('should handle Arabic text correctly', t => {
+    const arabicText = 'مرحبا بالعالم. هذه جملة عربية. وهذه جملة أخرى!';
+    const maxChunkToken = encode('مرحبا بالعالم. هذه !').length;
+    const chunks = getSemanticChunks(arabicText, maxChunkToken);
+    // Check that chunks respect Arabic sentence boundaries
+    t.true(chunks[0].endsWith('.'));
+    t.is(chunks.join(''), arabicText);
+});
+test('should handle mixed RTL and LTR text', t => {
+    const mixedText = 'Hello مرحبا World عالم! Testing اختبار.';
+    const maxChunkToken = 10;
+    const chunks = getSemanticChunks(mixedText, maxChunkToken);
+    t.true(chunks.length > 0);
+    t.is(chunks.join(''), mixedText);
+});
+test('should handle Chinese text correctly', t => {
+    const chineseText = '你好世界。这是一个测试。我们在测试中文分段。';
+    const maxChunkToken = encode('你好世界。').length;
+    const chunks = getSemanticChunks(chineseText, maxChunkToken);
+    // Check that chunks respect Chinese sentence boundaries
+    t.true(chunks[0].endsWith('。'));
+    t.is(chunks.join(''), chineseText);
+});
+test('should handle mixed scripts appropriately', t => {
+    const mixedText = 'Hello World! مرحبا بالعالم! 你好世界! Bonjour le monde!';
+    const maxChunkToken = 15;
+    const chunks = getSemanticChunks(mixedText, maxChunkToken);
+    t.true(chunks.length > 0);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), mixedText);
+});
+test('should handle text with combining diacritical marks', t => {
+    const textWithDiacritics = 'é è ê ë ā ă ą ḥ ḫ ṭ ﻋَﺮَﺑِﻲ';
+    const maxChunkToken = 5;
+    const chunks = getSemanticChunks(textWithDiacritics, maxChunkToken);
+    t.true(chunks.length > 0);
+    t.is(chunks.join(''), textWithDiacritics);
+});
+test('should handle Arabic text with various sentence structures', t => {
+    const arabicText = `السَّلامُ عَلَيْكُمْ وَرَحْمَةُ اللهِ وَبَرَكَاتُهُ!
+    هَذَا نَصٌّ طَوِيلٌ لِاخْتِبَارِ التَّقْسِيمِ. يَحْتَوِي عَلَى عِدَّةِ جُمَلٍ؟ وَيَشْمَلُ عَلَامَاتِ التَّرْقِيمِ!
+    نَصٌّ مَعَ أَرْقَامٍ: 123 و ٤٥٦ و ٧٨٩.`;
+    const maxChunkToken = 20;
+    const chunks = getSemanticChunks(arabicText, maxChunkToken);
+    t.true(chunks.length > 1);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), arabicText);
+});
+test('should handle Arabic text with Quranic diacritics', t => {
+    const quranText = 'بِسْمِ ٱللَّهِ ٱلرَّحْمَٰنِ ٱلرَّحِيمِ ۝ ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ';
+    const maxChunkToken = 15;
+    const chunks = getSemanticChunks(quranText, maxChunkToken);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), quranText);
+});
+test('should handle Arabic text with mixed numbers and punctuation', t => {
+    const mixedArabicText = 'العام الدراسي 2023-2024م. سيبدأ في ١٥ سبتمبر! (إن شاء الله)';
+    const maxChunkToken = 10;
+    const chunks = getSemanticChunks(mixedArabicText, maxChunkToken);
+    t.true(chunks.length > 1);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), mixedArabicText);
+});
+test('should handle Arabic text with HTML', t => {
+    const arabicHtml = '<p>مرحباً <strong>بالعالم</strong> العربي!</p>';
+    const maxChunkToken = encode(arabicHtml).length;
+    const chunks = getSemanticChunks(arabicHtml, maxChunkToken, 'html');
+    t.is(chunks.length, 1);
+    t.is(chunks[0], arabicHtml);
+});
+test('should respect Arabic paragraph breaks', t => {
+    const arabicParagraphs = `الفقرة الأولى تحتوي على معلومات مهمة.
+    الفقرة الثانية تكمل الموضوع.
+    الفقرة الثالثة تختم الكلام.`;
+    const maxChunkToken = encode('الفقرة الأولى تحتوي على معلومات مهمة.').length;
+    const chunks = getSemanticChunks(arabicParagraphs, maxChunkToken);
+    t.true(chunks.some(chunk => chunk.includes('الفقرة الأولى')));
+    t.true(chunks.some(chunk => chunk.includes('الفقرة الثانية')));
+    t.true(chunks.some(chunk => chunk.includes('الفقرة الثالثة')));
+});
+test('should handle very large text (50x) efficiently', async t => {
+    const largeText = Array(50).fill(testText).join('\n');
+    t.log('Size of very large text:', largeText.length, 'bytes');
+    const startTime = performance.now();
+    const maxChunkToken = 1000;
+    const chunks = getSemanticChunks(largeText, maxChunkToken);
+    const endTime = performance.now();
+    const processingTime = endTime - startTime;
+    t.true(chunks.length > 0);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), largeText);
+    // Processing should take less than 1 second for this size
+    t.true(processingTime < 1000, `Processing took ${processingTime}ms`);
+});
+test('should handle extremely large text (500x) efficiently', async t => {
+    const largeText = Array(500).fill(testText).join('\n');
+    t.log('Size of extremely large text:', largeText.length, 'bytes');
+    const startTime = performance.now();
+    const maxChunkToken = 1000;
+    const chunks = getSemanticChunks(largeText, maxChunkToken);
+    const endTime = performance.now();
+    const processingTime = endTime - startTime;
+    t.true(chunks.length > 0);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), largeText);
+    // Processing should take less than 5 seconds for this size
+    t.true(processingTime < 5000, `Processing took ${processingTime}ms`);
+});
+test('should handle massive text (5000x) efficiently', async t => {
+    const largeText = Array(5000).fill(testText).join('\n');
+    t.log('Size of massive text:', largeText.length, 'bytes');
+    const startTime = performance.now();
+    const maxChunkToken = 1000;
+    const chunks = getSemanticChunks(largeText, maxChunkToken);
+    const endTime = performance.now();
+    const processingTime = endTime - startTime;
+    t.true(chunks.length > 0);
+    t.true(chunks.every(chunk => encode(chunk).length <= maxChunkToken));
+    t.is(chunks.join(''), largeText);
+    // Processing should take less than 30 seconds for this size
+    t.true(processingTime < 30000, `Processing took ${processingTime}ms`);
+});
+test('should maintain memory efficiency with huge texts', async t => {
+    const initialMemory = process.memoryUsage().heapUsed;
+    const largeText = Array(1000).fill(testText).join('\n');
+    const maxChunkToken = 1000;
+    const chunks = getSemanticChunks(largeText, maxChunkToken);
+    const finalMemory = process.memoryUsage().heapUsed;
+    const memoryIncrease = (finalMemory - initialMemory) / 1024 / 1024; // Convert to MB
+    t.true(chunks.length > 0);
+    // Memory increase should be reasonable (less than 100MB for this test)
+    t.true(memoryIncrease < 100, `Memory increase was ${memoryIncrease.toFixed(2)}MB`);
+});

package/tests/main.test.js CHANGED Viewed

@@ -357,61 +357,6 @@ test('test translate endpoint with huge arabic text english translation and chec
 });
-async function testTranslateSrt(t, text, language='English') {
-    const response = await testServer.executeOperation({
-        query: 'query translate_subtitle($text: String!, $to:String) { translate_subtitle(text: $text, to:$to) { result } }',
-        variables: {
-            to: language,
-            text
-         },
-    });
-    t.falsy(response.body?.singleResult?.errors);
-    const result = response.body?.singleResult?.data?.translate_subtitle?.result;
-    t.true(result?.length > text.length*0.5);
-    //check all timestamps are still there and not translated
-    const originalTimestamps = text.match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/g);
-    const translatedTimestamps = result.match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/g);
-    t.deepEqual(originalTimestamps, translatedTimestamps, 'All timestamps should be present and unchanged');
-    const originalLineCount = text.split('\n').length;
-    const translatedLineCount = result.split('\n').length;
-    t.is(originalLineCount, translatedLineCount, 'Total number of lines should be the same');
-}
-test('test translate_srt endpoint with simple srt', async t => {
-    const text = `1
-00:00:03,069 --> 00:00:04,771
-Who’s that?
-2
-00:00:04,771 --> 00:00:06,039
-Aseel.
-3
-00:00:06,039 --> 00:00:07,474
-Who is Aseel a mom to?
-4
-00:00:07,474 --> 00:00:09,376
-Aseel is mommy
-`;
-    await testTranslateSrt(t, text, 'Spanish');
-});
-test('test translate_srt endpoint with long srt file', async t => {
-    t.timeout(400000);
-    const text = fs.readFileSync(path.join(__dirname, 'sublong.srt'), 'utf8');
-    await testTranslateSrt(t, text, 'English');
-});
-test('test translate_srt endpoint with horizontal srt file', async t => {
-    t.timeout(400000);
-    const text = fs.readFileSync(path.join(__dirname, 'subhorizontal.srt'), 'utf8');
-    await testTranslateSrt(t, text, 'Turkish');
-});