@aj-archipelago/cortex 1.3.28 → 1.3.30
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/pathways/system/entity/sys_entity_continue.js +6 -1
- package/pathways/system/entity/sys_generator_error.js +5 -2
- package/pathways/translate_subtitle.js +51 -24
- package/server/plugins/gemini15ChatPlugin.js +1 -1
- package/server/plugins/gemini15VisionPlugin.js +1 -1
- package/server/plugins/openAiWhisperPlugin.js +3 -8
- package/tests/subchunk.srt +1459 -0
- package/tests/translate_srt.test.js +386 -2
|
@@ -4,7 +4,8 @@ import { fileURLToPath } from 'url';
|
|
|
4
4
|
import { dirname } from 'path';
|
|
5
5
|
import fs from 'fs';
|
|
6
6
|
import path from 'path';
|
|
7
|
-
import { SubtitleUtils } from '@aj-archipelago/subvibe';
|
|
7
|
+
import { SubtitleUtils, parse } from '@aj-archipelago/subvibe';
|
|
8
|
+
import { selectBestTranslation, splitIntoOverlappingChunks } from '../pathways/translate_subtitle.js';
|
|
8
9
|
|
|
9
10
|
const __filename = fileURLToPath(import.meta.url);
|
|
10
11
|
const __dirname = dirname(__filename);
|
|
@@ -23,6 +24,21 @@ test.after.always('cleanup', async () => {
|
|
|
23
24
|
}
|
|
24
25
|
});
|
|
25
26
|
|
|
27
|
+
/**
 * Test double for translateChunk.
 *
 * Produces one output caption per input caption by shallow-copying the
 * caption (so fields such as `identifier` survive untouched) and replacing
 * only `text` with the original text prefixed by "Translated: ".
 *
 * @param {{ captions: Array<object> }} chunk - Chunk whose `captions` are mapped.
 * @param {object} args - Accepted for signature parity; not read by the mock.
 * @returns {Promise<Array<object>>} The copied captions with prefixed text.
 * @throws Rethrows any error raised while mapping, after logging it.
 */
async function mockTranslateChunk(chunk, args) {
  // Copy every property first, then override the text field only.
  const toTranslatedCaption = (caption) => ({
    ...caption,
    text: `Translated: ${caption.text}`,
  });

  try {
    const translatedCaptions = chunk.captions.map(toTranslatedCaption);
    return translatedCaptions;
  } catch (e) {
    console.error(`Error in mock translate chunk: ${e.message}`);
    throw e;
  }
}
|
|
41
|
+
|
|
26
42
|
async function testSubtitleTranslation(t, text, language = 'English', format = 'srt') {
|
|
27
43
|
const response = await testServer.executeOperation({
|
|
28
44
|
query: 'query translate_subtitle($text: String!, $to: String, $format: String) { translate_subtitle(text: $text, to: $to, format: $format) { result } }',
|
|
@@ -170,4 +186,372 @@ test('test subtitle translation with horizontal SRT file', async t => {
|
|
|
170
186
|
t.timeout(400000);
|
|
171
187
|
const text = fs.readFileSync(path.join(__dirname, 'subhorizontal.srt'), 'utf8');
|
|
172
188
|
await testSubtitleTranslation(t, text, 'Turkish', 'srt');
|
|
173
|
-
});
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
/**
 * Stand-in for callPathway that only understands "translate_subtitle_helper".
 *
 * For that pathway, every newline-separated line of `params.text` is prefixed
 * with "Translated: " and the re-joined text is wrapped in a
 * <SUBTITLES>…</SUBTITLES> envelope. Any other pathway name is rejected.
 *
 * @param {string} pathwayName - Name of the pathway being invoked.
 * @param {{ text: string }} params - Pathway arguments; only `text` is read.
 * @returns {Promise<string>} The wrapped mock translation.
 * @throws {Error} When `pathwayName` is not "translate_subtitle_helper".
 */
const mockCallPathway = async (pathwayName, params) => {
  if (pathwayName !== "translate_subtitle_helper") {
    throw new Error(`Mock callPathway: Unhandled pathway ${pathwayName}`);
  }

  const sourceLines = params.text.split("\n");
  const prefixedLines = sourceLines.map((line) => `Translated: ${line}`);
  const mockCaptions = prefixedLines.join("\n");

  return `<SUBTITLES>${mockCaptions}</SUBTITLES>`;
};
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
test("translationMap is built correctly with multiple chunks", async (t) => {
  // Fifty synthetic captions; each identifier is the caption's index as a string.
  const sampleCaptions = [];
  for (let i = 0; i < 50; i++) {
    sampleCaptions.push({
      identifier: i.toString(),
      text: `Caption ${i}`,
      index: i,
    });
  }

  // Chunking is done by the real module function, not a mock.
  const chunks = splitIntoOverlappingChunks(sampleCaptions);
  t.true(chunks.length > 1, "Should create multiple chunks");

  // Arguments handed to the mock translator for every chunk.
  const mockArgs = { format: "srt", to: "Spanish" };

  // Translate all chunks concurrently with the simplified mock.
  const translatedChunks = await Promise.all(
    chunks.map((chunk) => mockTranslateChunk(chunk, mockArgs))
  );

  // Group every translated caption under its identifier.
  const translationMap = new Map();
  for (const caption of translatedChunks.flat()) {
    const existing = translationMap.get(caption.identifier);
    if (existing === undefined) {
      translationMap.set(caption.identifier, [caption]);
    } else {
      existing.push(caption);
    }
  }

  // Debug output
  console.log(`Translation map size: ${translationMap.size}`);

  // Dump a small sample when nothing was grouped, to aid diagnosis.
  if (translationMap.size === 0) {
    console.log("Sample of translated chunks:", translatedChunks[0].slice(0, 3));
    console.log("First few captions from sample:", sampleCaptions.slice(0, 3));
  }

  t.truthy(translationMap, "Translation map should be created");

  // Every source caption must be represented in the map.
  for (const caption of sampleCaptions) {
    const hasEntry = translationMap.has(caption.identifier);
    if (!hasEntry) {
      console.log(`Missing entry for caption: ${caption.identifier}`);
    }
    t.true(
      hasEntry,
      `Translation map should have entry for caption ${caption.identifier}`
    );
  }

  // Captions that landed in more than one chunk have more than one translation.
  const overlappingCaptions = [...translationMap.values()].filter(
    (translations) => translations.length > 1
  ).length;

  t.true(
    overlappingCaptions > 0,
    "Some captions should have multiple translations due to chunk overlap"
  );
});
|
|
277
|
+
|
|
278
|
+
test("selectBestTranslation picks the best translation based on proximity to target", (t) => {
  // Candidate translations positioned at indexes 10, 15, 20 and 25.
  const translations = [10, 15, 20, 25].map((position, i) => ({
    identifier: String(position),
    text: `Translation ${i + 1}`,
    index: position,
  }));

  // Cases 1-5: [target start, target end, expected text, assertion message].
  const proximityCases = [
    [10, 14, "Translation 1", "Should select translation closest to target position 10-14"],
    [15, 19, "Translation 2", "Should select translation closest to target position 15-19"],
    [20, 24, "Translation 3", "Should select translation closest to target position 20-24"],
    [15, 15, "Translation 2", "Should select exact matching translation"],
    [17, 23, "Translation 3", "Should select translation closest to midpoint of target 17-23"],
  ];
  for (const [start, end, expectedText, message] of proximityCases) {
    const best = selectBestTranslation(translations, start, end);
    t.is(best.text, expectedText, message);
  }

  // Case 6: a lone candidate is chosen no matter where the target lies.
  const onlyCandidate = [{ identifier: "10", text: "Only translation", index: 10 }];
  t.is(
    selectBestTranslation(onlyCandidate, 30, 30).text,
    "Only translation",
    "With single translation, should select it regardless of target"
  );

  // Case 7: a candidate without an identifier is still selectable.
  const mixedTranslations = [
    { text: "No identifier", index: 5 },
    { identifier: "10", text: "With identifier", index: 10 },
  ];
  t.is(
    selectBestTranslation(mixedTranslations, 4, 6).text,
    "No identifier",
    "Should use index when identifier is missing"
  );

  // Case 8: nothing to choose from.
  t.is(
    selectBestTranslation([], 10, 10),
    null,
    "Should return null for empty translations array"
  );

  // Case 9: non-array inputs.
  t.is(
    selectBestTranslation(null, 10, 10),
    null,
    "Should handle null input gracefully"
  );
  t.is(
    selectBestTranslation(undefined, 10, 10),
    null,
    "Should handle undefined input gracefully"
  );
});
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
/**
 * End-to-end check of translate_subtitle on tests/subchunk.srt (English -> Arabic).
 *
 * Verifies, in order:
 *   1. the operation returns no GraphQL errors and a string result at least
 *      half as long as the input;
 *   2. the translated SRT parses to the same number of cues as the original;
 *   3. more than 95% of cues differ from their source line AND contain Arabic
 *      script characters (U+0600-U+06FF);
 *   4. fewer than 5% of translated cues are textually identical to any
 *      original cue (i.e. left untranslated or shifted);
 *   5. fewer than 15% of translated cues are repeats of an earlier
 *      translated cue;
 *   6. every "HH:MM:SS,mmm --> HH:MM:SS,mmm" timestamp line is preserved.
 *
 * Text comparisons are case-insensitive and whitespace-trimmed.
 */
test("subtitle translation with translation coverage verification", async (t) => {
  t.timeout(400000); // Long timeout for potentially large file
  const text = fs.readFileSync(path.join(__dirname, "subchunk.srt"), "utf8");

  const response = await testServer.executeOperation({
    query:
      "query translate_subtitle($text: String!, $to: String, $format: String) { translate_subtitle(text: $text, to: $to, format: $format) { result } }",
    variables: {
      to: "Arabic",
      text,
      format: "srt",
    },
  });

  t.falsy(response.body?.singleResult?.errors);

  const result = response.body?.singleResult?.data?.translate_subtitle?.result;

  t.log(`Result: ${result}`);

  // Everything below parses and pattern-matches the result; without a string
  // there is nothing meaningful to verify, so fail explicitly instead of
  // crashing later with an unrelated TypeError.
  if (typeof result !== "string") {
    t.fail("translate_subtitle should return a string result");
    return;
  }

  t.true(result.length > text.length * 0.5);

  // Parse both original and translated subtitles
  const originalSubs = parse(text, { format: "srt" });
  const translatedSubs = parse(result, { format: "srt" });

  // Ensure we have the same number of cues/captions
  t.is(
    originalSubs.cues.length,
    translatedSubs.cues.length,
    "Should have same number of captions"
  );

  const normalize = (value) => value.toLowerCase().trim();

  let translatedCount = 0;
  let nonArabicCount = 0;
  let exactMatchCount = 0;

  // Normalized original texts, held in a Set so the per-cue membership check
  // below is O(1) rather than a scan over every original cue.
  const allOriginalTexts = new Set(
    originalSubs.cues.map((cue) => normalize(cue.text))
  );

  // Normalized translated texts seen so far, plus total occurrence counts for
  // any text seen more than once.
  const translatedTextsSet = new Set();
  const duplicateTranslations = new Map();

  // Regular expression to match Arabic characters (Unicode range for Arabic script)
  const arabicRegex = /[\u0600-\u06FF]/;

  translatedSubs.cues.forEach((cue, index) => {
    // A surplus translated cue has no source line to compare against; the
    // cue-count assertion above already reports the mismatch.
    const originalCue = originalSubs.cues[index];
    if (!originalCue) return;

    const originalText = originalCue.text;
    const translatedText = cue.text;

    // Skip empty lines
    if (!originalText.trim()) return;

    const normalizedTranslated = normalize(translatedText);

    // Translated means "different from its own source line".
    const isDifferent = normalizedTranslated !== normalize(originalText);

    // Identical to ANY original line (not just its own) counts as an exact match.
    const isExactMatchWithAny = allOriginalTexts.has(normalizedTranslated);

    // Track duplicate translations; the first repeat records 2 occurrences.
    if (translatedTextsSet.has(normalizedTranslated)) {
      duplicateTranslations.set(
        normalizedTranslated,
        (duplicateTranslations.get(normalizedTranslated) ?? 1) + 1
      );
    } else {
      translatedTextsSet.add(normalizedTranslated);
    }

    if (isExactMatchWithAny) {
      exactMatchCount++;
      console.log(
        `Line ${index + 1} matches an original line: "${originalText}" => "${translatedText}"`
      );
    }

    const containsArabic = arabicRegex.test(translatedText);

    if (isDifferent && containsArabic) {
      translatedCount++;
    } else if (isDifferent) {
      nonArabicCount++;
      console.log(
        `Line ${index + 1} translated but not to Arabic: "${originalText}" => "${translatedText}"`
      );
    } else {
      console.log(
        `Line ${index + 1} not translated: "${originalText}" => "${translatedText}"`
      );
    }
  });

  // Percentages are always relative to the number of original cues.
  const totalCaptions = originalSubs.cues.length;
  const toPercent = (count) => ((count / totalCaptions) * 100).toFixed(2);

  // Log translation statistics
  console.log(
    `Translation coverage: ${translatedCount}/${totalCaptions} (${toPercent(translatedCount)}%)`
  );
  console.log(
    `Lines with non-Arabic translation: ${nonArabicCount}/${totalCaptions} (${toPercent(nonArabicCount)}%)`
  );
  console.log(
    `Lines that exactly match some original line: ${exactMatchCount}/${totalCaptions} (${toPercent(exactMatchCount)}%)`
  );

  // Each map value is a total occurrence count, so subtracting one per entry
  // leaves only the repeats beyond the first occurrence.
  const duplicateCount =
    [...duplicateTranslations.values()].reduce((a, b) => a + b, 0) -
    duplicateTranslations.size;
  console.log(
    `Duplicate translations: ${duplicateCount}/${totalCaptions} (${toPercent(duplicateCount)}%)`
  );

  // If there are many duplicates, log the most common ones for debugging
  if (duplicateCount > totalCaptions * 0.05) {
    console.log("Most common duplicate translations:");
    [...duplicateTranslations.entries()]
      .sort((a, b) => b[1] - a[1]) // Sort by frequency, highest first
      .slice(0, 5) // Top 5 duplicates
      .forEach(([duplicateText, count]) => {
        console.log(`"${duplicateText}" appears ${count} times`);
      });
  }

  // Ensure at least 95% of lines are translated to Arabic
  const arabicTranslationCoverage = translatedCount / totalCaptions;
  t.true(
    arabicTranslationCoverage > 0.95,
    `At least 95% of lines should be translated to Arabic (actual: ${(
      arabicTranslationCoverage * 100
    ).toFixed(2)}%)`
  );

  // Ensure that no more than 5% of lines exactly match any original line
  const exactMatchPercentage = exactMatchCount / totalCaptions;
  t.true(
    exactMatchPercentage < 0.05,
    `No more than 5% of lines should match original text (actual: ${(
      exactMatchPercentage * 100
    ).toFixed(2)}%)`
  );

  // Distinct English inputs should mostly give distinct Arabic outputs; some
  // repetition is tolerated for very short phrases like "Yes" or "Thank you".
  const duplicatePercentage = duplicateCount / totalCaptions;
  t.true(
    duplicatePercentage < 0.15, // Allow up to 15% duplicate translations
    `No more than 15% of lines should be duplicate translations (actual: ${(
      duplicatePercentage * 100
    ).toFixed(2)}%)`
  );

  // Check timestamps are preserved
  const timestampPattern =
    /\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/g;
  const originalTimestamps = text.match(timestampPattern);
  const translatedTimestamps = result.match(timestampPattern);

  t.deepEqual(
    originalTimestamps,
    translatedTimestamps,
    "Timestamps should be preserved exactly"
  );
});
|