@aj-archipelago/cortex 1.3.28 → 1.3.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,7 +4,8 @@ import { fileURLToPath } from 'url';
4
4
  import { dirname } from 'path';
5
5
  import fs from 'fs';
6
6
  import path from 'path';
7
- import { SubtitleUtils } from '@aj-archipelago/subvibe';
7
+ import { SubtitleUtils, parse } from '@aj-archipelago/subvibe';
8
+ import { selectBestTranslation, splitIntoOverlappingChunks } from '../pathways/translate_subtitle.js';
8
9
 
9
10
  const __filename = fileURLToPath(import.meta.url);
10
11
  const __dirname = dirname(__filename);
@@ -23,6 +24,21 @@ test.after.always('cleanup', async () => {
23
24
  }
24
25
  });
25
26
 
27
/**
 * Test double for translateChunk.
 *
 * Produces one output caption per input caption by shallow-copying the caption
 * (so fields such as `identifier` survive untouched) and replacing only `text`
 * with the original text prefixed by "Translated: ".
 *
 * @param {{captions: Array<object>}} chunk - Chunk whose `captions` array is mapped.
 * @param {object} args - Accepted for signature parity with the real function; not read.
 * @returns {Promise<Array<object>>} Copies of the captions with prefixed text.
 * @throws Re-throws any error raised while mapping, after logging its message.
 */
async function mockTranslateChunk(chunk, args) {
  // Copy first, then overwrite `text`, so every other property is carried over as-is.
  const withTranslatedText = (caption) => {
    const copy = { ...caption };
    copy.text = `Translated: ${caption.text}`;
    return copy;
  };

  let translated;
  try {
    translated = chunk.captions.map(withTranslatedText);
  } catch (e) {
    console.error(`Error in mock translate chunk: ${e.message}`);
    throw e;
  }
  return translated;
}
41
+
26
42
  async function testSubtitleTranslation(t, text, language = 'English', format = 'srt') {
27
43
  const response = await testServer.executeOperation({
28
44
  query: 'query translate_subtitle($text: String!, $to: String, $format: String) { translate_subtitle(text: $text, to: $to, format: $format) { result } }',
@@ -170,4 +186,372 @@ test('test subtitle translation with horizontal SRT file', async t => {
170
186
  t.timeout(400000);
171
187
  const text = fs.readFileSync(path.join(__dirname, 'subhorizontal.srt'), 'utf8');
172
188
  await testSubtitleTranslation(t, text, 'Turkish', 'srt');
173
- });
189
+ });
190
+
191
/**
 * Stand-in for callPathway that only understands "translate_subtitle_helper".
 *
 * For that pathway, every newline-separated line of `params.text` is prefixed
 * with "Translated: " and the re-joined text is wrapped in <SUBTITLES> tags.
 *
 * @param {string} pathwayName - Name of the pathway being invoked.
 * @param {{text: string}} params - Pathway parameters; only `text` is read.
 * @returns {Promise<string>} The wrapped mock translation.
 * @throws {Error} When any other pathway name is requested.
 */
const mockCallPathway = async (pathwayName, params) => {
  if (pathwayName !== "translate_subtitle_helper") {
    throw new Error(`Mock callPathway: Unhandled pathway ${pathwayName}`);
  }

  const translatedLines = [];
  for (const line of params.text.split("\n")) {
    translatedLines.push(`Translated: ${line}`);
  }

  return `<SUBTITLES>${translatedLines.join("\n")}</SUBTITLES>`;
};
206
+
207
+
208
/**
 * Verifies that grouping translated captions by identifier across overlapping
 * chunks yields one map entry per source caption, and that the chunk overlap
 * produces more than one candidate translation for at least some captions.
 */
test("translationMap is built correctly with multiple chunks", async (t) => {
  // 50 captions with unique string identifiers "0".."49".
  const sampleCaptions = Array.from({ length: 50 }, (_, i) => ({
    identifier: i.toString(),
    text: `Caption ${i}`,
    index: i,
  }));

  // Use the actual chunking function from the module under test.
  const chunks = splitIntoOverlappingChunks(sampleCaptions);
  t.true(chunks.length > 1, "Should create multiple chunks");

  // Passed through for signature parity; the mock does not read it.
  const mockArgs = {
    format: "srt",
    to: "Spanish",
  };

  const translatedChunks = await Promise.all(
    chunks.map((chunk) => mockTranslateChunk(chunk, mockArgs))
  );

  // Group every translated caption under its identifier.
  const translationMap = new Map();
  for (const caption of translatedChunks.flat()) {
    const candidates = translationMap.get(caption.identifier);
    if (candidates) {
      candidates.push(caption);
    } else {
      translationMap.set(caption.identifier, [caption]);
    }
  }

  // Debug output
  console.log(`Translation map size: ${translationMap.size}`);

  if (translationMap.size === 0) {
    // Optional chaining: with zero chunks there is no first chunk to sample.
    console.log("Sample of translated chunks:", translatedChunks[0]?.slice(0, 3));
    console.log("First few captions from sample:", sampleCaptions.slice(0, 3));
  }

  // A Map instance is always truthy, so assert on its contents instead:
  // the mock preserves identifiers, so there must be exactly one key per caption.
  t.is(
    translationMap.size,
    sampleCaptions.length,
    "Translation map should have exactly one entry per caption identifier"
  );

  // Check that every caption has an entry.
  sampleCaptions.forEach((caption) => {
    const hasEntry = translationMap.has(caption.identifier);
    if (!hasEntry) {
      console.log(`Missing entry for caption: ${caption.identifier}`);
    }
    t.true(
      hasEntry,
      `Translation map should have entry for caption ${caption.identifier}`
    );
  });

  // Count captions that were translated in more than one chunk.
  let overlappingCaptions = 0;
  translationMap.forEach((translations) => {
    if (translations.length > 1) {
      overlappingCaptions++;
    }
  });

  // Due to the chunk overlap, some captions should have multiple translations
  t.true(
    overlappingCaptions > 0,
    "Some captions should have multiple translations due to chunk overlap"
  );
});
277
+
278
/**
 * Exercises selectBestTranslation with candidate lists and target ranges and
 * checks which candidate (or null) comes back in each scenario.
 */
test("selectBestTranslation picks the best translation based on proximity to target", (t) => {
  // Candidate translations positioned at indices 10, 15, 20 and 25.
  const translations = [
    { identifier: "10", text: "Translation 1", index: 10 },
    { identifier: "15", text: "Translation 2", index: 15 },
    { identifier: "20", text: "Translation 3", index: 20 },
    { identifier: "25", text: "Translation 4", index: 25 },
  ];

  // Cases 1-5 share the candidate list above: [targetStart, targetEnd, expected text, message].
  const rangeCases = [
    [10, 14, "Translation 1", "Should select translation closest to target position 10-14"],
    [15, 19, "Translation 2", "Should select translation closest to target position 15-19"],
    [20, 24, "Translation 3", "Should select translation closest to target position 20-24"],
    [15, 15, "Translation 2", "Should select exact matching translation"],
    [17, 23, "Translation 3", "Should select translation closest to midpoint of target 17-23"],
  ];
  for (const [targetStart, targetEnd, expectedText, message] of rangeCases) {
    const best = selectBestTranslation(translations, targetStart, targetEnd);
    t.is(best.text, expectedText, message);
  }

  // Case 6: a lone candidate is returned even when far from the target.
  const onlyCandidate = selectBestTranslation(
    [{ identifier: "10", text: "Only translation", index: 10 }],
    30,
    30
  );
  t.is(
    onlyCandidate.text,
    "Only translation",
    "With single translation, should select it regardless of target"
  );

  // Case 7: one candidate lacks an identifier and only carries an index.
  const withoutIdentifier = selectBestTranslation(
    [
      { text: "No identifier", index: 5 },
      { identifier: "10", text: "With identifier", index: 10 },
    ],
    4,
    6
  );
  t.is(
    withoutIdentifier.text,
    "No identifier",
    "Should use index when identifier is missing"
  );

  // Case 8: empty candidate list.
  t.is(
    selectBestTranslation([], 10, 10),
    null,
    "Should return null for empty translations array"
  );

  // Case 9: null / undefined candidate list.
  t.is(
    selectBestTranslation(null, 10, 10),
    null,
    "Should handle null input gracefully"
  );
  t.is(
    selectBestTranslation(undefined, 10, 10),
    null,
    "Should handle undefined input gracefully"
  );
});
365
+
366
+
367
/**
 * End-to-end check of translate_subtitle on subchunk.srt (target: Arabic).
 *
 * Verifies that the caption count is preserved, that more than 95% of the
 * non-empty captions come back different from the source and containing Arabic
 * script, that fewer than 5% equal some source line, that fewer than 15% are
 * repeated translations, and that the SRT timestamps are unchanged.
 * Percentages are computed over the captions actually evaluated (non-empty
 * source captions), so skipped blank captions do not count against coverage.
 */
test("subtitle translation with translation coverage verification", async (t) => {
  t.timeout(400000); // Long timeout for potentially large file
  const text = fs.readFileSync(path.join(__dirname, "subchunk.srt"), "utf8");

  const response = await testServer.executeOperation({
    query:
      "query translate_subtitle($text: String!, $to: String, $format: String) { translate_subtitle(text: $text, to: $to, format: $format) { result } }",
    variables: {
      to: "Arabic",
      text,
      format: "srt",
    },
  });

  t.falsy(response.body?.singleResult?.errors);

  const result = response.body?.singleResult?.data?.translate_subtitle?.result;

  t.log(`Result: ${result}`);

  // Everything below parses and pattern-matches `result`; fail with a clear
  // message instead of a TypeError when the operation returned nothing usable.
  if (typeof result !== "string") {
    t.fail("translate_subtitle should return a subtitle string");
    return;
  }

  t.true(result.length > text.length * 0.5);

  // Parse both original and translated subtitles
  const originalSubs = parse(text, { format: "srt" });
  const translatedSubs = parse(result, { format: "srt" });

  // Ensure we have the same number of cues/captions
  t.is(
    originalSubs.cues.length,
    translatedSubs.cues.length,
    "Should have same number of captions"
  );

  const normalize = (value) => value.toLowerCase().trim();
  const formatShare = (count, total) =>
    `${count}/${total} (${((count / total) * 100).toFixed(2)}%)`;

  let evaluatedCount = 0; // non-empty source captions that were actually compared
  let untranslatedCount = 0;
  let translatedCount = 0;
  let nonArabicCount = 0;
  let exactMatchCount = 0;

  // Set of all normalized source texts for O(1) "matches any original line" checks.
  const originalTextSet = new Set(
    originalSubs.cues.map((cue) => normalize(cue.text))
  );

  // Normalized translated text -> number of captions that produced it.
  const translationOccurrences = new Map();

  // Regular expression to match Arabic characters (Unicode range for Arabic script)
  const arabicRegex = /[\u0600-\u06FF]/;

  translatedSubs.cues.forEach((cue, index) => {
    const originalCue = originalSubs.cues[index];
    // Extra translated cue with no source counterpart: the count mismatch is
    // already reported by the assertion above, so just skip it here.
    if (!originalCue) return;

    const originalText = originalCue.text;
    const translatedText = cue.text;

    // Skip empty lines
    if (!originalText.trim()) return;
    evaluatedCount++;

    const normalizedTranslated = normalize(translatedText);

    // Check if the text has been translated (different from original)
    const isDifferent = normalizedTranslated !== normalize(originalText);

    // Track how often each translated text occurs
    translationOccurrences.set(
      normalizedTranslated,
      (translationOccurrences.get(normalizedTranslated) ?? 0) + 1
    );

    // Exact match with ANY original line (not just its own line)
    if (originalTextSet.has(normalizedTranslated)) {
      exactMatchCount++;
      console.log(
        `Line ${index + 1} matches an original line: "${originalText}" => "${translatedText}"`
      );
    }

    // Check if it contains Arabic characters
    const containsArabic = arabicRegex.test(translatedText);

    if (isDifferent && containsArabic) {
      translatedCount++;
    } else if (isDifferent) {
      nonArabicCount++;
      console.log(
        `Line ${index + 1} translated but not to Arabic: "${originalText}" => "${translatedText}"`
      );
    } else {
      untranslatedCount++;
      console.log(
        `Line ${index + 1} not translated: "${originalText}" => "${translatedText}"`
      );
    }
  });

  t.true(evaluatedCount > 0, "Should have at least one non-empty caption to evaluate");

  // Texts produced by more than one caption, with their total occurrence count.
  const duplicateTranslations = [...translationOccurrences.entries()].filter(
    ([, count]) => count > 1
  );
  // Number of captions beyond the first occurrence of each repeated text.
  const duplicateCount = duplicateTranslations.reduce(
    (sum, [, count]) => sum + (count - 1),
    0
  );

  // Log translation statistics
  console.log(`Translation coverage: ${formatShare(translatedCount, evaluatedCount)}`);
  console.log(`Lines not translated: ${formatShare(untranslatedCount, evaluatedCount)}`);
  console.log(
    `Lines with non-Arabic translation: ${formatShare(nonArabicCount, evaluatedCount)}`
  );
  console.log(
    `Lines that exactly match some original line: ${formatShare(exactMatchCount, evaluatedCount)}`
  );
  console.log(`Duplicate translations: ${formatShare(duplicateCount, evaluatedCount)}`);

  // If there are many duplicates, log the most common ones for debugging
  if (duplicateCount > evaluatedCount * 0.05) { // More than 5% are duplicates
    console.log("Most common duplicate translations:");
    duplicateTranslations
      .sort((a, b) => b[1] - a[1]) // Sort by frequency, highest first
      .slice(0, 5) // Top 5 duplicates
      .forEach(([duplicateText, count]) => {
        console.log(`"${duplicateText}" appears ${count} times`);
      });
  }

  // Ensure at least 95% of lines are translated to Arabic
  const arabicTranslationCoverage = translatedCount / evaluatedCount;
  t.true(
    arabicTranslationCoverage > 0.95,
    `At least 95% of lines should be translated to Arabic (actual: ${(
      arabicTranslationCoverage * 100
    ).toFixed(2)}%)`
  );

  // Ensure that no more than 5% of lines exactly match any original line
  const exactMatchPercentage = exactMatchCount / evaluatedCount;
  t.true(
    exactMatchPercentage < 0.05,
    `No more than 5% of lines should match original text (actual: ${(
      exactMatchPercentage * 100
    ).toFixed(2)}%)`
  );

  // Ensure that duplicate translations are limited.
  // For a file with distinct English inputs, we'd expect distinct Arabic outputs;
  // allow some duplication for very simple phrases like "Yes" or "Thank you".
  const duplicatePercentage = duplicateCount / evaluatedCount;
  t.true(
    duplicatePercentage < 0.15, // Allow up to 15% duplicate translations
    `No more than 15% of lines should be duplicate translations (actual: ${(
      duplicatePercentage * 100
    ).toFixed(2)}%)`
  );

  // Check timestamps are preserved
  const timestampPattern =
    /\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}/g;
  const originalTimestamps = text.match(timestampPattern);
  const translatedTimestamps = result.match(timestampPattern);

  t.deepEqual(
    originalTimestamps,
    translatedTimestamps,
    "Timestamps should be preserved exactly"
  );
});