mtranserver 4.0.26 → 4.0.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@
6
6
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
7
  <meta name="darkreader-lock" />
8
8
  <title>MTranServer</title>
9
- <script type="module" crossorigin src="/ui/assets/index-bQLHUB4Q.js"></script>
9
+ <script type="module" crossorigin src="/ui/assets/index-DAzz1C6Q.js"></script>
10
10
  <link rel="stylesheet" crossorigin href="/ui/assets/index-D-fF9r3Z.css">
11
11
  </head>
12
12
  <body>
package/dist/index.js CHANGED
@@ -148,7 +148,7 @@ function getConfig() {
148
148
  enableOfflineMode: getBool("--offline", "MT_OFFLINE", fileConfig.enableOfflineMode ?? false),
149
149
  workerIdleTimeout: getInt("--worker-idle-timeout", "MT_WORKER_IDLE_TIMEOUT", fileConfig.workerIdleTimeout ?? 60),
150
150
  workersPerLanguage: getInt("--workers-per-language", "MT_WORKERS_PER_LANGUAGE", fileConfig.workersPerLanguage ?? 1),
151
- maxLengthBreak: getInt("--max-length-break", "MT_MAX_LENGTH_BREAK", fileConfig.maxLengthBreak ?? 128),
151
+ maxSentenceLength: getInt("--max-sentence-length", "MT_MAX_SENTENCE_LENGTH", fileConfig.maxSentenceLength ?? 512),
152
152
  apiToken: getString("--api-token", "MT_API_TOKEN", fileConfig.apiToken || ""),
153
153
  logToFile: getBool("--log-to-file", "MT_LOG_TO_FILE", fileConfig.logToFile ?? false),
154
154
  logConsole: getBool("--log-console", "MT_LOG_CONSOLE", fileConfig.logConsole ?? true),
@@ -683,6 +683,77 @@ var init_models = __esm(() => {
683
683
  init_records();
684
684
  });
685
685
 
686
+ // src/utils/lang-alias.ts
687
+ function NormalizeLanguageCode(code) {
688
+ if (!code)
689
+ return "";
690
+ const normalized = code.toLowerCase().replace(/_/g, "-");
691
+ if (languageAliases[normalized]) {
692
+ return languageAliases[normalized];
693
+ }
694
+ const mainCode = normalized.split("-")[0];
695
+ if (languageAliases[mainCode]) {
696
+ return languageAliases[mainCode];
697
+ }
698
+ return mainCode;
699
+ }
700
+ function isCJKCode(code) {
701
+ if (!code)
702
+ return false;
703
+ const lower = code.toLowerCase();
704
+ return lower.startsWith("zh") || lower.startsWith("ja") || lower.startsWith("ko");
705
+ }
706
+ var languageAliases;
707
+ var init_lang_alias = __esm(() => {
708
+ languageAliases = {
709
+ zh: "zh-Hans",
710
+ "zh-cn": "zh-Hans",
711
+ "zh-sg": "zh-Hans",
712
+ "zh-hans": "zh-Hans",
713
+ cmn: "zh-Hans",
714
+ chinese: "zh-Hans",
715
+ "zh-tw": "zh-Hant",
716
+ "zh-hk": "zh-Hant",
717
+ "zh-mo": "zh-Hant",
718
+ "zh-hant": "zh-Hant",
719
+ cht: "zh-Hant",
720
+ "en-us": "en",
721
+ "en-gb": "en",
722
+ "en-au": "en",
723
+ "en-ca": "en",
724
+ "en-nz": "en",
725
+ "en-ie": "en",
726
+ "en-za": "en",
727
+ "en-jm": "en",
728
+ "en-bz": "en",
729
+ "en-tt": "en",
730
+ "fr-fr": "fr",
731
+ "fr-ca": "fr",
732
+ "fr-be": "fr",
733
+ "fr-ch": "fr",
734
+ "es-es": "es",
735
+ "es-mx": "es",
736
+ "es-ar": "es",
737
+ "es-co": "es",
738
+ "es-cl": "es",
739
+ "es-pe": "es",
740
+ "es-ve": "es",
741
+ "pt-pt": "pt",
742
+ "pt-br": "pt",
743
+ "de-de": "de",
744
+ "de-at": "de",
745
+ "de-ch": "de",
746
+ "it-it": "it",
747
+ "it-ch": "it",
748
+ "ja-jp": "ja",
749
+ jp: "ja",
750
+ "ko-kr": "ko",
751
+ kr: "ko",
752
+ "ru-ru": "ru",
753
+ nb: "no"
754
+ };
755
+ });
756
+
686
757
  // src/core/engine.ts
687
758
  class TranslationEngine {
688
759
  options;
@@ -692,7 +763,7 @@ class TranslationEngine {
692
763
  isReady = false;
693
764
  translating = false;
694
765
  pendingQueue = [];
695
- maxLengthBreak = 128;
766
+ maxSentenceLength = 512;
696
767
  constructor(options = {}) {
697
768
  this.options = options;
698
769
  }
@@ -704,7 +775,7 @@ class TranslationEngine {
704
775
  "beam-size": 1,
705
776
  normalize: 1,
706
777
  "word-penalty": 0,
707
- "max-length-break": 128,
778
+ "max-length-break": 512,
708
779
  "mini-batch-words": 1024,
709
780
  workspace: 128,
710
781
  "max-length-factor": 2,
@@ -716,7 +787,6 @@ class TranslationEngine {
716
787
  alignment: "soft"
717
788
  };
718
789
  const mergedConfig = { ...defaultConfig, ...config };
719
- this.maxLengthBreak = mergedConfig["max-length-break"] || 128;
720
790
  const MODEL_FILE_ALIGNMENTS = {
721
791
  model: 256,
722
792
  lex: 64,
@@ -754,7 +824,7 @@ class TranslationEngine {
754
824
  const effectiveOptions = forceHtml ? { ...options, html: true } : options;
755
825
  let translation;
756
826
  try {
757
- if (cleanText.length > this.maxLengthBreak) {
827
+ if (cleanText.length > this.maxSentenceLength) {
758
828
  translation = this._translateLongText(cleanText, effectiveOptions);
759
829
  } else {
760
830
  translation = this._translateInternal(cleanText, effectiveOptions);
@@ -847,6 +917,29 @@ class TranslationEngine {
847
917
  const errorMsg = error2.message.toLowerCase();
848
918
  return fatalPatterns.some((pattern) => errorMsg.includes(pattern));
849
919
  }
920
+ _getMappedSeparator(sep, targetLang) {
921
+ if (!targetLang)
922
+ return sep;
923
+ const isTargetCJK = isCJKCode(targetLang);
924
+ const map = {
925
+ ". ": { cjk: "。", nonCjk: ". " },
926
+ "。": { cjk: "。", nonCjk: ". " },
927
+ "!": { cjk: "!", nonCjk: "! " },
928
+ "!": { cjk: "!", nonCjk: "! " },
929
+ "?": { cjk: "?", nonCjk: "? " },
930
+ "?": { cjk: "?", nonCjk: "? " },
931
+ "; ": { cjk: ";", nonCjk: "; " },
932
+ ";": { cjk: ";", nonCjk: "; " },
933
+ ":": { cjk: ":", nonCjk: ": " },
934
+ ": ": { cjk: ":", nonCjk: ": " },
935
+ ",": { cjk: ",", nonCjk: ", " },
936
+ ", ": { cjk: ",", nonCjk: ", " }
937
+ };
938
+ if (sep in map) {
939
+ return isTargetCJK ? map[sep].cjk : map[sep].nonCjk;
940
+ }
941
+ return sep;
942
+ }
850
943
  _translateLongText(text, options = {}) {
851
944
  const separators = [
852
945
  `
@@ -881,24 +974,50 @@ class TranslationEngine {
881
974
  bestSep = sep;
882
975
  bestParts = parts;
883
976
  }
884
- if (maxLen <= this.maxLengthBreak) {
977
+ if (maxLen <= this.maxSentenceLength) {
885
978
  break;
886
979
  }
887
980
  }
888
981
  }
889
982
  if (bestParts.length <= 1) {
890
- bestParts = this._chunkByLength(text, this.maxLengthBreak);
983
+ bestParts = this._chunkByWordBoundary(text, this.maxSentenceLength);
891
984
  bestSep = "";
892
985
  }
893
986
  const results = bestParts.map((part) => {
894
987
  if (!part.trim())
895
988
  return part;
896
- if (part.length > this.maxLengthBreak) {
989
+ if (part.length > this.maxSentenceLength) {
897
990
  return this._translateLongText(part, options);
898
991
  }
899
992
  return this._translateInternal(part, options);
900
993
  });
901
- return results.join(bestSep);
994
+ const targetSep = this._getMappedSeparator(bestSep, this.options.targetLang);
995
+ return results.join(targetSep);
996
+ }
997
+ _chunkByWordBoundary(text, limit) {
998
+ const parts = text.split(/(\s+)/);
999
+ const chunks = [];
1000
+ let currentChunk = "";
1001
+ for (const part of parts) {
1002
+ if (currentChunk.length + part.length <= limit) {
1003
+ currentChunk += part;
1004
+ } else {
1005
+ if (currentChunk.length > 0) {
1006
+ chunks.push(currentChunk);
1007
+ currentChunk = "";
1008
+ }
1009
+ if (part.length > limit) {
1010
+ const subChunks = this._chunkByLength(part, limit);
1011
+ chunks.push(...subChunks);
1012
+ } else {
1013
+ currentChunk = part;
1014
+ }
1015
+ }
1016
+ }
1017
+ if (currentChunk.length > 0) {
1018
+ chunks.push(currentChunk);
1019
+ }
1020
+ return chunks;
902
1021
  }
903
1022
  _chunkByLength(text, chunkSize) {
904
1023
  if (chunkSize <= 0)
@@ -970,6 +1089,9 @@ class TranslationEngine {
970
1089
  }
971
1090
  }
972
1091
  }
1092
+ var init_engine = __esm(() => {
1093
+ init_lang_alias();
1094
+ });
973
1095
 
974
1096
  // src/lib/bergamot/bergamot-translator.js
975
1097
  var require_bergamot_translator = __commonJS((exports, module) => {
@@ -5105,15 +5227,16 @@ function bcp47Normalize(code) {
5105
5227
  return code.toLowerCase();
5106
5228
  }
5107
5229
  }
5108
- function detectShortCjkLanguage(text) {
5109
- if (text.length > SHORT_TEXT_CJK_THRESHOLD) {
5110
- return null;
5111
- }
5230
+ function detectPureCjkLanguage(text, startIndex = 0) {
5231
+ const limit = Math.min(text.length, startIndex + 2000);
5112
5232
  let hasHan = false;
5113
5233
  let hasKana = false;
5114
5234
  let hasHangul = false;
5115
- for (const char of text) {
5116
- const code = char.charCodeAt(0);
5235
+ for (let i = startIndex;i < limit; i++) {
5236
+ const code = text.charCodeAt(i);
5237
+ if (code >= 65 && code <= 90 || code >= 97 && code <= 122) {
5238
+ return null;
5239
+ }
5117
5240
  if (code >= 12352 && code <= 12543) {
5118
5241
  hasKana = true;
5119
5242
  continue;
@@ -5126,16 +5249,6 @@ function detectShortCjkLanguage(text) {
5126
5249
  hasHan = true;
5127
5250
  continue;
5128
5251
  }
5129
- if (code >= 65 && code <= 90 || code >= 97 && code <= 122) {
5130
- return null;
5131
- }
5132
- if (code >= 48 && code <= 57 || code <= 127) {
5133
- continue;
5134
- }
5135
- if (code >= 12288 && code <= 12351) {
5136
- continue;
5137
- }
5138
- return null;
5139
5252
  }
5140
5253
  if (hasKana)
5141
5254
  return "ja";
@@ -5145,21 +5258,29 @@ function detectShortCjkLanguage(text) {
5145
5258
  return "zh-Hans";
5146
5259
  return null;
5147
5260
  }
5261
+ function getValidContentStartIndex(text) {
5262
+ const match = text.match(/^[^\p{L}\p{N}]+/u);
5263
+ return match ? match[0].length : 0;
5264
+ }
5148
5265
  async function detectLanguage(text, maxBytes = MAX_DETECTION_BYTES) {
5149
5266
  if (!text) {
5150
5267
  return "";
5151
5268
  }
5152
- const shortCjk = detectShortCjkLanguage(text);
5153
- if (shortCjk) {
5154
- return shortCjk;
5269
+ const startIndex = getValidContentStartIndex(text);
5270
+ if (startIndex >= text.length)
5271
+ return "en";
5272
+ const pureCjk = detectPureCjkLanguage(text, startIndex);
5273
+ if (pureCjk) {
5274
+ return pureCjk;
5155
5275
  }
5156
5276
  await initCLD();
5277
+ const cleanText = text.slice(startIndex);
5157
5278
  try {
5158
- const result = detectLanguageWithCLD(text, false, maxBytes);
5279
+ const result = detectLanguageWithCLD(cleanText, false, maxBytes);
5159
5280
  return bcp47Normalize(result.language);
5160
5281
  } catch (error2) {
5161
5282
  warn(`Language detection failed: ${error2}`);
5162
- handleCldError(error2, { text, operation: "detectLanguage" });
5283
+ handleCldError(error2, { text: cleanText, operation: "detectLanguage" });
5163
5284
  return "en";
5164
5285
  }
5165
5286
  }
@@ -5167,9 +5288,17 @@ async function detectLanguageWithConfidence(text, minConfidence = DEFAULT_CONFID
5167
5288
  if (!text) {
5168
5289
  return { language: "", confidence: 0 };
5169
5290
  }
5291
+ const startIndex = getValidContentStartIndex(text);
5292
+ if (startIndex >= text.length)
5293
+ return { language: "en", confidence: 0 };
5294
+ const pureCjk = detectPureCjkLanguage(text, startIndex);
5295
+ if (pureCjk) {
5296
+ return { language: pureCjk, confidence: 1 };
5297
+ }
5170
5298
  await initCLD();
5299
+ const cleanText = text.slice(startIndex);
5171
5300
  try {
5172
- const result = detectLanguageWithCLD(text, false, maxBytes);
5301
+ const result = detectLanguageWithCLD(cleanText, false, maxBytes);
5173
5302
  const confidence = result.percentScore / 100;
5174
5303
  if (confidence < minConfidence) {
5175
5304
  return { language: "", confidence };
@@ -5180,7 +5309,7 @@ async function detectLanguageWithConfidence(text, minConfidence = DEFAULT_CONFID
5180
5309
  };
5181
5310
  } catch (error2) {
5182
5311
  warn(`Language detection with confidence failed: ${error2}`);
5183
- handleCldError(error2, { text, operation: "detectLanguageWithConfidence" });
5312
+ handleCldError(error2, { text: cleanText, operation: "detectLanguageWithConfidence" });
5184
5313
  return { language: "en", confidence: 0 };
5185
5314
  }
5186
5315
  }
@@ -5219,9 +5348,6 @@ function getScriptType(text) {
5219
5348
  return "Latin";
5220
5349
  return "Other";
5221
5350
  }
5222
- function isCJKLanguage(lang) {
5223
- return ["zh", "zh-Hans", "zh-Hant", "ja", "ko"].includes(lang) || lang.startsWith("zh-");
5224
- }
5225
5351
  async function detectMultipleLanguages(text) {
5226
5352
  return detectMultipleLanguagesWithThreshold(text, DEFAULT_CONFIDENCE_THRESHOLD);
5227
5353
  }
@@ -5259,7 +5385,7 @@ async function detectMultipleLanguagesWithThreshold(text, threshold) {
5259
5385
  finalLang = detectedLang;
5260
5386
  usedLogic = "confidence";
5261
5387
  } else {
5262
- if (scriptType === "Latin" && isCJKLanguage(effectiveFallback)) {
5388
+ if (scriptType === "Latin" && isCJKCode(effectiveFallback)) {
5263
5389
  if (detectedLang && detectedLang !== "un") {
5264
5390
  finalLang = detectedLang;
5265
5391
  usedLogic = "script-override-latin";
@@ -5267,7 +5393,7 @@ async function detectMultipleLanguagesWithThreshold(text, threshold) {
5267
5393
  finalLang = "en";
5268
5394
  usedLogic = "script-override-en";
5269
5395
  }
5270
- } else if (scriptType === "CJK" && !isCJKLanguage(effectiveFallback)) {
5396
+ } else if (scriptType === "CJK" && !isCJKCode(effectiveFallback)) {
5271
5397
  if (detectedLang && detectedLang !== "un") {
5272
5398
  finalLang = detectedLang;
5273
5399
  usedLogic = "script-override-cjk";
@@ -5344,8 +5470,9 @@ function limitLanguages(segments, originalText, maxLangs) {
5344
5470
  debug(`limitLanguages: reduced to ${maxLangs} languages, ${result.length} segments`);
5345
5471
  return result;
5346
5472
  }
5347
- var import_cld2, DEFAULT_CONFIDENCE_THRESHOLD = 0.5, MAXIMUM_LANGUAGES_IN_ONE_TEXT = 2, MAX_DETECTION_BYTES = 512, MAX_FALLBACK_DETECTION_BYTES = 1024, SHORT_TEXT_CJK_THRESHOLD = 3, cldModule = null, initPromise = null;
5473
+ var import_cld2, DEFAULT_CONFIDENCE_THRESHOLD = 0.5, MAXIMUM_LANGUAGES_IN_ONE_TEXT = 2, MAX_DETECTION_BYTES = 511, MAX_FALLBACK_DETECTION_BYTES = 1023, cldModule = null, initPromise = null;
5348
5474
  var init_detector = __esm(() => {
5475
+ init_lang_alias();
5349
5476
  init_cld2();
5350
5477
  init_logger();
5351
5478
  import_cld2 = __toESM(require_cld2(), 1);
@@ -5526,7 +5653,7 @@ async function translateWithPivot(fromLang, toLang, text, isHTML = false) {
5526
5653
  if (fromLang !== "auto" && fromLang === toLang) {
5527
5654
  return text;
5528
5655
  }
5529
- if (fromLang !== "auto" && text.length <= 128) {
5656
+ if (fromLang !== "auto" && text.length <= 512) {
5530
5657
  return translateSegment(fromLang, toLang, text, isHTML);
5531
5658
  }
5532
5659
  const config2 = getConfig();
@@ -5547,7 +5674,7 @@ async function translateWithPivot(fromLang, toLang, text, isHTML = false) {
5547
5674
  if (effectiveFromLang === toLang) {
5548
5675
  return text;
5549
5676
  }
5550
- if (text.length > config2.maxLengthBreak && !isHTML) {
5677
+ if (text.length > config2.maxSentenceLength && !isHTML) {
5551
5678
  return translateLongText(effectiveFromLang, toLang, text);
5552
5679
  }
5553
5680
  return translateSegment(effectiveFromLang, toLang, text, isHTML);
@@ -5611,7 +5738,8 @@ function cleanupAllEngines() {
5611
5738
  info("All engines cleaned up successfully");
5612
5739
  }
5613
5740
  var import_bergamot_translator, engines, loadingPromises;
5614
- var init_engine = __esm(() => {
5741
+ var init_engine2 = __esm(() => {
5742
+ init_engine();
5615
5743
  init_factory();
5616
5744
  init_config();
5617
5745
  init_bergamot_translator();
@@ -5635,75 +5763,10 @@ __export(exports_services, {
5635
5763
  cleanupAllEngines: () => cleanupAllEngines
5636
5764
  });
5637
5765
  var init_services = __esm(() => {
5638
- init_engine();
5766
+ init_engine2();
5639
5767
  init_detector();
5640
5768
  });
5641
5769
 
5642
- // src/utils/lang-alias.ts
5643
- function NormalizeLanguageCode(code) {
5644
- if (!code)
5645
- return "";
5646
- const normalized = code.toLowerCase().replace(/_/g, "-");
5647
- if (languageAliases[normalized]) {
5648
- return languageAliases[normalized];
5649
- }
5650
- const mainCode = normalized.split("-")[0];
5651
- if (languageAliases[mainCode]) {
5652
- return languageAliases[mainCode];
5653
- }
5654
- return mainCode;
5655
- }
5656
- var languageAliases;
5657
- var init_lang_alias = __esm(() => {
5658
- languageAliases = {
5659
- zh: "zh-Hans",
5660
- "zh-cn": "zh-Hans",
5661
- "zh-sg": "zh-Hans",
5662
- "zh-hans": "zh-Hans",
5663
- cmn: "zh-Hans",
5664
- chinese: "zh-Hans",
5665
- "zh-tw": "zh-Hant",
5666
- "zh-hk": "zh-Hant",
5667
- "zh-mo": "zh-Hant",
5668
- "zh-hant": "zh-Hant",
5669
- cht: "zh-Hant",
5670
- "en-us": "en",
5671
- "en-gb": "en",
5672
- "en-au": "en",
5673
- "en-ca": "en",
5674
- "en-nz": "en",
5675
- "en-ie": "en",
5676
- "en-za": "en",
5677
- "en-jm": "en",
5678
- "en-bz": "en",
5679
- "en-tt": "en",
5680
- "fr-fr": "fr",
5681
- "fr-ca": "fr",
5682
- "fr-be": "fr",
5683
- "fr-ch": "fr",
5684
- "es-es": "es",
5685
- "es-mx": "es",
5686
- "es-ar": "es",
5687
- "es-co": "es",
5688
- "es-cl": "es",
5689
- "es-pe": "es",
5690
- "es-ve": "es",
5691
- "pt-pt": "pt",
5692
- "pt-br": "pt",
5693
- "de-de": "de",
5694
- "de-at": "de",
5695
- "de-ch": "de",
5696
- "it-it": "it",
5697
- "it-ch": "it",
5698
- "ja-jp": "ja",
5699
- jp: "ja",
5700
- "ko-kr": "ko",
5701
- kr: "ko",
5702
- "ru-ru": "ru",
5703
- nb: "no"
5704
- };
5705
- });
5706
-
5707
5770
  // src/utils/port.ts
5708
5771
  import net from "net";
5709
5772
  async function getFreePort() {
@@ -5739,6 +5802,7 @@ var init_memory = () => {};
5739
5802
  // src/utils/index.ts
5740
5803
  var exports_utils = {};
5741
5804
  __export(exports_utils, {
5805
+ isCJKCode: () => isCJKCode,
5742
5806
  getLargestVersion: () => getLargestVersion,
5743
5807
  getFreePort: () => getFreePort,
5744
5808
  getAvailableMemoryMB: () => getAvailableMemoryMB,
@@ -5795,6 +5859,7 @@ export {
5795
5859
  setConfig,
5796
5860
  saveConfigFile,
5797
5861
  resetConfig,
5862
+ isCJKCode,
5798
5863
  initRecords,
5799
5864
  hasLanguagePair,
5800
5865
  globalRecords,