BatchalignHK 0.7.19.post9__tar.gz → 0.7.19.post11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (125)
  1. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/BatchalignHK.egg-info/PKG-INFO +1 -1
  2. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/PKG-INFO +1 -1
  3. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/tencent.py +86 -59
  4. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/retrace.py +1 -1
  5. batchalignhk-0.7.19.post11/batchalign/version +3 -0
  6. batchalignhk-0.7.19.post9/batchalign/version +0 -3
  7. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  8. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  9. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/BatchalignHK.egg-info/entry_points.txt +0 -0
  10. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/BatchalignHK.egg-info/requires.txt +0 -0
  11. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/BatchalignHK.egg-info/top_level.txt +0 -0
  12. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/LICENSE +0 -0
  13. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/MANIFEST.in +0 -0
  14. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/README.md +0 -0
  15. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/__init__.py +0 -0
  16. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/__main__.py +0 -0
  17. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/cli/__init__.py +0 -0
  18. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/cli/cli.py +0 -0
  19. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/cli/dispatch.py +0 -0
  20. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/constants.py +0 -0
  21. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/document.py +0 -0
  22. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/errors.py +0 -0
  23. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/__init__.py +0 -0
  24. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/base.py +0 -0
  25. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/chat/__init__.py +0 -0
  26. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/chat/file.py +0 -0
  27. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/chat/generator.py +0 -0
  28. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/chat/lexer.py +0 -0
  29. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/chat/parser.py +0 -0
  30. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/chat/utils.py +0 -0
  31. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/textgrid/__init__.py +0 -0
  32. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/textgrid/file.py +0 -0
  33. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/textgrid/generator.py +0 -0
  34. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/formats/textgrid/parser.py +0 -0
  35. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/__init__.py +0 -0
  36. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/resolve.py +0 -0
  37. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/speaker/__init__.py +0 -0
  38. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/speaker/config.yaml +0 -0
  39. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/speaker/infer.py +0 -0
  40. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/speaker/utils.py +0 -0
  41. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/training/__init__.py +0 -0
  42. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/training/run.py +0 -0
  43. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/training/utils.py +0 -0
  44. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utils.py +0 -0
  45. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/__init__.py +0 -0
  46. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/cantonese_infer.py +0 -0
  47. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/dataset.py +0 -0
  48. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/execute.py +0 -0
  49. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/infer.py +0 -0
  50. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/prep.py +0 -0
  51. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/utterance/train.py +0 -0
  52. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/wave2vec/__init__.py +0 -0
  53. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/wave2vec/infer_fa.py +0 -0
  54. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/whisper/__init__.py +0 -0
  55. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/whisper/infer_asr.py +0 -0
  56. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/models/whisper/infer_fa.py +0 -0
  57. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/__init__.py +0 -0
  58. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/analysis/__init__.py +0 -0
  59. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/analysis/eval.py +0 -0
  60. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/__init__.py +0 -0
  61. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/num2chinese.py +0 -0
  62. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  63. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/rev.py +0 -0
  64. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/utils.py +0 -0
  65. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/whisper.py +0 -0
  66. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/asr/whisperx.py +0 -0
  67. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/base.py +0 -0
  68. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/__init__.py +0 -0
  69. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  70. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  71. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  72. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  73. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  74. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/cleanup/support/test.test +0 -0
  75. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/dispatch.py +0 -0
  76. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/fa/__init__.py +0 -0
  77. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  78. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  79. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  80. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  81. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  82. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  83. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  84. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  85. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  86. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  87. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/pipeline.py +0 -0
  88. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/speaker/__init__.py +0 -0
  89. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  90. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/translate/__init__.py +0 -0
  91. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/translate/gtrans.py +0 -0
  92. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/translate/seamless.py +0 -0
  93. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/translate/utils.py +0 -0
  94. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/utr/__init__.py +0 -0
  95. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/utr/rev_utr.py +0 -0
  96. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/utr/utils.py +0 -0
  97. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  98. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/utterance/__init__.py +0 -0
  99. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  100. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/__init__.py +0 -0
  101. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/conftest.py +0 -0
  102. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  103. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  104. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  105. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  106. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  107. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  108. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  109. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  110. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  111. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  112. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  113. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  114. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/fixures.py +0 -0
  115. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  116. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  117. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/tests/test_document.py +0 -0
  118. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/utils/__init__.py +0 -0
  119. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/utils/abbrev.py +0 -0
  120. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/utils/config.py +0 -0
  121. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/utils/dp.py +0 -0
  122. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/utils/names.py +0 -0
  123. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/batchalign/utils/utils.py +0 -0
  124. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/setup.cfg +0 -0
  125. {batchalignhk-0.7.19.post9 → batchalignhk-0.7.19.post11}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.19.post9
3
+ Version: 0.7.19.post11
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: BatchalignHK
3
- Version: 0.7.19.post9
3
+ Version: 0.7.19.post11
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -111,6 +111,11 @@ class TencentEngine(BatchalignEngine):
111
111
  "重復": "重複"
112
112
  }
113
113
  return word_replacements.get(word, word)
114
+
115
+ @staticmethod
116
+ def is_roman(x):
117
+ """check if x contains only roman characters"""
118
+ return all(c.isalpha() and ord(c) < 128 for c in x if not c.isspace())
114
119
 
115
120
  def generate(self, f, **kwargs):
116
121
  lang = self.__lang
@@ -119,73 +124,95 @@ class TencentEngine(BatchalignEngine):
119
124
  # processed_path = self.__preprocess_audio(f)
120
125
  # audio = AudioSegment.from_file(processed_path)
121
126
 
122
- try:
123
- L.info(f"Uploading '{pathlib.Path(f).stem}'...")
124
- # we will send the file for processing
125
- if not str(f).startswith("http"):
126
- with open(f, "rb") as image_file:
127
- encoded_string = base64.b64encode(image_file.read())
128
-
129
- req = models.CreateRecTaskRequest()
130
- if lang in {'zho', 'yue', 'wuu', 'nan','hak'}:
131
- req.EngineModelType = "16k_zh_large"
132
- else:
133
- req.EngineModelType = f"16k_{lang}"
134
- req.ResTextFormat = 1
135
- req.SpeakerDiarization = 1
136
- req.ChannelNum = 1
137
- if not str(f).startswith("http"):
138
- req.Data = encoded_string.decode('ascii')
139
- req.SourceType = 1
140
- else:
141
- req.Url = f
142
- req.SourceType = 0
143
- resp = client.CreateRecTask(req)
127
+ L.info(f"Uploading '{pathlib.Path(f).stem}'...")
128
+ # we will send the file for processing
129
+ if not str(f).startswith("http"):
130
+ with open(f, "rb") as image_file:
131
+ encoded_string = base64.b64encode(image_file.read())
132
+
133
+ req = models.CreateRecTaskRequest()
134
+ if lang in {'zho', 'yue', 'wuu', 'nan','hak'}:
135
+ req.EngineModelType = "16k_zh_large"
136
+ else:
137
+ req.EngineModelType = f"16k_{lang}"
138
+ req.ResTextFormat = 1
139
+ req.SpeakerDiarization = 1
140
+ req.ChannelNum = 1
141
+ if not str(f).startswith("http"):
142
+ req.Data = encoded_string.decode('ascii')
143
+ req.SourceType = 1
144
+ else:
145
+ req.Url = f
146
+ req.SourceType = 0
147
+ resp = client.CreateRecTask(req)
144
148
 
145
- L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
146
- req = models.DescribeTaskStatusRequest()
147
- req.TaskId = resp.Data.TaskId
149
+ L.info(f"Tencent is transcribing '{pathlib.Path(f).stem}'...")
150
+ req = models.DescribeTaskStatusRequest()
151
+ req.TaskId = resp.Data.TaskId
148
152
 
153
+ res = client.DescribeTaskStatus(req)
154
+ while res.Data.Status not in [2, 3]:
155
+ time.sleep(15)
149
156
  res = client.DescribeTaskStatus(req)
150
- while res.Data.Status not in [2, 3]:
151
- time.sleep(15)
152
- res = client.DescribeTaskStatus(req)
153
-
154
- if res.Data.Status in ["3", 3]:
155
- raise RuntimeError(f"Tencent reports job failed! error='{res.Data.ErrorMsg}'")
156
-
157
- turns = []
158
- for i in res.Data.ResultDetail:
159
- turn = []
160
- start = i.StartMs
161
- for j in i.Words:
162
- word = j.Word
163
- if self.__lang == "yue":
164
- word = cc.convert(word)
165
-
166
- word = self.replace_cantonese_words(word)
167
157
 
158
+ if res.Data.Status in ["3", 3]:
159
+ raise RuntimeError(f"Tencent reports job failed! error='{res.Data.ErrorMsg}'")
160
+
161
+ turns = []
162
+ for i in res.Data.ResultDetail:
163
+ turn = []
164
+ start = i.StartMs
165
+ roman_cache = ""
166
+ roman_cache_start = i.StartMs
167
+ roman_cache_end = i.StartMs
168
+ for j in i.Words:
169
+ word = j.Word
170
+ if self.__lang == "yue":
171
+ word = cc.convert(word)
172
+
173
+ word = self.replace_cantonese_words(word)
174
+
175
+ if self.is_roman(word):
176
+ if roman_cache == "":
177
+ roman_cache_start = (j.OffsetStartMs + start)
178
+ roman_cache = roman_cache + word
179
+ roman_cache_end = (j.OffsetEndMs + start)
180
+ else:
181
+ if roman_cache != "":
182
+ turn.append({
183
+ "type": "text",
184
+ "ts": roman_cache_start / 1000,
185
+ "end_ts": roman_cache_end / 1000,
186
+ "value": roman_cache
187
+ })
188
+ roman_cache = ""
168
189
  turn.append({
169
190
  "type": "text",
170
191
  "ts": (j.OffsetStartMs + start) / 1000,
171
192
  "end_ts": (j.OffsetEndMs + start) / 1000,
172
193
  "value": word
173
194
  })
174
- turns.append({
175
- "elements": turn,
176
- "speaker": i.SpeakerId
195
+
196
+ if roman_cache != "":
197
+ turn.append({
198
+ "type": "text",
199
+ "ts": roman_cache_start / 1000,
200
+ "end_ts": roman_cache_end / 1000,
201
+ "value": roman_cache
177
202
  })
178
- L.debug(f"Tencent done.")
179
-
180
- # Extract the text from the small volume parts for translation
181
-
182
- doc = process_generation({"monologues": turns},
183
- self.__lang_code,
184
- utterance_engine=self.__engine)
185
- media = Media(type=MediaType.AUDIO, name=Path(f).stem, url=f)
186
- doc.media = media
187
- return doc
188
-
189
- finally:
190
- if processed_path != f and pathlib.Path(processed_path).exists():
191
- pathlib.Path(processed_path).unlink()
203
+
204
+ turns.append({
205
+ "elements": turn,
206
+ "speaker": i.SpeakerId
207
+ })
208
+ L.debug(f"Tencent done.")
209
+
210
+ # Extract the text from the small volume parts for translation
211
+
212
+ doc = process_generation({"monologues": turns},
213
+ self.__lang_code,
214
+ utterance_engine=self.__engine)
215
+ media = Media(type=MediaType.AUDIO, name=Path(f).stem, url=f)
216
+ doc.media = media
217
+ return doc
218
+
@@ -22,7 +22,7 @@ class NgramRetraceEngine(BatchalignEngine):
22
22
  if i.type in [TokenType.REGULAR, TokenType.PUNCT, TokenType.FP]:
23
23
  content.append(i)
24
24
  # scan for n-gram retraces
25
- for n in range(1, len(content)):
25
+ for n in range(1 if "yue" not in doc.langs and "zho" not in doc.langs else 2, len(content)):
26
26
  begin = 0
27
27
  while begin < len(content)-(n):
28
28
  # get the n gram info; we convert it to
@@ -0,0 +1,3 @@
1
+ 0.7.19-post.11
2
+ May 27th, 2025
3
+ tencent: tokenize roman words together
@@ -1,3 +0,0 @@
1
- 0.7.19-post.9
2
- May 24th, 2025
3
- reverts file only prep changes