BatchalignHK 0.7.17.post16__tar.gz → 0.7.17.post18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/PKG-INFO +4 -2
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/requires.txt +1 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/PKG-INFO +4 -2
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/cli/dispatch.py +10 -6
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/cantonese_infer.py +15 -12
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/tencent.py +4 -1
- batchalignhk-0.7.17.post18/batchalign/version +3 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/setup.py +1 -0
- batchalignhk-0.7.17.post16/batchalign/version +0 -3
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/SOURCES.txt +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/dependency_links.txt +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/entry_points.txt +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/top_level.txt +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/LICENSE +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/MANIFEST.in +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/README.md +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/__main__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/cli/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/cli/cli.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/constants.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/document.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/errors.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/base.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/file.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/generator.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/lexer.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/parser.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/file.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/resolve.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/config.yaml +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/infer.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/run.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/dataset.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/execute.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/infer.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/prep.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/train.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/base.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/dispatch.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/ud.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/pipeline.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/translate/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/translate/seamless.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/conftest.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/test_document.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/__init__.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/config.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/dp.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/utils.py +0 -0
- {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: BatchalignHK
|
|
3
|
-
Version: 0.7.17.post16
|
|
3
|
+
Version: 0.7.17.post18
|
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
|
@@ -15,6 +15,7 @@ Requires-Dist: torch>=2.6.0
|
|
|
15
15
|
Requires-Dist: torchaudio
|
|
16
16
|
Requires-Dist: hmmlearn==0.3.0
|
|
17
17
|
Requires-Dist: eyed3
|
|
18
|
+
Requires-Dist: opencc-python-reimplemented
|
|
18
19
|
Requires-Dist: pydub
|
|
19
20
|
Requires-Dist: imblearn
|
|
20
21
|
Requires-Dist: plotly>=5.3.0
|
|
@@ -47,6 +48,7 @@ Dynamic: author-email
|
|
|
47
48
|
Dynamic: classifier
|
|
48
49
|
Dynamic: description
|
|
49
50
|
Dynamic: description-content-type
|
|
51
|
+
Dynamic: license-file
|
|
50
52
|
Dynamic: provides-extra
|
|
51
53
|
Dynamic: requires-dist
|
|
52
54
|
Dynamic: summary
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: BatchalignHK
|
|
3
|
-
Version: 0.7.17.post16
|
|
3
|
+
Version: 0.7.17.post18
|
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
|
@@ -15,6 +15,7 @@ Requires-Dist: torch>=2.6.0
|
|
|
15
15
|
Requires-Dist: torchaudio
|
|
16
16
|
Requires-Dist: hmmlearn==0.3.0
|
|
17
17
|
Requires-Dist: eyed3
|
|
18
|
+
Requires-Dist: opencc-python-reimplemented
|
|
18
19
|
Requires-Dist: pydub
|
|
19
20
|
Requires-Dist: imblearn
|
|
20
21
|
Requires-Dist: plotly>=5.3.0
|
|
@@ -47,6 +48,7 @@ Dynamic: author-email
|
|
|
47
48
|
Dynamic: classifier
|
|
48
49
|
Dynamic: description
|
|
49
50
|
Dynamic: description-content-type
|
|
51
|
+
Dynamic: license-file
|
|
50
52
|
Dynamic: provides-extra
|
|
51
53
|
Dynamic: requires-dist
|
|
52
54
|
Dynamic: summary
|
|
@@ -66,12 +66,16 @@ def _dispatch(command, lang, num_speakers,
|
|
|
66
66
|
|
|
67
67
|
if kwargs.get("data"):
|
|
68
68
|
url = kwargs.get("data")
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
69
|
+
with open(url.strip()) as data:
|
|
70
|
+
data = data.readlines()
|
|
71
|
+
data = [i.strip() for i in data if i.strip() != ""]
|
|
72
|
+
for url in data:
|
|
73
|
+
url = urlparse(url)
|
|
74
|
+
if url.scheme == "":
|
|
75
|
+
url = url._replace(scheme="http")
|
|
76
|
+
base = os.path.basename(url.path)
|
|
77
|
+
files.append(url)
|
|
78
|
+
outputs.append(os.path.join(out_dir, base))
|
|
75
79
|
|
|
76
80
|
for basedir, _, fs in os.walk(in_dir):
|
|
77
81
|
for f in fs:
|
|
@@ -19,6 +19,9 @@ from transformers import DataCollatorForTokenClassification
|
|
|
19
19
|
# tqdm
|
|
20
20
|
from tqdm import tqdm
|
|
21
21
|
|
|
22
|
+
import logging
|
|
23
|
+
L = logging.getLogger("batchalign")
|
|
24
|
+
|
|
22
25
|
# seed device and tokens
|
|
23
26
|
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
|
24
27
|
|
|
@@ -34,8 +37,8 @@ class BertCantoneseUtteranceModel(object):
|
|
|
34
37
|
|
|
35
38
|
# eval mode
|
|
36
39
|
self.model.eval()
|
|
37
|
-
|
|
38
|
-
|
|
40
|
+
L.debug(f"Model and tokenizer initialized on device: {DEVICE}")
|
|
41
|
+
L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
|
|
39
42
|
|
|
40
43
|
def __call__(self, passage):
|
|
41
44
|
# Step 1: Clean up passage
|
|
@@ -79,14 +82,14 @@ class BertCantoneseUtteranceModel(object):
|
|
|
79
82
|
break
|
|
80
83
|
|
|
81
84
|
# Debugging: Print number of chunks and their content
|
|
82
|
-
|
|
85
|
+
L.debug(f"Created {len(chunks)} chunks based on keywords.")
|
|
83
86
|
for i, chunk in enumerate(chunks):
|
|
84
|
-
|
|
87
|
+
L.debug(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
|
|
85
88
|
|
|
86
89
|
# Step 3: Process each chunk and restore punctuation
|
|
87
90
|
final_passage = []
|
|
88
91
|
for chunk_index, chunk in enumerate(chunks):
|
|
89
|
-
|
|
92
|
+
L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
|
|
90
93
|
|
|
91
94
|
# Step 3.1: Split chunk by characters (Chinese tokenization)
|
|
92
95
|
tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
|
|
@@ -103,7 +106,7 @@ class BertCantoneseUtteranceModel(object):
|
|
|
103
106
|
# Pass it through the model
|
|
104
107
|
res = self.model(**tokd).logits
|
|
105
108
|
except Exception as e:
|
|
106
|
-
|
|
109
|
+
L.error(f"Error during model inference: {e}")
|
|
107
110
|
return []
|
|
108
111
|
|
|
109
112
|
# Argmax for classification
|
|
@@ -152,7 +155,7 @@ class BertCantoneseUtteranceModel(object):
|
|
|
152
155
|
# Step 4: Join processed chunks together into the final passage
|
|
153
156
|
final_passage = ' '.join(final_passage)
|
|
154
157
|
|
|
155
|
-
|
|
158
|
+
L.info("Text processing completed. Generating final output...")
|
|
156
159
|
|
|
157
160
|
# Optionally, tokenize the final text into sentences based on punctuation
|
|
158
161
|
def custom_sent_tokenize(text):
|
|
@@ -164,12 +167,12 @@ class BertCantoneseUtteranceModel(object):
|
|
|
164
167
|
parts = re.split(sentence_endings, text)
|
|
165
168
|
|
|
166
169
|
# Debug: Output the parts after splitting
|
|
167
|
-
|
|
170
|
+
L.debug(f"Parts after splitting: {parts}")
|
|
168
171
|
|
|
169
172
|
# Combine parts and punctuation together
|
|
170
173
|
for i in range(0, len(parts) - 1, 2):
|
|
171
174
|
sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
|
|
172
|
-
|
|
175
|
+
L.debug(f"Sentence formed: {sentence}") # Debug: Output the current sentence
|
|
173
176
|
|
|
174
177
|
if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
|
|
175
178
|
split_passage.append(sentence)
|
|
@@ -177,18 +180,18 @@ class BertCantoneseUtteranceModel(object):
|
|
|
177
180
|
# If the last part doesn't have punctuation, we handle it here
|
|
178
181
|
if len(parts) % 2 != 0: # If there's no punctuation at the end
|
|
179
182
|
last_part = parts[-1].strip()
|
|
180
|
-
|
|
183
|
+
L.debug(f"Last part without punctuation: {last_part}") # Debug: Output the last part
|
|
181
184
|
|
|
182
185
|
if last_part: # Only add non-empty sentences
|
|
183
186
|
split_passage.append(last_part)
|
|
184
187
|
|
|
185
188
|
# Final output
|
|
186
|
-
|
|
189
|
+
L.debug(f"Final split passage: {split_passage}")
|
|
187
190
|
return split_passage
|
|
188
191
|
|
|
189
192
|
split_passage = custom_sent_tokenize(final_passage)
|
|
190
193
|
|
|
191
194
|
# Debugging: Output the sentences after splitting
|
|
192
|
-
|
|
195
|
+
L.debug(f"Final sentences: {split_passage}")
|
|
193
196
|
|
|
194
197
|
return split_passage
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/tencent.py
RENAMED
|
@@ -12,6 +12,9 @@ from batchalign.errors import *
|
|
|
12
12
|
|
|
13
13
|
from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve
|
|
14
14
|
|
|
15
|
+
from opencc import OpenCC
|
|
16
|
+
cc = OpenCC('s2hk')
|
|
17
|
+
|
|
15
18
|
import time
|
|
16
19
|
import pathlib
|
|
17
20
|
import pycountry
|
|
@@ -113,7 +116,7 @@ class TencentEngine(BatchalignEngine):
|
|
|
113
116
|
"type": "text",
|
|
114
117
|
"ts": (j.OffsetStartMs+start)/1000,
|
|
115
118
|
"end_ts": (j.OffsetEndMs+start)/1000,
|
|
116
|
-
"value": j.Word
|
|
119
|
+
"value": cc.convert(j.Word)
|
|
117
120
|
})
|
|
118
121
|
turns.append({
|
|
119
122
|
"elements": turn,
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/generator.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/file.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/generator.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/parser.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/config.yaml
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/infer.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/utils.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/dataset.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/execute.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/infer.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/prep.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/train.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/wave2vec/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/wave2vec/infer_fa.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/infer_asr.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/infer_fa.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/analysis/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/analysis/eval.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/num2chinese.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/whisper.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/whisperx.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/cleanup.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/retrace.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/wave2vec_fa.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/whisper_fa.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/coref.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/ud.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/speaker/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/translate/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/translate/seamless.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/rev_utr.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/whisper_utr.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utterance/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/fixures.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|