batchalign 0.8.0.post4.tar.gz → 0.8.1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of batchalign might be problematic.
- {batchalign-0.8.0.post4/batchalign.egg-info → batchalign-0.8.1}/PKG-INFO +3 -1
- batchalign-0.8.1/batchalign/__init__.py +48 -0
- batchalign-0.8.1/batchalign/cli/cache.py +263 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/cli.py +5 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/dispatch.py +6 -3
- batchalign-0.8.1/batchalign/formats/__init__.py +11 -0
- batchalign-0.8.1/batchalign/models/__init__.py +33 -0
- batchalign-0.8.1/batchalign/models/speaker/__init__.py +7 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utils.py +31 -0
- batchalign-0.8.1/batchalign/models/utterance/__init__.py +13 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/cantonese_infer.py +28 -40
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/infer.py +13 -23
- batchalign-0.8.1/batchalign/models/wave2vec/__init__.py +7 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/wave2vec/infer_fa.py +16 -31
- batchalign-0.8.1/batchalign/models/whisper/__init__.py +11 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/whisper/infer_asr.py +16 -30
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/whisper/infer_fa.py +21 -17
- batchalign-0.8.1/batchalign/pipelines/__init__.py +37 -0
- batchalign-0.8.1/batchalign/pipelines/analysis/__init__.py +15 -0
- batchalign-0.8.1/batchalign/pipelines/asr/__init__.py +24 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/rev.py +6 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/whisperx.py +9 -17
- batchalign-0.8.1/batchalign/pipelines/avqi/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/avqi/engine.py +6 -5
- batchalign-0.8.1/batchalign/pipelines/cache.py +735 -0
- batchalign-0.8.1/batchalign/pipelines/cleanup/__init__.py +18 -0
- batchalign-0.8.1/batchalign/pipelines/diarization/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/diarization/pyannote.py +5 -17
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/dispatch.py +19 -6
- batchalign-0.8.1/batchalign/pipelines/fa/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/fa/wave2vec_fa.py +49 -10
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/fa/whisper_fa.py +52 -10
- batchalign-0.8.1/batchalign/pipelines/morphosyntax/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/coref.py +1 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/ud.py +147 -21
- batchalign-0.8.1/batchalign/pipelines/opensmile/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/opensmile/engine.py +22 -12
- batchalign-0.8.1/batchalign/pipelines/speaker/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/speaker/nemo_speaker.py +4 -2
- batchalign-0.8.1/batchalign/pipelines/translate/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/gtrans.py +2 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/seamless.py +2 -1
- batchalign-0.8.1/batchalign/pipelines/utr/__init__.py +18 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/rev_utr.py +8 -2
- batchalign-0.8.1/batchalign/pipelines/utterance/__init__.py +15 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utterance/ud_utterance.py +95 -41
- batchalign-0.8.1/batchalign/tests/pipelines/cache/__init__.py +1 -0
- batchalign-0.8.1/batchalign/tests/pipelines/cache/test_cache.py +407 -0
- batchalign-0.8.1/batchalign/version +3 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1/batchalign.egg-info}/PKG-INFO +3 -1
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/SOURCES.txt +4 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/requires.txt +2 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/setup.py +2 -0
- batchalign-0.8.0.post4/batchalign/__init__.py +0 -19
- batchalign-0.8.0.post4/batchalign/formats/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/models/__init__.py +0 -6
- batchalign-0.8.0.post4/batchalign/models/speaker/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/models/utterance/__init__.py +0 -4
- batchalign-0.8.0.post4/batchalign/models/wave2vec/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/models/whisper/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/__init__.py +0 -19
- batchalign-0.8.0.post4/batchalign/pipelines/analysis/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/pipelines/asr/__init__.py +0 -4
- batchalign-0.8.0.post4/batchalign/pipelines/avqi/__init__.py +0 -8
- batchalign-0.8.0.post4/batchalign/pipelines/cleanup/__init__.py +0 -3
- batchalign-0.8.0.post4/batchalign/pipelines/diarization/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/pipelines/fa/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/morphosyntax/__init__.py +0 -3
- batchalign-0.8.0.post4/batchalign/pipelines/opensmile/__init__.py +0 -7
- batchalign-0.8.0.post4/batchalign/pipelines/speaker/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/pipelines/translate/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/utr/__init__.py +0 -2
- batchalign-0.8.0.post4/batchalign/pipelines/utterance/__init__.py +0 -1
- batchalign-0.8.0.post4/batchalign/version +0 -3
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/LICENSE +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/MANIFEST.in +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/README.md +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/__main__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/constants.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/document.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/errors.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/base.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/file.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/generator.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/lexer.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/parser.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/chat/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/file.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/resolve.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/config.yaml +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/infer.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/speaker/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/run.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/training/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/dataset.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/execute.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/prep.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/train.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/por.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/oai_whisper.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/base.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/translate/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/conftest.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/tests/test_document.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/__init__.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/abbrev.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/compounds.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/config.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/dp.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/names.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/utils/utils.py +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/dependency_links.txt +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/entry_points.txt +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign.egg-info/top_level.txt +0 -0
- {batchalign-0.8.0.post4 → batchalign-0.8.1}/setup.cfg +0 -0
{batchalign-0.8.0.post4/batchalign.egg-info → batchalign-0.8.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: batchalign
-Version: 0.8.0.post4
+Version: 0.8.1
 Summary: Python Speech Language Sample Analysis
 Author: Brian MacWhinney, Houjun Liu
 Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: pydantic>=2.4
+Requires-Dist: platformdirs>=4.3.0
+Requires-Dist: filelock>=3.0.0
 Requires-Dist: nltk>=3.8
 Requires-Dist: praatio<6.1.0,>=6.0.0
 Requires-Dist: torch>=2.6.0
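The two new dependencies line up with the cache work in this release: platformdirs is the usual way to pick a per-user cache directory, and filelock guards it against concurrent workers. A minimal sketch of that combination (the directory layout and lock-file name here are assumptions for illustration, not batchalign's actual code):

# Sketch: per-user cache dir + cross-process lock, the pattern these deps enable.
from pathlib import Path

from filelock import FileLock
from platformdirs import user_cache_dir

cache_root = Path(user_cache_dir("batchalign"))   # e.g. ~/.cache/batchalign on Linux
cache_root.mkdir(parents=True, exist_ok=True)

with FileLock(str(cache_root / "cache.lock")):    # serialize writers across processes
    (cache_root / "entries.db").touch()           # placeholder for the real store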
batchalign-0.8.1/batchalign/__init__.py

@@ -0,0 +1,48 @@
+import os
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = str(1)
+
+import logging
+
+# clear all of nemo's loggers
+logging.getLogger().handlers.clear()
+logging.getLogger('nemo_logger').handlers.clear()
+logging.getLogger().setLevel(logging.CRITICAL)
+logging.getLogger('nemo_logger').disabled = True
+
+from .document import *
+from .constants import *
+from .errors import *
+
+# Defer slow imports
+# from .formats import *
+# from .pipelines import *
+# from .models import *
+# from .cli import batchalign as cli
+
+def __getattr__(name):
+    if name == 'cli':
+        from .cli import batchalign
+        return batchalign
+    if name == 'BatchalignPipeline':
+        from .pipelines import BatchalignPipeline
+        return BatchalignPipeline
+    if name == 'CHATFile':
+        from .formats.chat import CHATFile
+        return CHATFile
+    # Add other common engines if needed for dispatch.py
+    if name in ['WhisperEngine', 'WhisperFAEngine', 'StanzaEngine', 'RevEngine',
+                'NgramRetraceEngine', 'DisfluencyReplacementEngine', 'WhisperUTREngine',
+                'RevUTREngine', 'EvaluationEngine', 'WhisperXEngine', 'NemoSpeakerEngine',
+                'StanzaUtteranceEngine', 'CorefEngine', 'Wave2VecFAEngine', 'SeamlessTranslationModel',
+                'GoogleTranslateEngine', 'OAIWhisperEngine', 'PyannoteEngine']:
+        from .pipelines import dispatch
+        # This is a bit recursive, let's just let dispatch import them locally
+        # which it already does now.
+        import importlib
+        # We need to find which subpackage it's in.
+        # Actually, if we use local imports in dispatch.py, we don't need these here.
+        pass
+
+    raise AttributeError(f"module {__name__} has no attribute {name}")
+
+logging.getLogger('nemo_logger').disabled = False
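This new top-level __init__.py, like the formats/, models/, and utterance __init__.py files further down, defers its heavy imports through a module-level __getattr__ (PEP 562): `import batchalign` stays cheap, and torch/transformers/stanza only load when a name is first accessed. A stripped-down sketch of the pattern, with a hypothetical `heavy` submodule standing in:

# pkg/__init__.py: minimal PEP 562 lazy-import sketch.
# `HeavyThing` and `.heavy` are placeholder names, not batchalign's.

def __getattr__(name):
    # Invoked only when normal module attribute lookup fails, so the
    # costly import is paid on first access, not at import time.
    if name == 'HeavyThing':
        from .heavy import HeavyThing
        return HeavyThing
    raise AttributeError(f"module {__name__} has no attribute {name}")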
batchalign-0.8.1/batchalign/cli/cache.py

@@ -0,0 +1,263 @@
+"""
+cache.py
+CLI subcommand for managing the Batchalign cache.
+
+Provides commands to:
+- Show cache statistics (--stats)
+- Clear all cached data (--clear)
+- Prepopulate cache from existing CHAT files (--warm)
+"""
+
+import os
+from pathlib import Path
+
+import rich_click as click
+from rich.console import Console
+
+C = Console()
+
+
+def _format_bytes(count: int | None, precision: int = 2) -> str:
+    """Format byte count as human-readable string."""
+    if count is None:
+        return "unknown"
+    units = ["B", "KB", "MB", "GB", "TB"]
+    idx = 0
+    size = float(count)
+    while size >= 1024 and idx < len(units) - 1:
+        size /= 1024
+        idx += 1
+    if idx == 0:
+        return f"{int(size)} {units[idx]}"
+    return f"{size:.{precision}f} {units[idx]}"
+
+
+@click.group(invoke_without_command=True)
+@click.option("--stats", is_flag=True, help="Show cache statistics.")
+@click.option(
+    "--clear",
+    is_flag=True,
+    help="Clear all cached data (requires confirmation)."
+)
+@click.pass_context
+def cache(ctx, stats, clear):
+    """Manage the Batchalign cache.
+
+    The cache stores per-utterance analysis results to avoid redundant
+    computation when re-processing unchanged content.
+
+    Examples:
+        batchalign cache --stats
+        batchalign cache --clear
+        batchalign cache warm INPUT_DIR --lang eng
+    """
+    # Handle --stats flag
+    if stats:
+        ctx.invoke(show_stats)
+        return
+
+    # Handle --clear flag
+    if clear:
+        ctx.invoke(clear_cache)
+        return
+
+    # If no flags and no subcommand, show help
+    if ctx.invoked_subcommand is None:
+        click.echo(ctx.get_help())
+
+
+@cache.command("stats")
+def show_stats():
+    """Show cache statistics."""
+    from batchalign.pipelines.cache import CacheManager
+
+    manager = CacheManager()
+    stats = manager.stats()
+
+    C.print()
+    C.print("[bold]Batchalign Cache Statistics[/bold]")
+    C.print("-" * 35)
+    C.print(f"[cyan]Location:[/cyan] {stats['location']}")
+    C.print(f"[cyan]Size:[/cyan] {_format_bytes(stats['size_bytes'])}")
+    C.print(f"[cyan]Entries:[/cyan] {stats['total_entries']:,}")
+    C.print()
+
+    # Show breakdown by task
+    if stats["by_task"]:
+        C.print("[bold]By task:[/bold]")
+        for task, count in sorted(stats["by_task"].items()):
+            C.print(f"  {task}: {count:,} entries")
+        C.print()
+
+    # Show breakdown by engine version
+    if stats["by_engine_version"]:
+        # Get current stanza version to mark outdated entries
+        try:
+            import stanza
+            current_stanza = stanza.__version__
+        except ImportError:
+            current_stanza = None
+
+        C.print("[bold]Engine versions:[/bold]")
+        for key, count in sorted(stats["by_engine_version"].items()):
+            # Check if this version is outdated
+            outdated = ""
+            if current_stanza and "morphosyntax" in key:
+                version_part = key.split()[-1] if " " in key else ""
+                if version_part and version_part != current_stanza:
+                    outdated = " [dim](outdated)[/dim]"
+            C.print(f"  {key}: {count:,} entries{outdated}")
+        C.print()
+
+
+@cache.command("clear")
+@click.confirmation_option(
+    prompt="Are you sure you want to clear all cached data?"
+)
+def clear_cache():
+    """Clear all cached data."""
+    from batchalign.pipelines.cache import CacheManager
+
+    manager = CacheManager()
+    stats = manager.stats()
+    entries_before = stats["total_entries"]
+
+    bytes_freed = manager.clear()
+
+    C.print()
+    C.print(f"[bold green]Cache cleared.[/bold green]")
+    C.print(f"  Entries removed: {entries_before:,}")
+    C.print(f"  Space freed: {_format_bytes(bytes_freed)}")
+    C.print()
+
+
+@cache.command("warm")
+@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
+@click.option(
+    "--lang",
+    default="eng",
+    help="Language code (3-letter ISO). Default: eng"
+)
+@click.option(
+    "--retokenize/--keeptokens",
+    default=False,
+    help="Whether files were processed with retokenization."
+)
+def warm_cache(input_dir, lang, retokenize):
+    """Prepopulate cache from existing CHAT files with %mor/%gra tiers.
+
+    Reads CHAT files that already have morphosyntactic analysis (%mor and %gra
+    tiers) and populates the cache with their content. This allows subsequent
+    processing of identical utterances to use cached results.
+
+    IMPORTANT: The command trusts the input files. It does not validate that
+    the %mor/%gra content is correct.
+    """
+    from batchalign.pipelines.cache import (
+        CacheManager, MorphotagCacheKey, _get_batchalign_version
+    )
+    from batchalign.formats.chat import CHATFile
+    from batchalign.document import Utterance
+
+    # Get engine version
+    try:
+        import stanza
+        engine_version = stanza.__version__
+    except ImportError:
+        C.print("[bold red]Error:[/bold red] stanza is not installed. Cannot warm cache.")
+        return
+
+    manager = CacheManager()
+    key_gen = MorphotagCacheKey()
+    ba_version = _get_batchalign_version()
+
+    # Collect all .cha files
+    cha_files = []
+    for root, dirs, files in os.walk(input_dir):
+        for f in files:
+            if f.lower().endswith(".cha"):
+                cha_files.append(os.path.join(root, f))
+
+    if not cha_files:
+        C.print(f"[bold yellow]No .cha files found in {input_dir}[/bold yellow]")
+        return
+
+    C.print(f"\nWarming cache from {len(cha_files)} CHAT file(s)...")
+    C.print(f"  Language: {lang}")
+    C.print(f"  Retokenize: {retokenize}")
+    C.print(f"  Stanza version: {engine_version}")
+    C.print()
+
+    entries_added = 0
+    entries_skipped = 0
+    files_processed = 0
+
+    for cha_path in cha_files:
+        try:
+            cf = CHATFile(path=cha_path, special_mor_=True)
+            doc = cf.doc
+
+            # Map for batching within a file
+            utterances_to_check = []
+            idx_to_key = {}
+
+            for idx, item in enumerate(doc.content):
+                if not isinstance(item, Utterance):
+                    continue
+
+                # Check if utterance has morphology/dependency
+                has_morphology = any(
+                    form.morphology and len(form.morphology) > 0
+                    for form in item.content
+                )
+                has_dependency = any(
+                    form.dependency and len(form.dependency) > 0
+                    for form in item.content
+                )
+
+                if not (has_morphology or has_dependency):
+                    continue
+
+                # Generate cache key
+                key = key_gen.generate_key(
+                    item,
+                    lang=lang,
+                    retokenize=retokenize,
+                    mwt={}
+                )
+                utterances_to_check.append((idx, key))
+                idx_to_key[idx] = key
+
+            if not utterances_to_check:
+                files_processed += 1
+                continue
+
+            # Batch check
+            keys = [k for _, k in utterances_to_check]
+            cached_results = manager.get_batch(keys, "morphosyntax", engine_version)
+
+            entries_skipped += len(cached_results)
+
+            # Filter out already cached ones and prepare for batch put
+            to_put = []
+            for idx, key in utterances_to_check:
+                if key not in cached_results:
+                    item = doc.content[idx]
+                    data = key_gen.serialize_output(item)
+                    to_put.append((key, data))
+
+            if to_put:
+                manager.put_batch(to_put, "morphosyntax", engine_version, ba_version)
+                entries_added += len(to_put)
+
+            files_processed += 1
+
+        except Exception as e:
+            C.print(f"[yellow]Warning:[/yellow] Could not process {cha_path}: {e}")
+            continue
+
+    C.print(f"[bold green]Cache warming complete.[/bold green]")
+    C.print(f"  Files processed: {files_processed}")
+    C.print(f"  Entries added: {entries_added}")
+    C.print(f"  Entries skipped (already cached): {entries_skipped}")
+    C.print()
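Because `cache` is a plain Click group, the new subcommands can be exercised in-process rather than through a shell, which is presumably how batchalign/tests/pipelines/cache/test_cache.py drives them. A hedged sketch using Click's test runner:

# Sketch: invoking the new cache CLI in-process.
from click.testing import CliRunner

from batchalign.cli.cache import cache

runner = CliRunner()
result = runner.invoke(cache, ["--stats"])   # same as `batchalign cache --stats`
print(result.exit_code, result.output)

# `clear` asks for confirmation; input="y\n" answers the prompt.
result = runner.invoke(cache, ["clear"], input="y\n")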
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/cli.py

@@ -92,6 +92,9 @@ def batchalign(ctx, verbose, workers):
 
 batchalign.add_command(train, "models")
 
+from batchalign.cli.cache import cache
+batchalign.add_command(cache, "cache")
+
 #################### ALIGN ################################
 
 @batchalign.command()
@@ -230,6 +233,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
               type=click.Path(exists=True,
                               file_okay=True, dir_okay=False),
              help="Comma seperated manual lexicon override")
+@click.option("--override-cache/--use-cache",
+              default=False, help="Bypass cache and recompute all utterances.")
 @click.pass_context
 def morphotag(ctx, in_dir, out_dir, **kwargs):
     """Perform morphosyntactic analysis on transcripts."""
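The `--override-cache/--use-cache` option uses Click's paired-flag syntax: both spellings feed a single boolean parameter, with the `--use-cache` side matching the `default=False`. A self-contained sketch of the behavior (the `demo` command is made up; the option itself mirrors the diff):

# Sketch of Click's on/off flag pair, as added to morphotag above.
import rich_click as click  # drop-in compatible with plain click

@click.command()
@click.option("--override-cache/--use-cache", default=False,
              help="Bypass cache and recompute all utterances.")
def demo(override_cache):
    # `demo --override-cache` -> True; `demo --use-cache` or no flag -> False.
    click.echo(f"override_cache={override_cache}")

if __name__ == "__main__":
    demo()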
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/cli/dispatch.py

@@ -94,8 +94,9 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
     else:
         baL.setLevel(logging.DEBUG)
 
-    # Always capture output to avoid interleaving with progress rendering
-
+    # Always capture output to avoid interleaving with progress rendering,
+    # unless high verbosity is requested for debugging.
+    should_capture = verbose < 2
 
     if should_capture:
         # Use a temporary file to capture ALL output at the FD level
@@ -128,6 +129,7 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
     mwt = kwargs.pop("mwt", {})
     retokenize = kwargs.pop("retokenize", False)
     skipmultilang = kwargs.pop("skipmultilang", False)
+    override_cache = kwargs.pop("override_cache", False)
 
     cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
     doc = cf.doc
@@ -138,7 +140,8 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
     pipeline_kwargs = {
         "retokenize": retokenize,
        "skipmultilang": skipmultilang,
-        "mwt": mwt
+        "mwt": mwt,
+        "override_cache": override_cache
     }
     # Add any remaining kwargs
     pipeline_kwargs.update(kwargs)
batchalign-0.8.1/batchalign/formats/__init__.py

@@ -0,0 +1,11 @@
+# from .chat import CHATFile
+# from .textgrid import TextGridFile
+
+def __getattr__(name):
+    if name == 'CHATFile':
+        from .chat import CHATFile
+        return CHATFile
+    if name == 'TextGridFile':
+        from .textgrid import TextGridFile
+        return TextGridFile
+    raise AttributeError(f"module {__name__} has no attribute {name}")
batchalign-0.8.1/batchalign/models/__init__.py

@@ -0,0 +1,33 @@
+# from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
+# from .whisper import WhisperASRModel, WhisperFAModel
+# from .speaker import NemoSpeakerModel
+# from .utils import ASRAudioFile
+# from .resolve import resolve
+# from .wave2vec import Wave2VecFAModel
+
+def __getattr__(name):
+    if name == 'BertUtteranceModel':
+        from .utterance import BertUtteranceModel
+        return BertUtteranceModel
+    if name == 'BertCantoneseUtteranceModel':
+        from .utterance import BertCantoneseUtteranceModel
+        return BertCantoneseUtteranceModel
+    if name == 'WhisperASRModel':
+        from .whisper import WhisperASRModel
+        return WhisperASRModel
+    if name == 'WhisperFAModel':
+        from .whisper import WhisperFAModel
+        return WhisperFAModel
+    if name == 'NemoSpeakerModel':
+        from .speaker import NemoSpeakerModel
+        return NemoSpeakerModel
+    if name == 'ASRAudioFile':
+        from .utils import ASRAudioFile
+        return ASRAudioFile
+    if name == 'resolve':
+        from .resolve import resolve
+        return resolve
+    if name == 'Wave2VecFAModel':
+        from .wave2vec import Wave2VecFAModel
+        return Wave2VecFAModel
+    raise AttributeError(f"module {__name__} has no attribute {name}")
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utils.py

@@ -187,6 +187,37 @@ class ASRAudioFile:
 
         return data
 
+    def hash_chunk(self, begin_ms, end_ms):
+        """Generate a tiny SHA256 hash of a chunk of audio for caching."""
+        import hashlib
+        data = self.chunk(begin_ms, end_ms)
+        num_samples = data.numel()
+
+        # Tiny fingerprint: 100 samples from the middle + total length
+        if num_samples > 100:
+            mid = num_samples // 2
+            samples = data[mid-50:mid+50]
+        else:
+            samples = data
+
+        # Include length to catch simple duration changes
+        header = f"{num_samples}|".encode()
+        return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
+
+    def hash_all(self):
+        """Generate a tiny SHA256 hash of the entire audio file."""
+        import hashlib
+        num_samples = self.tensor.numel()
+
+        if num_samples > 100:
+            mid = num_samples // 2
+            samples = self.tensor[mid-50:mid+50]
+        else:
+            samples = self.tensor
+
+        header = f"{num_samples}|".encode()
+        return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
+
     def all(self):
         """Get the audio in its entirety
 
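hash_chunk and hash_all deliberately fingerprint only about 100 samples from the middle of the signal plus the total sample count, trading collision resistance for speed on long recordings. The same idea over a bare numpy array (a stand-in for the torch tensor, not batchalign code):

# Standalone sketch of the tiny-fingerprint scheme used above.
import hashlib
import numpy as np

def tiny_fingerprint(samples: np.ndarray) -> str:
    n = samples.size
    window = samples[n // 2 - 50 : n // 2 + 50] if n > 100 else samples
    # The length header catches trims/extensions that leave the middle intact.
    return hashlib.sha256(f"{n}|".encode() + window.tobytes()).hexdigest()

audio = np.sin(np.linspace(0, 440, 16000)).astype(np.float32)
assert tiny_fingerprint(audio) == tiny_fingerprint(audio.copy())  # deterministic
assert tiny_fingerprint(audio) != tiny_fingerprint(audio[:8000])  # length change detected

The accepted blind spot: two clips of equal length whose middle 100 samples match will collide, which is presumably tolerable for a cache key derived alongside the utterance text.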
batchalign-0.8.1/batchalign/models/utterance/__init__.py

@@ -0,0 +1,13 @@
+# from .infer import BertUtteranceModel
+# from .cantonese_infer import BertCantoneseUtteranceModel
+
+def __getattr__(name):
+    if name == 'BertUtteranceModel':
+        from .infer import BertUtteranceModel
+        return BertUtteranceModel
+    if name == 'BertCantoneseUtteranceModel':
+        from .cantonese_infer import BertCantoneseUtteranceModel
+        return BertCantoneseUtteranceModel
+    raise AttributeError(f"module {__name__} has no attribute {name}")
+
+
{batchalign-0.8.0.post4 → batchalign-0.8.1}/batchalign/models/utterance/cantonese_infer.py

@@ -1,43 +1,35 @@
 import re
 import string
 import random
+import logging
 
-
-import nltk
-from nltk import word_tokenize, sent_tokenize
+L = logging.getLogger("batchalign")
 
-#
-import torch
-from torch.utils.data import dataset
-from torch.utils.data.dataloader import DataLoader
-from torch.optim import AdamW
-
-# import huggingface utils
-from transformers import AutoTokenizer, BertForTokenClassification
-from transformers import DataCollatorForTokenClassification
-
-# tqdm
-from tqdm import tqdm
-
-# seed device and tokens
-DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+# heavy imports moved to local scope
 
 # seed model
 class BertCantoneseUtteranceModel(object):
 
     def __init__(self, model):
+        import torch
+        from transformers import AutoTokenizer, BertForTokenClassification
+
+        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
         # seed tokenizers and model
         self.tokenizer = AutoTokenizer.from_pretrained(model)
-        self.model = BertForTokenClassification.from_pretrained(model).to(DEVICE)
+        self.model = BertForTokenClassification.from_pretrained(model).to(device)
+        self.device = device
         self.max_length = 512
         self.overlap = 20
 
         # eval mode
         self.model.eval()
-
-
+        L.debug(f"Model and tokenizer initialized on device: {device}")
+        L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
 
     def __call__(self, passage):
+        import torch
         # Step 1: Clean up passage
         passage = passage.lower()
         passage = passage.replace('.','')
@@ -78,15 +70,14 @@
                 chunks.append(passage[start:])
                 break
 
-
-        print(f"Created {len(chunks)} chunks based on keywords.")
+        L.debug(f"Created {len(chunks)} chunks based on keywords.")
         for i, chunk in enumerate(chunks):
-
-
+            L.debug(f"Chunk {i + 1}: {chunk[:100]}...")
+
         # Step 3: Process each chunk and restore punctuation
         final_passage = []
         for chunk_index, chunk in enumerate(chunks):
-
+            L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
 
             # Step 3.1: Split chunk by characters (Chinese tokenization)
             tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
@@ -97,13 +88,13 @@
                 truncation=True,
                 padding=True,
                 max_length=self.max_length,
-                is_split_into_words=True).to(DEVICE)
+                is_split_into_words=True).to(self.device)
 
             try:
                 # Pass it through the model
                 res = self.model(**tokd).logits
             except Exception as e:
-
+                L.error(f"Error during model inference: {e}")
                 return []
 
             # Argmax for classification
@@ -152,7 +143,7 @@
         # Step 4: Join processed chunks together into the final passage
         final_passage = ' '.join(final_passage)
 
-
+        L.debug("Text processing completed. Generating final output...")
 
         # Optionally, tokenize the final text into sentences based on punctuation
         def custom_sent_tokenize(text):
@@ -163,32 +154,29 @@
            # Split the passage based on punctuation marks and keep them
            parts = re.split(sentence_endings, text)
 
-
-            print(f"Parts after splitting: {parts}")
+            L.debug(f"Parts after splitting: {parts}")
 
            # Combine parts and punctuation together
            for i in range(0, len(parts) - 1, 2):
                sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
-
-
+                L.debug(f"Sentence formed: {sentence}")
+
                if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
                    split_passage.append(sentence)
 
            # If the last part doesn't have punctuation, we handle it here
            if len(parts) % 2 != 0: # If there's no punctuation at the end
                last_part = parts[-1].strip()
-
-
+                L.debug(f"Last part without punctuation: {last_part}")
+
                if last_part: # Only add non-empty sentences
                    split_passage.append(last_part)
-
-
-            print(f"Final split passage: {split_passage}")
+
+            L.debug(f"Final split passage: {split_passage}")
            return split_passage
 
        split_passage = custom_sent_tokenize(final_passage)
 
-
-        print(f"Final sentences: {split_passage}")
+        L.debug(f"Final sentences: {split_passage}")
 
        return split_passage
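All of the print statements removed in these hunks become DEBUG records on the "batchalign" logger, and the new top-level __init__.py sets the root logger to CRITICAL, so they are silent by default. One way a caller could surface them again (the handler setup here is an assumption; the diff itself configures none):

# Sketch: re-enable the chunking/punctuation debug output shown above.
import logging

L = logging.getLogger("batchalign")
L.setLevel(logging.DEBUG)
L.addHandler(logging.StreamHandler())  # emit records to stderr

# BertCantoneseUtteranceModel.__call__ will now log its
# "Created ... chunks", "Processing chunk ..." messages.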