BatchalignHK 0.8.0.post6__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/BatchalignHK.egg-info/PKG-INFO +3 -1
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/BatchalignHK.egg-info/SOURCES.txt +4 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/BatchalignHK.egg-info/requires.txt +2 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/PKG-INFO +3 -1
- batchalignhk-0.8.1/batchalign/__init__.py +48 -0
- batchalignhk-0.8.1/batchalign/cli/cache.py +263 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/cli/cli.py +5 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/cli/dispatch.py +6 -3
- batchalignhk-0.8.1/batchalign/formats/__init__.py +11 -0
- batchalignhk-0.8.1/batchalign/models/__init__.py +33 -0
- batchalignhk-0.8.1/batchalign/models/speaker/__init__.py +7 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utils.py +31 -0
- batchalignhk-0.8.1/batchalign/models/utterance/__init__.py +13 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/cantonese_infer.py +17 -31
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/infer.py +13 -23
- batchalignhk-0.8.1/batchalign/models/wave2vec/__init__.py +7 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/wave2vec/infer_fa.py +16 -31
- batchalignhk-0.8.1/batchalign/models/whisper/__init__.py +11 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/whisper/infer_asr.py +16 -30
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/whisper/infer_fa.py +21 -17
- batchalignhk-0.8.1/batchalign/pipelines/__init__.py +49 -0
- batchalignhk-0.8.1/batchalign/pipelines/analysis/__init__.py +15 -0
- batchalignhk-0.8.1/batchalign/pipelines/asr/__init__.py +24 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/rev.py +6 -1
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/whisperx.py +9 -17
- batchalignhk-0.8.1/batchalign/pipelines/avqi/__init__.py +15 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/avqi/engine.py +6 -5
- batchalignhk-0.8.1/batchalign/pipelines/cache.py +735 -0
- batchalignhk-0.8.1/batchalign/pipelines/cleanup/__init__.py +18 -0
- batchalignhk-0.8.1/batchalign/pipelines/diarization/__init__.py +15 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/diarization/pyannote.py +5 -17
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/dispatch.py +26 -9
- batchalignhk-0.8.1/batchalign/pipelines/fa/__init__.py +18 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/fa/wave2vec_fa.py +49 -10
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/fa/whisper_fa.py +52 -10
- batchalignhk-0.8.1/batchalign/pipelines/morphosyntax/__init__.py +18 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/coref.py +1 -1
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/ud.py +147 -21
- batchalignhk-0.8.1/batchalign/pipelines/opensmile/__init__.py +15 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/opensmile/engine.py +22 -12
- batchalignhk-0.8.1/batchalign/pipelines/speaker/__init__.py +15 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/speaker/nemo_speaker.py +4 -2
- batchalignhk-0.8.1/batchalign/pipelines/translate/__init__.py +18 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/translate/gtrans.py +2 -1
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/translate/seamless.py +2 -1
- batchalignhk-0.8.1/batchalign/pipelines/utr/__init__.py +18 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/utr/rev_utr.py +8 -2
- batchalignhk-0.8.1/batchalign/pipelines/utterance/__init__.py +15 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/utterance/ud_utterance.py +95 -41
- batchalignhk-0.8.1/batchalign/tests/pipelines/cache/__init__.py +1 -0
- batchalignhk-0.8.1/batchalign/tests/pipelines/cache/test_cache.py +407 -0
- batchalignhk-0.8.1/batchalign/version +3 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/setup.py +2 -0
- batchalignhk-0.8.0.post6/batchalign/__init__.py +0 -19
- batchalignhk-0.8.0.post6/batchalign/formats/__init__.py +0 -2
- batchalignhk-0.8.0.post6/batchalign/models/__init__.py +0 -6
- batchalignhk-0.8.0.post6/batchalign/models/speaker/__init__.py +0 -1
- batchalignhk-0.8.0.post6/batchalign/models/utterance/__init__.py +0 -4
- batchalignhk-0.8.0.post6/batchalign/models/wave2vec/__init__.py +0 -1
- batchalignhk-0.8.0.post6/batchalign/models/whisper/__init__.py +0 -2
- batchalignhk-0.8.0.post6/batchalign/pipelines/__init__.py +0 -20
- batchalignhk-0.8.0.post6/batchalign/pipelines/analysis/__init__.py +0 -1
- batchalignhk-0.8.0.post6/batchalign/pipelines/asr/__init__.py +0 -7
- batchalignhk-0.8.0.post6/batchalign/pipelines/avqi/__init__.py +0 -8
- batchalignhk-0.8.0.post6/batchalign/pipelines/cleanup/__init__.py +0 -3
- batchalignhk-0.8.0.post6/batchalign/pipelines/diarization/__init__.py +0 -1
- batchalignhk-0.8.0.post6/batchalign/pipelines/fa/__init__.py +0 -4
- batchalignhk-0.8.0.post6/batchalign/pipelines/morphosyntax/__init__.py +0 -3
- batchalignhk-0.8.0.post6/batchalign/pipelines/opensmile/__init__.py +0 -7
- batchalignhk-0.8.0.post6/batchalign/pipelines/speaker/__init__.py +0 -1
- batchalignhk-0.8.0.post6/batchalign/pipelines/translate/__init__.py +0 -2
- batchalignhk-0.8.0.post6/batchalign/pipelines/utr/__init__.py +0 -4
- batchalignhk-0.8.0.post6/batchalign/pipelines/utterance/__init__.py +0 -1
- batchalignhk-0.8.0.post6/batchalign/version +0 -3
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/BatchalignHK.egg-info/dependency_links.txt +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/BatchalignHK.egg-info/entry_points.txt +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/BatchalignHK.egg-info/top_level.txt +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/LICENSE +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/MANIFEST.in +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/README.md +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/__main__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/cli/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/constants.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/document.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/errors.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/core.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/exception.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/logging.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/realtime_meeting.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/speech_recognizer.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/speech_synthesizer.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/speech_transcriber.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/stream_input_tts.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/token.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/util.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/version.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_abnf.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_app.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_cookiejar.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_core.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_exceptions.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_handshake.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_http.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_logging.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_socket.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_ssl_compat.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_url.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/_utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/echo-server.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/test_abnf.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/test_app.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/test_cookiejar.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/test_http.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/test_url.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/extern/nls/websocket/tests/test_websocket.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/base.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/chat/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/chat/file.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/chat/generator.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/chat/lexer.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/chat/parser.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/chat/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/textgrid/file.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/resolve.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/speaker/config.yaml +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/speaker/infer.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/speaker/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/training/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/training/run.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/training/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/dataset.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/execute.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/prep.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/train.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/aliyun.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/funaudio.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/por.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/oai_whisper.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/tencent.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/base.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/fa/iic_fa.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/fa/wave2vec_fa_canto.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/pipeline.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/translate/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/utr/funaudio_utr.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/utr/tencent_utr.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/conftest.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/tests/test_document.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/__init__.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/abbrev.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/compounds.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/config.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/dp.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/names.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/utils/utils.py +0 -0
- {batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/setup.cfg +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: BatchalignHK
|
|
3
|
-
Version: 0.8.
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
|
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
|
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Requires-Dist: pydantic>=2.4
|
|
12
|
+
Requires-Dist: platformdirs>=4.3.0
|
|
13
|
+
Requires-Dist: filelock>=3.0.0
|
|
12
14
|
Requires-Dist: nltk>=3.8
|
|
13
15
|
Requires-Dist: praatio<6.1.0,>=6.0.0
|
|
14
16
|
Requires-Dist: torch>=2.6.0
|
|
@@ -15,6 +15,7 @@ batchalign/document.py
|
|
|
15
15
|
batchalign/errors.py
|
|
16
16
|
batchalign/version
|
|
17
17
|
batchalign/cli/__init__.py
|
|
18
|
+
batchalign/cli/cache.py
|
|
18
19
|
batchalign/cli/cli.py
|
|
19
20
|
batchalign/cli/dispatch.py
|
|
20
21
|
batchalign/extern/nls/__init__.py
|
|
@@ -86,6 +87,7 @@ batchalign/models/whisper/infer_asr.py
|
|
|
86
87
|
batchalign/models/whisper/infer_fa.py
|
|
87
88
|
batchalign/pipelines/__init__.py
|
|
88
89
|
batchalign/pipelines/base.py
|
|
90
|
+
batchalign/pipelines/cache.py
|
|
89
91
|
batchalign/pipelines/dispatch.py
|
|
90
92
|
batchalign/pipelines/pipeline.py
|
|
91
93
|
batchalign/pipelines/analysis/__init__.py
|
|
@@ -169,6 +171,8 @@ batchalign/tests/pipelines/test_pipeline_models.py
|
|
|
169
171
|
batchalign/tests/pipelines/analysis/test_eval.py
|
|
170
172
|
batchalign/tests/pipelines/asr/test_asr_pipeline.py
|
|
171
173
|
batchalign/tests/pipelines/asr/test_asr_utils.py
|
|
174
|
+
batchalign/tests/pipelines/cache/__init__.py
|
|
175
|
+
batchalign/tests/pipelines/cache/test_cache.py
|
|
172
176
|
batchalign/tests/pipelines/cleanup/test_disfluency.py
|
|
173
177
|
batchalign/tests/pipelines/cleanup/test_parse_support.py
|
|
174
178
|
batchalign/tests/pipelines/fa/test_fa_pipeline.py
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: BatchalignHK
|
|
3
|
-
Version: 0.8.
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
|
@@ -9,6 +9,8 @@ Classifier: Topic :: Utilities
|
|
|
9
9
|
Description-Content-Type: text/markdown
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
Requires-Dist: pydantic>=2.4
|
|
12
|
+
Requires-Dist: platformdirs>=4.3.0
|
|
13
|
+
Requires-Dist: filelock>=3.0.0
|
|
12
14
|
Requires-Dist: nltk>=3.8
|
|
13
15
|
Requires-Dist: praatio<6.1.0,>=6.0.0
|
|
14
16
|
Requires-Dist: torch>=2.6.0
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import os
|
|
2
|
+
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = str(1)
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
|
|
6
|
+
# clear all of nemo's loggers
|
|
7
|
+
logging.getLogger().handlers.clear()
|
|
8
|
+
logging.getLogger('nemo_logger').handlers.clear()
|
|
9
|
+
logging.getLogger().setLevel(logging.CRITICAL)
|
|
10
|
+
logging.getLogger('nemo_logger').disabled = True
|
|
11
|
+
|
|
12
|
+
from .document import *
|
|
13
|
+
from .constants import *
|
|
14
|
+
from .errors import *
|
|
15
|
+
|
|
16
|
+
# Defer slow imports
|
|
17
|
+
# from .formats import *
|
|
18
|
+
# from .pipelines import *
|
|
19
|
+
# from .models import *
|
|
20
|
+
# from .cli import batchalign as cli
|
|
21
|
+
|
|
22
|
+
def __getattr__(name):
|
|
23
|
+
if name == 'cli':
|
|
24
|
+
from .cli import batchalign
|
|
25
|
+
return batchalign
|
|
26
|
+
if name == 'BatchalignPipeline':
|
|
27
|
+
from .pipelines import BatchalignPipeline
|
|
28
|
+
return BatchalignPipeline
|
|
29
|
+
if name == 'CHATFile':
|
|
30
|
+
from .formats.chat import CHATFile
|
|
31
|
+
return CHATFile
|
|
32
|
+
# Add other common engines if needed for dispatch.py
|
|
33
|
+
if name in ['WhisperEngine', 'WhisperFAEngine', 'StanzaEngine', 'RevEngine',
|
|
34
|
+
'NgramRetraceEngine', 'DisfluencyReplacementEngine', 'WhisperUTREngine',
|
|
35
|
+
'RevUTREngine', 'EvaluationEngine', 'WhisperXEngine', 'NemoSpeakerEngine',
|
|
36
|
+
'StanzaUtteranceEngine', 'CorefEngine', 'Wave2VecFAEngine', 'SeamlessTranslationModel',
|
|
37
|
+
'GoogleTranslateEngine', 'OAIWhisperEngine', 'PyannoteEngine']:
|
|
38
|
+
from .pipelines import dispatch
|
|
39
|
+
# This is a bit recursive, let's just let dispatch import them locally
|
|
40
|
+
# which it already does now.
|
|
41
|
+
import importlib
|
|
42
|
+
# We need to find which subpackage it's in.
|
|
43
|
+
# Actually, if we use local imports in dispatch.py, we don't need these here.
|
|
44
|
+
pass
|
|
45
|
+
|
|
46
|
+
raise AttributeError(f"module {__name__} has no attribute {name}")
|
|
47
|
+
|
|
48
|
+
logging.getLogger('nemo_logger').disabled = False
|
|
@@ -0,0 +1,263 @@
|
|
|
1
|
+
"""
|
|
2
|
+
cache.py
|
|
3
|
+
CLI subcommand for managing the Batchalign cache.
|
|
4
|
+
|
|
5
|
+
Provides commands to:
|
|
6
|
+
- Show cache statistics (--stats)
|
|
7
|
+
- Clear all cached data (--clear)
|
|
8
|
+
- Prepopulate cache from existing CHAT files (--warm)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import rich_click as click
|
|
15
|
+
from rich.console import Console
|
|
16
|
+
|
|
17
|
+
C = Console()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _format_bytes(count: int | None, precision: int = 2) -> str:
|
|
21
|
+
"""Format byte count as human-readable string."""
|
|
22
|
+
if count is None:
|
|
23
|
+
return "unknown"
|
|
24
|
+
units = ["B", "KB", "MB", "GB", "TB"]
|
|
25
|
+
idx = 0
|
|
26
|
+
size = float(count)
|
|
27
|
+
while size >= 1024 and idx < len(units) - 1:
|
|
28
|
+
size /= 1024
|
|
29
|
+
idx += 1
|
|
30
|
+
if idx == 0:
|
|
31
|
+
return f"{int(size)} {units[idx]}"
|
|
32
|
+
return f"{size:.{precision}f} {units[idx]}"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@click.group(invoke_without_command=True)
|
|
36
|
+
@click.option("--stats", is_flag=True, help="Show cache statistics.")
|
|
37
|
+
@click.option(
|
|
38
|
+
"--clear",
|
|
39
|
+
is_flag=True,
|
|
40
|
+
help="Clear all cached data (requires confirmation)."
|
|
41
|
+
)
|
|
42
|
+
@click.pass_context
|
|
43
|
+
def cache(ctx, stats, clear):
|
|
44
|
+
"""Manage the Batchalign cache.
|
|
45
|
+
|
|
46
|
+
The cache stores per-utterance analysis results to avoid redundant
|
|
47
|
+
computation when re-processing unchanged content.
|
|
48
|
+
|
|
49
|
+
Examples:
|
|
50
|
+
batchalign cache --stats
|
|
51
|
+
batchalign cache --clear
|
|
52
|
+
batchalign cache warm INPUT_DIR --lang eng
|
|
53
|
+
"""
|
|
54
|
+
# Handle --stats flag
|
|
55
|
+
if stats:
|
|
56
|
+
ctx.invoke(show_stats)
|
|
57
|
+
return
|
|
58
|
+
|
|
59
|
+
# Handle --clear flag
|
|
60
|
+
if clear:
|
|
61
|
+
ctx.invoke(clear_cache)
|
|
62
|
+
return
|
|
63
|
+
|
|
64
|
+
# If no flags and no subcommand, show help
|
|
65
|
+
if ctx.invoked_subcommand is None:
|
|
66
|
+
click.echo(ctx.get_help())
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@cache.command("stats")
|
|
70
|
+
def show_stats():
|
|
71
|
+
"""Show cache statistics."""
|
|
72
|
+
from batchalign.pipelines.cache import CacheManager
|
|
73
|
+
|
|
74
|
+
manager = CacheManager()
|
|
75
|
+
stats = manager.stats()
|
|
76
|
+
|
|
77
|
+
C.print()
|
|
78
|
+
C.print("[bold]Batchalign Cache Statistics[/bold]")
|
|
79
|
+
C.print("-" * 35)
|
|
80
|
+
C.print(f"[cyan]Location:[/cyan] {stats['location']}")
|
|
81
|
+
C.print(f"[cyan]Size:[/cyan] {_format_bytes(stats['size_bytes'])}")
|
|
82
|
+
C.print(f"[cyan]Entries:[/cyan] {stats['total_entries']:,}")
|
|
83
|
+
C.print()
|
|
84
|
+
|
|
85
|
+
# Show breakdown by task
|
|
86
|
+
if stats["by_task"]:
|
|
87
|
+
C.print("[bold]By task:[/bold]")
|
|
88
|
+
for task, count in sorted(stats["by_task"].items()):
|
|
89
|
+
C.print(f" {task}: {count:,} entries")
|
|
90
|
+
C.print()
|
|
91
|
+
|
|
92
|
+
# Show breakdown by engine version
|
|
93
|
+
if stats["by_engine_version"]:
|
|
94
|
+
# Get current stanza version to mark outdated entries
|
|
95
|
+
try:
|
|
96
|
+
import stanza
|
|
97
|
+
current_stanza = stanza.__version__
|
|
98
|
+
except ImportError:
|
|
99
|
+
current_stanza = None
|
|
100
|
+
|
|
101
|
+
C.print("[bold]Engine versions:[/bold]")
|
|
102
|
+
for key, count in sorted(stats["by_engine_version"].items()):
|
|
103
|
+
# Check if this version is outdated
|
|
104
|
+
outdated = ""
|
|
105
|
+
if current_stanza and "morphosyntax" in key:
|
|
106
|
+
version_part = key.split()[-1] if " " in key else ""
|
|
107
|
+
if version_part and version_part != current_stanza:
|
|
108
|
+
outdated = " [dim](outdated)[/dim]"
|
|
109
|
+
C.print(f" {key}: {count:,} entries{outdated}")
|
|
110
|
+
C.print()
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
@cache.command("clear")
|
|
114
|
+
@click.confirmation_option(
|
|
115
|
+
prompt="Are you sure you want to clear all cached data?"
|
|
116
|
+
)
|
|
117
|
+
def clear_cache():
|
|
118
|
+
"""Clear all cached data."""
|
|
119
|
+
from batchalign.pipelines.cache import CacheManager
|
|
120
|
+
|
|
121
|
+
manager = CacheManager()
|
|
122
|
+
stats = manager.stats()
|
|
123
|
+
entries_before = stats["total_entries"]
|
|
124
|
+
|
|
125
|
+
bytes_freed = manager.clear()
|
|
126
|
+
|
|
127
|
+
C.print()
|
|
128
|
+
C.print(f"[bold green]Cache cleared.[/bold green]")
|
|
129
|
+
C.print(f" Entries removed: {entries_before:,}")
|
|
130
|
+
C.print(f" Space freed: {_format_bytes(bytes_freed)}")
|
|
131
|
+
C.print()
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
@cache.command("warm")
|
|
135
|
+
@click.argument("input_dir", type=click.Path(exists=True, file_okay=False))
|
|
136
|
+
@click.option(
|
|
137
|
+
"--lang",
|
|
138
|
+
default="eng",
|
|
139
|
+
help="Language code (3-letter ISO). Default: eng"
|
|
140
|
+
)
|
|
141
|
+
@click.option(
|
|
142
|
+
"--retokenize/--keeptokens",
|
|
143
|
+
default=False,
|
|
144
|
+
help="Whether files were processed with retokenization."
|
|
145
|
+
)
|
|
146
|
+
def warm_cache(input_dir, lang, retokenize):
|
|
147
|
+
"""Prepopulate cache from existing CHAT files with %mor/%gra tiers.
|
|
148
|
+
|
|
149
|
+
Reads CHAT files that already have morphosyntactic analysis (%mor and %gra
|
|
150
|
+
tiers) and populates the cache with their content. This allows subsequent
|
|
151
|
+
processing of identical utterances to use cached results.
|
|
152
|
+
|
|
153
|
+
IMPORTANT: The command trusts the input files. It does not validate that
|
|
154
|
+
the %mor/%gra content is correct.
|
|
155
|
+
"""
|
|
156
|
+
from batchalign.pipelines.cache import (
|
|
157
|
+
CacheManager, MorphotagCacheKey, _get_batchalign_version
|
|
158
|
+
)
|
|
159
|
+
from batchalign.formats.chat import CHATFile
|
|
160
|
+
from batchalign.document import Utterance
|
|
161
|
+
|
|
162
|
+
# Get engine version
|
|
163
|
+
try:
|
|
164
|
+
import stanza
|
|
165
|
+
engine_version = stanza.__version__
|
|
166
|
+
except ImportError:
|
|
167
|
+
C.print("[bold red]Error:[/bold red] stanza is not installed. Cannot warm cache.")
|
|
168
|
+
return
|
|
169
|
+
|
|
170
|
+
manager = CacheManager()
|
|
171
|
+
key_gen = MorphotagCacheKey()
|
|
172
|
+
ba_version = _get_batchalign_version()
|
|
173
|
+
|
|
174
|
+
# Collect all .cha files
|
|
175
|
+
cha_files = []
|
|
176
|
+
for root, dirs, files in os.walk(input_dir):
|
|
177
|
+
for f in files:
|
|
178
|
+
if f.lower().endswith(".cha"):
|
|
179
|
+
cha_files.append(os.path.join(root, f))
|
|
180
|
+
|
|
181
|
+
if not cha_files:
|
|
182
|
+
C.print(f"[bold yellow]No .cha files found in {input_dir}[/bold yellow]")
|
|
183
|
+
return
|
|
184
|
+
|
|
185
|
+
C.print(f"\nWarming cache from {len(cha_files)} CHAT file(s)...")
|
|
186
|
+
C.print(f" Language: {lang}")
|
|
187
|
+
C.print(f" Retokenize: {retokenize}")
|
|
188
|
+
C.print(f" Stanza version: {engine_version}")
|
|
189
|
+
C.print()
|
|
190
|
+
|
|
191
|
+
entries_added = 0
|
|
192
|
+
entries_skipped = 0
|
|
193
|
+
files_processed = 0
|
|
194
|
+
|
|
195
|
+
for cha_path in cha_files:
|
|
196
|
+
try:
|
|
197
|
+
cf = CHATFile(path=cha_path, special_mor_=True)
|
|
198
|
+
doc = cf.doc
|
|
199
|
+
|
|
200
|
+
# Map for batching within a file
|
|
201
|
+
utterances_to_check = []
|
|
202
|
+
idx_to_key = {}
|
|
203
|
+
|
|
204
|
+
for idx, item in enumerate(doc.content):
|
|
205
|
+
if not isinstance(item, Utterance):
|
|
206
|
+
continue
|
|
207
|
+
|
|
208
|
+
# Check if utterance has morphology/dependency
|
|
209
|
+
has_morphology = any(
|
|
210
|
+
form.morphology and len(form.morphology) > 0
|
|
211
|
+
for form in item.content
|
|
212
|
+
)
|
|
213
|
+
has_dependency = any(
|
|
214
|
+
form.dependency and len(form.dependency) > 0
|
|
215
|
+
for form in item.content
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
if not (has_morphology or has_dependency):
|
|
219
|
+
continue
|
|
220
|
+
|
|
221
|
+
# Generate cache key
|
|
222
|
+
key = key_gen.generate_key(
|
|
223
|
+
item,
|
|
224
|
+
lang=lang,
|
|
225
|
+
retokenize=retokenize,
|
|
226
|
+
mwt={}
|
|
227
|
+
)
|
|
228
|
+
utterances_to_check.append((idx, key))
|
|
229
|
+
idx_to_key[idx] = key
|
|
230
|
+
|
|
231
|
+
if not utterances_to_check:
|
|
232
|
+
files_processed += 1
|
|
233
|
+
continue
|
|
234
|
+
|
|
235
|
+
# Batch check
|
|
236
|
+
keys = [k for _, k in utterances_to_check]
|
|
237
|
+
cached_results = manager.get_batch(keys, "morphosyntax", engine_version)
|
|
238
|
+
|
|
239
|
+
entries_skipped += len(cached_results)
|
|
240
|
+
|
|
241
|
+
# Filter out already cached ones and prepare for batch put
|
|
242
|
+
to_put = []
|
|
243
|
+
for idx, key in utterances_to_check:
|
|
244
|
+
if key not in cached_results:
|
|
245
|
+
item = doc.content[idx]
|
|
246
|
+
data = key_gen.serialize_output(item)
|
|
247
|
+
to_put.append((key, data))
|
|
248
|
+
|
|
249
|
+
if to_put:
|
|
250
|
+
manager.put_batch(to_put, "morphosyntax", engine_version, ba_version)
|
|
251
|
+
entries_added += len(to_put)
|
|
252
|
+
|
|
253
|
+
files_processed += 1
|
|
254
|
+
|
|
255
|
+
except Exception as e:
|
|
256
|
+
C.print(f"[yellow]Warning:[/yellow] Could not process {cha_path}: {e}")
|
|
257
|
+
continue
|
|
258
|
+
|
|
259
|
+
C.print(f"[bold green]Cache warming complete.[/bold green]")
|
|
260
|
+
C.print(f" Files processed: {files_processed}")
|
|
261
|
+
C.print(f" Entries added: {entries_added}")
|
|
262
|
+
C.print(f" Entries skipped (already cached): {entries_skipped}")
|
|
263
|
+
C.print()
|
|
@@ -92,6 +92,9 @@ def batchalign(ctx, verbose, workers):
|
|
|
92
92
|
|
|
93
93
|
batchalign.add_command(train, "models")
|
|
94
94
|
|
|
95
|
+
from batchalign.cli.cache import cache
|
|
96
|
+
batchalign.add_command(cache, "cache")
|
|
97
|
+
|
|
95
98
|
#################### ALIGN ################################
|
|
96
99
|
|
|
97
100
|
@batchalign.command()
|
|
@@ -254,6 +257,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
|
|
|
254
257
|
type=click.Path(exists=True,
|
|
255
258
|
file_okay=True, dir_okay=False),
|
|
256
259
|
help="Comma seperated manual lexicon override")
|
|
260
|
+
@click.option("--override-cache/--use-cache",
|
|
261
|
+
default=False, help="Bypass cache and recompute all utterances.")
|
|
257
262
|
@click.pass_context
|
|
258
263
|
def morphotag(ctx, in_dir, out_dir, **kwargs):
|
|
259
264
|
"""Perform morphosyntactic analysis on transcripts."""
|
|
@@ -95,8 +95,9 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
|
|
|
95
95
|
else:
|
|
96
96
|
baL.setLevel(logging.DEBUG)
|
|
97
97
|
|
|
98
|
-
# Always capture output to avoid interleaving with progress rendering
|
|
99
|
-
|
|
98
|
+
# Always capture output to avoid interleaving with progress rendering,
|
|
99
|
+
# unless high verbosity is requested for debugging.
|
|
100
|
+
should_capture = verbose < 2
|
|
100
101
|
|
|
101
102
|
if should_capture:
|
|
102
103
|
# Use a temporary file to capture ALL output at the FD level
|
|
@@ -129,6 +130,7 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
|
|
|
129
130
|
mwt = kwargs.pop("mwt", {})
|
|
130
131
|
retokenize = kwargs.pop("retokenize", False)
|
|
131
132
|
skipmultilang = kwargs.pop("skipmultilang", False)
|
|
133
|
+
override_cache = kwargs.pop("override_cache", False)
|
|
132
134
|
|
|
133
135
|
cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
|
|
134
136
|
doc = cf.doc
|
|
@@ -139,7 +141,8 @@ def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_inf
|
|
|
139
141
|
pipeline_kwargs = {
|
|
140
142
|
"retokenize": retokenize,
|
|
141
143
|
"skipmultilang": skipmultilang,
|
|
142
|
-
"mwt": mwt
|
|
144
|
+
"mwt": mwt,
|
|
145
|
+
"override_cache": override_cache
|
|
143
146
|
}
|
|
144
147
|
# Add any remaining kwargs
|
|
145
148
|
pipeline_kwargs.update(kwargs)
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
# from .chat import CHATFile
|
|
2
|
+
# from .textgrid import TextGridFile
|
|
3
|
+
|
|
4
|
+
def __getattr__(name):
    """Lazily import the format classes (PEP 562) so importing the package stays cheap.

    Only `CHATFile` and `TextGridFile` are exposed; anything else raises
    AttributeError as a normal module attribute lookup would.
    """
    if name == 'CHATFile':
        from . import chat
        return chat.CHATFile
    if name == 'TextGridFile':
        from . import textgrid
        return textgrid.TextGridFile
    raise AttributeError(f"module {__name__} has no attribute {name}")
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
# from .utterance import BertUtteranceModel, BertCantoneseUtteranceModel
|
|
2
|
+
# from .whisper import WhisperASRModel, WhisperFAModel
|
|
3
|
+
# from .speaker import NemoSpeakerModel
|
|
4
|
+
# from .utils import ASRAudioFile
|
|
5
|
+
# from .resolve import resolve
|
|
6
|
+
# from .wave2vec import Wave2VecFAModel
|
|
7
|
+
|
|
8
|
+
def __getattr__(name):
    """Lazily resolve the package's public model classes on first access (PEP 562).

    Defers the heavy submodule imports (torch/transformers-backed models)
    until a caller actually asks for them; unknown names raise AttributeError.
    """
    # Names that live together in one submodule share a branch; the
    # submodule's own lazy __getattr__ (or plain attribute) finishes the job.
    if name in ('BertUtteranceModel', 'BertCantoneseUtteranceModel'):
        from . import utterance
        return getattr(utterance, name)
    if name in ('WhisperASRModel', 'WhisperFAModel'):
        from . import whisper
        return getattr(whisper, name)
    if name == 'NemoSpeakerModel':
        from .speaker import NemoSpeakerModel
        return NemoSpeakerModel
    if name == 'ASRAudioFile':
        from .utils import ASRAudioFile
        return ASRAudioFile
    if name == 'resolve':
        from .resolve import resolve
        return resolve
    if name == 'Wave2VecFAModel':
        from .wave2vec import Wave2VecFAModel
        return Wave2VecFAModel
    raise AttributeError(f"module {__name__} has no attribute {name}")
|
|
@@ -187,6 +187,37 @@ class ASRAudioFile:
|
|
|
187
187
|
|
|
188
188
|
return data
|
|
189
189
|
|
|
190
|
+
def hash_chunk(self, begin_ms, end_ms):
|
|
191
|
+
"""Generate a tiny SHA256 hash of a chunk of audio for caching."""
|
|
192
|
+
import hashlib
|
|
193
|
+
data = self.chunk(begin_ms, end_ms)
|
|
194
|
+
num_samples = data.numel()
|
|
195
|
+
|
|
196
|
+
# Tiny fingerprint: 100 samples from the middle + total length
|
|
197
|
+
if num_samples > 100:
|
|
198
|
+
mid = num_samples // 2
|
|
199
|
+
samples = data[mid-50:mid+50]
|
|
200
|
+
else:
|
|
201
|
+
samples = data
|
|
202
|
+
|
|
203
|
+
# Include length to catch simple duration changes
|
|
204
|
+
header = f"{num_samples}|".encode()
|
|
205
|
+
return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
|
|
206
|
+
|
|
207
|
+
def hash_all(self):
|
|
208
|
+
"""Generate a tiny SHA256 hash of the entire audio file."""
|
|
209
|
+
import hashlib
|
|
210
|
+
num_samples = self.tensor.numel()
|
|
211
|
+
|
|
212
|
+
if num_samples > 100:
|
|
213
|
+
mid = num_samples // 2
|
|
214
|
+
samples = self.tensor[mid-50:mid+50]
|
|
215
|
+
else:
|
|
216
|
+
samples = self.tensor
|
|
217
|
+
|
|
218
|
+
header = f"{num_samples}|".encode()
|
|
219
|
+
return hashlib.sha256(header + samples.cpu().numpy().tobytes()).hexdigest()
|
|
220
|
+
|
|
190
221
|
def all(self):
|
|
191
222
|
"""Get the audio in its entirety
|
|
192
223
|
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# from .infer import BertUtteranceModel
|
|
2
|
+
# from .cantonese_infer import BertCantoneseUtteranceModel
|
|
3
|
+
|
|
4
|
+
def __getattr__(name):
    """Lazily import the utterance segmentation models on first access (PEP 562).

    Keeps `import batchalign.models.utterance` cheap by deferring the
    torch/transformers-backed model imports until they are requested.
    """
    if name == 'BertUtteranceModel':
        from . import infer
        return infer.BertUtteranceModel
    if name == 'BertCantoneseUtteranceModel':
        from . import cantonese_infer
        return cantonese_infer.BertCantoneseUtteranceModel
    raise AttributeError(f"module {__name__} has no attribute {name}")
|
|
12
|
+
|
|
13
|
+
|
{batchalignhk-0.8.0.post6 → batchalignhk-0.8.1}/batchalign/models/utterance/cantonese_infer.py
RENAMED
|
@@ -1,46 +1,38 @@
|
|
|
1
1
|
import re
|
|
2
2
|
import string
|
|
3
3
|
import random
|
|
4
|
+
import logging
|
|
4
5
|
|
|
5
|
-
|
|
6
|
-
import nltk
|
|
7
|
-
from nltk import word_tokenize, sent_tokenize
|
|
8
|
-
|
|
9
|
-
# torch
|
|
10
|
-
import torch
|
|
11
|
-
from torch.utils.data import dataset
|
|
12
|
-
from torch.utils.data.dataloader import DataLoader
|
|
13
|
-
from torch.optim import AdamW
|
|
14
|
-
|
|
15
|
-
# import huggingface utils
|
|
16
|
-
from transformers import AutoTokenizer, BertForTokenClassification
|
|
17
|
-
from transformers import DataCollatorForTokenClassification
|
|
18
|
-
|
|
19
|
-
# tqdm
|
|
20
|
-
from tqdm import tqdm
|
|
6
|
+
L = logging.getLogger("batchalign")
|
|
21
7
|
|
|
22
8
|
import logging
|
|
23
9
|
L = logging.getLogger("batchalign")
|
|
24
10
|
|
|
25
11
|
# seed device and tokens
|
|
26
|
-
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
|
27
12
|
|
|
28
13
|
# seed model
|
|
29
14
|
class BertCantoneseUtteranceModel(object):
|
|
30
15
|
|
|
31
16
|
def __init__(self, model):
|
|
17
|
+
import torch
|
|
18
|
+
from transformers import AutoTokenizer, BertForTokenClassification
|
|
19
|
+
|
|
20
|
+
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
|
21
|
+
|
|
32
22
|
# seed tokenizers and model
|
|
33
23
|
self.tokenizer = AutoTokenizer.from_pretrained(model)
|
|
34
|
-
self.model = BertForTokenClassification.from_pretrained(model).to(
|
|
24
|
+
self.model = BertForTokenClassification.from_pretrained(model).to(device)
|
|
25
|
+
self.device = device
|
|
35
26
|
self.max_length = 512
|
|
36
27
|
self.overlap = 20
|
|
37
28
|
|
|
38
29
|
# eval mode
|
|
39
30
|
self.model.eval()
|
|
40
|
-
L.debug(f"Model and tokenizer initialized on device: {
|
|
31
|
+
L.debug(f"Model and tokenizer initialized on device: {device}")
|
|
41
32
|
L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
|
|
42
33
|
|
|
43
34
|
def __call__(self, passage):
|
|
35
|
+
import torch
|
|
44
36
|
# Step 1: Clean up passage
|
|
45
37
|
passage = passage.lower()
|
|
46
38
|
passage = passage.replace('.','')
|
|
@@ -81,11 +73,9 @@ class BertCantoneseUtteranceModel(object):
|
|
|
81
73
|
chunks.append(passage[start:])
|
|
82
74
|
break
|
|
83
75
|
|
|
84
|
-
# Debugging: Print number of chunks and their content
|
|
85
76
|
L.debug(f"Created {len(chunks)} chunks based on keywords.")
|
|
86
77
|
for i, chunk in enumerate(chunks):
|
|
87
|
-
L.debug(f"Chunk {i + 1}: {chunk[:100]}...")
|
|
88
|
-
|
|
78
|
+
L.debug(f"Chunk {i + 1}: {chunk[:100]}...")
|
|
89
79
|
# Step 3: Process each chunk and restore punctuation
|
|
90
80
|
final_passage = []
|
|
91
81
|
for chunk_index, chunk in enumerate(chunks):
|
|
@@ -100,7 +90,7 @@ class BertCantoneseUtteranceModel(object):
|
|
|
100
90
|
truncation=True,
|
|
101
91
|
padding=True,
|
|
102
92
|
max_length=self.max_length,
|
|
103
|
-
is_split_into_words=True).to(
|
|
93
|
+
is_split_into_words=True).to(self.device)
|
|
104
94
|
|
|
105
95
|
try:
|
|
106
96
|
# Pass it through the model
|
|
@@ -155,7 +145,7 @@ class BertCantoneseUtteranceModel(object):
|
|
|
155
145
|
# Step 4: Join processed chunks together into the final passage
|
|
156
146
|
final_passage = ' '.join(final_passage)
|
|
157
147
|
|
|
158
|
-
L.
|
|
148
|
+
L.debug("Text processing completed. Generating final output...")
|
|
159
149
|
|
|
160
150
|
# Optionally, tokenize the final text into sentences based on punctuation
|
|
161
151
|
def custom_sent_tokenize(text):
|
|
@@ -166,32 +156,28 @@ class BertCantoneseUtteranceModel(object):
|
|
|
166
156
|
# Split the passage based on punctuation marks and keep them
|
|
167
157
|
parts = re.split(sentence_endings, text)
|
|
168
158
|
|
|
169
|
-
# Debug: Output the parts after splitting
|
|
170
159
|
L.debug(f"Parts after splitting: {parts}")
|
|
171
160
|
|
|
172
161
|
# Combine parts and punctuation together
|
|
173
162
|
for i in range(0, len(parts) - 1, 2):
|
|
174
163
|
sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
|
|
175
|
-
L.debug(f"Sentence formed: {sentence}") # Debug: Output the current sentence
|
|
176
164
|
|
|
165
|
+
L.debug(f"Sentence formed: {sentence}")
|
|
177
166
|
if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
|
|
178
167
|
split_passage.append(sentence)
|
|
179
168
|
|
|
180
169
|
# If the last part doesn't have punctuation, we handle it here
|
|
181
170
|
if len(parts) % 2 != 0: # If there's no punctuation at the end
|
|
182
171
|
last_part = parts[-1].strip()
|
|
183
|
-
L.debug(f"Last part without punctuation: {last_part}")
|
|
184
|
-
|
|
172
|
+
L.debug(f"Last part without punctuation: {last_part}")
|
|
173
|
+
|
|
185
174
|
if last_part: # Only add non-empty sentences
|
|
186
175
|
split_passage.append(last_part)
|
|
187
|
-
|
|
188
|
-
# Final output
|
|
189
176
|
L.debug(f"Final split passage: {split_passage}")
|
|
190
177
|
return split_passage
|
|
191
178
|
|
|
192
179
|
split_passage = custom_sent_tokenize(final_passage)
|
|
193
180
|
|
|
194
|
-
# Debugging: Output the sentences after splitting
|
|
195
181
|
L.debug(f"Final sentences: {split_passage}")
|
|
196
182
|
|
|
197
183
|
return split_passage
|