BatchalignHK 0.7.23.post1__tar.gz → 0.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/BatchalignHK.egg-info/PKG-INFO +1 -1
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/PKG-INFO +1 -1
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/cli/cli.py +22 -21
- batchalignhk-0.8.0/batchalign/cli/dispatch.py +390 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/chat/generator.py +2 -1
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/ud.py +115 -81
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/translate/gtrans.py +1 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utterance/ud_utterance.py +1 -1
- batchalignhk-0.8.0/batchalign/utils/dp.py +225 -0
- batchalignhk-0.8.0/batchalign/version +3 -0
- batchalignhk-0.7.23.post1/batchalign/cli/dispatch.py +0 -223
- batchalignhk-0.7.23.post1/batchalign/utils/dp.py +0 -225
- batchalignhk-0.7.23.post1/batchalign/version +0 -3
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/BatchalignHK.egg-info/SOURCES.txt +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/BatchalignHK.egg-info/dependency_links.txt +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/BatchalignHK.egg-info/entry_points.txt +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/BatchalignHK.egg-info/requires.txt +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/BatchalignHK.egg-info/top_level.txt +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/LICENSE +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/MANIFEST.in +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/README.md +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/__main__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/cli/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/constants.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/document.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/errors.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/core.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/exception.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/logging.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/realtime_meeting.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/speech_recognizer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/speech_synthesizer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/speech_transcriber.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/stream_input_tts.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/token.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/util.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/version.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_abnf.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_app.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_cookiejar.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_core.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_exceptions.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_handshake.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_http.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_logging.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_socket.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_ssl_compat.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_url.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/_utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/echo-server.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/test_abnf.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/test_app.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/test_cookiejar.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/test_http.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/test_url.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/extern/nls/websocket/tests/test_websocket.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/base.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/chat/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/chat/file.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/chat/lexer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/chat/parser.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/chat/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/textgrid/file.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/resolve.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/speaker/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/speaker/config.yaml +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/speaker/infer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/speaker/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/training/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/training/run.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/training/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/cantonese_infer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/dataset.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/execute.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/infer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/prep.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/utterance/train.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/whisper/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/aliyun.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/funaudio.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/deu.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/ell.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/eng.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/eus.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/fra.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/hrv.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/ind.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/jpn.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/nld.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/por.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/spa.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/num2lang/tha.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/oai_whisper.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/tencent.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/avqi/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/avqi/engine.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/base.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/diarization/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/diarization/pyannote.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/dispatch.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/fa/iic_fa.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/fa/wave2vec_fa_canto.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/opensmile/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/opensmile/engine.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/pipeline.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/translate/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/translate/seamless.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/translate/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utr/funaudio_utr.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utr/tencent_utr.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/conftest.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/tests/test_document.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/utils/__init__.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/utils/abbrev.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/utils/compounds.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/utils/config.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/utils/names.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/batchalign/utils/utils.py +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/setup.cfg +0 -0
- {batchalignhk-0.7.23.post1 → batchalignhk-0.8.0}/setup.py +0 -0
|
@@ -3,37 +3,23 @@ cli.py
|
|
|
3
3
|
The Batchalign command-line interface
|
|
4
4
|
"""
|
|
5
5
|
|
|
6
|
-
import multiprocessing
|
|
7
6
|
import rich_click as click
|
|
8
7
|
import functools
|
|
9
8
|
|
|
10
9
|
import os
|
|
11
|
-
from glob import glob
|
|
12
10
|
|
|
13
|
-
from multiprocessing import
|
|
14
|
-
|
|
15
|
-
from batchalign.pipelines import BatchalignPipeline
|
|
11
|
+
from multiprocessing import freeze_support
|
|
16
12
|
|
|
13
|
+
from pathlib import Path
|
|
17
14
|
from rich.traceback import install
|
|
18
15
|
from rich.console import Console
|
|
19
|
-
from rich.panel import Panel
|
|
20
|
-
from pathlib import Path
|
|
21
|
-
from batchalign.document import *
|
|
22
|
-
from batchalign.formats.chat import CHATFile
|
|
23
|
-
from batchalign.utils import config
|
|
24
16
|
from rich.logging import RichHandler
|
|
25
17
|
|
|
26
18
|
from batchalign.cli.dispatch import _dispatch
|
|
27
19
|
from batchalign.models.training.run import cli as train
|
|
28
20
|
|
|
29
|
-
from enum import Enum
|
|
30
|
-
|
|
31
|
-
import traceback
|
|
32
|
-
|
|
33
21
|
import pyfiglet
|
|
34
|
-
|
|
35
|
-
import logging as L
|
|
36
|
-
baL = L.getLogger('batchalign')
|
|
22
|
+
import logging as L
|
|
37
23
|
|
|
38
24
|
C = Console()
|
|
39
25
|
|
|
@@ -62,7 +48,7 @@ def handle_verbosity(verbosity):
|
|
|
62
48
|
L.getLogger('stanza').handlers.clear()
|
|
63
49
|
L.getLogger('transformers').handlers.clear()
|
|
64
50
|
L.getLogger('nemo_logger').handlers.clear()
|
|
65
|
-
L.getLogger("stanza").setLevel(L.
|
|
51
|
+
L.getLogger("stanza").setLevel(L.WARN)
|
|
66
52
|
L.getLogger('nemo_logger').setLevel(L.CRITICAL)
|
|
67
53
|
L.getLogger('batchalign').setLevel(L.WARN)
|
|
68
54
|
L.getLogger('lightning.pytorch.utilities.migration.utils').setLevel(L.ERROR)
|
|
@@ -73,6 +59,7 @@ def handle_verbosity(verbosity):
|
|
|
73
59
|
L.getLogger('batchalign').setLevel(L.INFO)
|
|
74
60
|
if verbosity >= 3:
|
|
75
61
|
L.getLogger('batchalign').setLevel(L.DEBUG)
|
|
62
|
+
L.getLogger("stanza").setLevel(L.INFO)
|
|
76
63
|
if verbosity >= 4:
|
|
77
64
|
L.getLogger('batchalign').setLevel(L.DEBUG)
|
|
78
65
|
L.getLogger('transformers').setLevel(L.INFO)
|
|
@@ -81,7 +68,8 @@ def handle_verbosity(verbosity):
|
|
|
81
68
|
@click.pass_context
|
|
82
69
|
@click.version_option(VERSION_NUMBER)
|
|
83
70
|
@click.option("-v", "--verbose", type=int, count=True, default=0, help="How loquacious Batchalign should be.")
|
|
84
|
-
|
|
71
|
+
@click.option("--workers", type=int, default=os.cpu_count(), help="Number of worker processes to use.")
|
|
72
|
+
def batchalign(ctx, verbose, workers):
|
|
85
73
|
"""process .cha and/or audio files in IN_DIR and dumps them to OUT_DIR using recipe COMMAND"""
|
|
86
74
|
|
|
87
75
|
## setup commands ##
|
|
@@ -93,7 +81,9 @@ def batchalign(ctx, verbose):
|
|
|
93
81
|
handle_verbosity(verbose)
|
|
94
82
|
# add to arguments
|
|
95
83
|
ctx.obj["verbose"] = verbose
|
|
84
|
+
ctx.obj["workers"] = workers
|
|
96
85
|
# setup config
|
|
86
|
+
from batchalign.utils import config
|
|
97
87
|
ctx.obj["config"] = config.config_read(True)
|
|
98
88
|
# make everything look better
|
|
99
89
|
# pretty.install()
|
|
@@ -122,6 +112,7 @@ batchalign.add_command(train, "models")
|
|
|
122
112
|
@click.pass_context
|
|
123
113
|
def align(ctx, in_dir, out_dir, whisper, wav2vec, iic, wav2vec_yue, tencent, funaudio, **kwargs):
|
|
124
114
|
"""Align transcripts against corresponding media files."""
|
|
115
|
+
from batchalign.formats.chat import CHATFile
|
|
125
116
|
def loader(file):
|
|
126
117
|
return (
|
|
127
118
|
CHATFile(path=os.path.abspath(file)).doc,
|
|
@@ -180,6 +171,8 @@ def align(ctx, in_dir, out_dir, whisper, wav2vec, iic, wav2vec_yue, tencent, fun
|
|
|
180
171
|
@click.pass_context
|
|
181
172
|
def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
182
173
|
"""Create a transcript from audio files."""
|
|
174
|
+
from batchalign.document import CustomLine, CustomLineType
|
|
175
|
+
from batchalign.formats.chat import CHATFile
|
|
183
176
|
def loader(file):
|
|
184
177
|
return file
|
|
185
178
|
|
|
@@ -229,6 +222,7 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
|
229
222
|
@click.pass_context
|
|
230
223
|
def translate(ctx, in_dir, out_dir, **kwargs):
|
|
231
224
|
"""Translate the transcript to English."""
|
|
225
|
+
from batchalign.formats.chat import CHATFile
|
|
232
226
|
|
|
233
227
|
def loader(file):
|
|
234
228
|
cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
|
|
@@ -259,6 +253,7 @@ def translate(ctx, in_dir, out_dir, **kwargs):
|
|
|
259
253
|
@click.pass_context
|
|
260
254
|
def morphotag(ctx, in_dir, out_dir, **kwargs):
|
|
261
255
|
"""Perform morphosyntactic analysis on transcripts."""
|
|
256
|
+
from batchalign.formats.chat import CHATFile
|
|
262
257
|
|
|
263
258
|
def loader(file):
|
|
264
259
|
mwt = {}
|
|
@@ -285,7 +280,7 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
|
|
|
285
280
|
|
|
286
281
|
_dispatch("morphotag", "eng", 1, ["cha"], ctx,
|
|
287
282
|
in_dir, out_dir,
|
|
288
|
-
loader, writer, C)
|
|
283
|
+
loader, writer, C, **kwargs)
|
|
289
284
|
|
|
290
285
|
|
|
291
286
|
#################### MORPHOTAG ################################
|
|
@@ -295,6 +290,7 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
|
|
|
295
290
|
@click.pass_context
|
|
296
291
|
def coref(ctx, in_dir, out_dir, **kwargs):
|
|
297
292
|
"""Perform coreference analysis on transcripts."""
|
|
293
|
+
from batchalign.formats.chat import CHATFile
|
|
298
294
|
|
|
299
295
|
def loader(file):
|
|
300
296
|
cf = CHATFile(path=os.path.abspath(file))
|
|
@@ -322,6 +318,7 @@ def coref(ctx, in_dir, out_dir, **kwargs):
|
|
|
322
318
|
@click.pass_context
|
|
323
319
|
def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
324
320
|
"""Perform morphosyntactic analysis on transcripts."""
|
|
321
|
+
from batchalign.formats.chat import CHATFile
|
|
325
322
|
|
|
326
323
|
def loader(file):
|
|
327
324
|
return CHATFile(path=os.path.abspath(file)).doc
|
|
@@ -354,6 +351,7 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
|
|
|
354
351
|
@click.pass_context
|
|
355
352
|
def benchmark(ctx, in_dir, out_dir, lang, num_speakers, whisper, tencent, funaudio, whisper_oai, **kwargs):
|
|
356
353
|
"""Benchmark ASR utilities for their word accuracy"""
|
|
354
|
+
from batchalign.formats.chat import CHATFile
|
|
357
355
|
def loader(file):
|
|
358
356
|
# try to find a .cha in the same directory
|
|
359
357
|
p = Path(file)
|
|
@@ -397,6 +395,7 @@ def avqi(ctx, input_dir, output_dir, lang, **kwargs):
|
|
|
397
395
|
"""Calculate AVQI from paired .cs and .sv audio files in input directory."""
|
|
398
396
|
|
|
399
397
|
from batchalign.pipelines.avqi import AVQIEngine
|
|
398
|
+
from batchalign.document import Document
|
|
400
399
|
from pathlib import Path
|
|
401
400
|
import os
|
|
402
401
|
|
|
@@ -464,6 +463,7 @@ def avqi(ctx, input_dir, output_dir, lang, **kwargs):
|
|
|
464
463
|
@click.pass_context
|
|
465
464
|
def opensmile(ctx, input_dir, output_dir, feature_set, lang, **kwargs):
|
|
466
465
|
"""Extract openSMILE audio features from speech samples."""
|
|
466
|
+
from batchalign.document import Document
|
|
467
467
|
|
|
468
468
|
def loader(file):
|
|
469
469
|
doc = Document.new(media_path=file, lang=lang)
|
|
@@ -491,6 +491,7 @@ def opensmile(ctx, input_dir, output_dir, feature_set, lang, **kwargs):
|
|
|
491
491
|
def setup(ctx):
|
|
492
492
|
"""Reconfigure Batchalign settings, such as Rev.AI key."""
|
|
493
493
|
|
|
494
|
+
from batchalign.utils import config
|
|
494
495
|
config.interactive_setup()
|
|
495
496
|
|
|
496
497
|
#################### VERSION ################################
|
|
@@ -503,5 +504,5 @@ def version(ctx, **kwargs):
|
|
|
503
504
|
ptr = (pyfiglet.figlet_format("Batchalign2")+"\n" +
|
|
504
505
|
f"Version: [bold]{VERSION_NUMBER.strip()}[/bold], released {RELEASE_DATE.strip()}\n" +
|
|
505
506
|
f"[italic]{RELEASE_NOTES.strip()}[/italic]"+"\n" +
|
|
506
|
-
"\nDeveloped by Brian MacWhinney and Houjun Liu")
|
|
507
|
+
"\nDeveloped by Brian MacWhinney and Houjun Liu\ncontributions from Sebastian Song and Franklin Chen")
|
|
507
508
|
C.print("\n\n"+ptr+"\n\n")
|
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
"""
|
|
2
|
+
dispatch.py
|
|
3
|
+
CLI runner dispatch. Essentially the translation layer between `command` in CLI
|
|
4
|
+
and actual BatchalignPipeline.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, TimeElapsedColumn, BarColumn
|
|
8
|
+
from urllib.parse import urlparse
|
|
9
|
+
|
|
10
|
+
import warnings
|
|
11
|
+
|
|
12
|
+
import shutil
|
|
13
|
+
import os
|
|
14
|
+
import glob
|
|
15
|
+
import queue
|
|
16
|
+
|
|
17
|
+
from rich.console import Console
|
|
18
|
+
from rich.markup import escape
|
|
19
|
+
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import concurrent.futures
|
|
23
|
+
import multiprocessing
|
|
24
|
+
from functools import partial
|
|
25
|
+
|
|
26
|
+
# Oneliner of directory-based glob and replace
|
|
27
|
+
def globase(path, statement):
    """Return the paths inside directory ``path`` that match glob pattern ``statement``.

    The module is imported as ``import glob``, so the function has to be
    reached as ``glob.glob``; calling the module object itself raises
    ``TypeError``.
    """
    return glob.glob(os.path.join(path, statement))
|
|
28
|
+
def repath_file(file_path, new_dir):
    """Return ``file_path`` relocated into ``new_dir``, keeping only its final path component."""
    leaf = Path(file_path).name
    return os.path.join(new_dir, leaf)
|
|
29
|
+
|
|
30
|
+
import tempfile
|
|
31
|
+
import time
|
|
32
|
+
|
|
33
|
+
import traceback
|
|
34
|
+
import logging as L
|
|
35
|
+
baL = L.getLogger('batchalign')
|
|
36
|
+
|
|
37
|
+
warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
|
|
38
|
+
|
|
39
|
+
# Global cache for the pipeline in worker processes
|
|
40
|
+
_worker_pipeline = None
|
|
41
|
+
|
|
42
|
+
def _get_worker_pipeline(command, lang, num_speakers, **kwargs):
    """Return this process's cached pipeline, building it on first use.

    The pipeline is stored in the module-level ``_worker_pipeline`` global.
    Once it has been created, later calls return the cached object and do not
    look at their arguments again.
    """
    global _worker_pipeline

    # Fast path: a pipeline was already built in this process.
    if _worker_pipeline is not None:
        return _worker_pipeline

    # Imported here rather than at module import time.
    from batchalign.pipelines import BatchalignPipeline

    task_spec = Cmd2Task[command]
    _worker_pipeline = BatchalignPipeline.new(
        task_spec, lang=lang, num_speakers=num_speakers, **kwargs
    )
    return _worker_pipeline
|
|
49
|
+
|
|
50
|
+
def _read_captured_output(log_file):
    """Flush Python's stdout/stderr buffers and return everything in ``log_file``."""
    import sys

    # Flush while FDs 1/2 still point at the temp file so pending text is
    # captured rather than emitted after the descriptors are restored.
    sys.stdout.flush()
    sys.stderr.flush()
    log_file.seek(0)
    return log_file.read()


def _worker_task(file_info, command, lang, num_speakers, loader_info, writer_info, progress_queue=None, **kwargs):
    """Process one file in a worker process, capturing everything it prints.

    Parameters
    ----------
    file_info : tuple
        ``(file, output)`` pair: the input path and the path to write to.
    command : str
        CLI command name; ``"morphotag"`` and ``"align"`` have dedicated
        load/write logic, anything else goes through ``loader_info`` /
        ``writer_info``.
    lang, num_speakers
        Forwarded to ``_get_worker_pipeline``.
    loader_info, writer_info : callable
        Used only for commands without dedicated logic: ``loader_info(path)``
        returns a document or a ``(document, kwargs)`` tuple, and
        ``writer_info(doc, output)`` persists the result.
    progress_queue : optional
        If given, ``(file, completed, total, tasks)`` tuples are ``put`` on it
        as the pipeline reports progress.
    **kwargs
        Forwarded to ``_get_worker_pipeline``; selected keys are also passed
        to the pipeline call (see the per-command branches).

    Returns
    -------
    tuple
        ``(file, None, None, captured)`` on success, or
        ``(file, traceback_text, exception, captured)`` if an ``Exception``
        was raised. ``captured`` is the text written to stdout/stderr while
        the task ran.
    """
    import sys
    import os
    import tempfile

    file, output = file_info

    # Capture ALL output at the file-descriptor level so that output from
    # concurrent workers cannot interleave. The encoding is pinned with
    # errors="replace" so that reading the capture back can never raise a
    # decode error (the locale default may not match what was written).
    with tempfile.TemporaryFile(mode='w+', encoding='utf-8', errors='replace') as log_file:
        stdout_fd = sys.stdout.fileno()
        stderr_fd = sys.stderr.fileno()
        old_stdout_fd = os.dup(stdout_fd)
        old_stderr_fd = os.dup(stderr_fd)

        try:
            # Redirect FD 1 and 2 to our temp file
            os.dup2(log_file.fileno(), stdout_fd)
            os.dup2(log_file.fileno(), stderr_fd)

            pipeline = _get_worker_pipeline(command, lang, num_speakers, **kwargs)

            def progress_callback(completed, total, tasks):
                if progress_queue is None:
                    return
                try:
                    progress_queue.put((file, completed, total, tasks))
                except Exception:
                    # Progress reporting is best-effort; a broken queue must
                    # not abort the actual processing.
                    pass

            from batchalign.formats.chat import CHATFile

            # Morphosyntax specific loader/writer logic lives here for picklability
            if command == "morphotag":
                # Pull the morphotag-specific arguments out of kwargs.
                # NOTE(review): these keys were also forwarded to
                # _get_worker_pipeline above, before being popped here.
                mwt = kwargs.pop("mwt", {})
                retokenize = kwargs.pop("retokenize", False)
                skipmultilang = kwargs.pop("skipmultilang", False)

                cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
                doc = cf.doc
                if str(cf).count("%mor") > 0:
                    doc.ba_special_["special_mor_notation"] = True

                pipeline_kwargs = {
                    "retokenize": retokenize,
                    "skipmultilang": skipmultilang,
                    "mwt": mwt,
                }
                # Any remaining kwargs are passed through to the pipeline call.
                pipeline_kwargs.update(kwargs)

                doc = pipeline(doc, callback=progress_callback, **pipeline_kwargs)

                CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)

            elif command == "align":
                cf = CHATFile(path=os.path.abspath(file))
                doc = cf.doc
                kw = {"pauses": kwargs.get("pauses", False)}
                doc = pipeline(doc, callback=progress_callback, **kw)
                CHATFile(doc=doc).write(output, write_wor=kwargs.get("wor", True))

            else:
                loader, writer = loader_info, writer_info
                doc = loader(os.path.abspath(file))
                kw = {}
                # A loader may return (document, extra pipeline kwargs).
                if isinstance(doc, tuple) and len(doc) > 1:
                    doc, kw = doc
                doc = pipeline(doc, callback=progress_callback, **kw)
                writer(doc, output)

            return file, None, None, _read_captured_output(log_file)
        except Exception as e:
            captured = _read_captured_output(log_file)
            return file, traceback.format_exc(), e, captured
        finally:
            # Restore original FDs
            os.dup2(old_stdout_fd, stdout_fd)
            os.dup2(old_stderr_fd, stderr_fd)
            os.close(old_stdout_fd)
            os.close(old_stderr_fd)
|
|
147
|
+
|
|
148
|
+
# Lookup table: CLI command name -> the BatchalignPipeline task string
# (comma-separated when several tasks are chained) that the command runs.
Cmd2Task = dict(
    align="fa",
    transcribe="asr",
    transcribe_s="asr,speaker",
    morphotag="morphosyntax",
    benchmark="asr,eval",
    utseg="utterance",
    coref="coref",
    translate="translate",
    opensmile="opensmile",
)
|
|
161
|
+
|
|
162
|
+
# this is the main runner used by all functions
|
|
163
|
+
def _dispatch(command, lang, num_speakers,
              extensions, ctx, in_dir, out_dir,
              loader:callable, writer:callable, console,
              **kwargs):
    """Run `command` over the eligible files under `in_dir` with a process pool.

    The input tree is mirrored into `out_dir`: files whose (lower-cased)
    extension is in `extensions` are queued for `_worker_task`, everything
    else is copied verbatim. Progress is rendered with a rich `Progress`
    display and a per-file error / captured-output summary is printed at the
    end.

    Args:
        command: CLI command name, forwarded to every worker.
        lang: forwarded to every worker.
        num_speakers: forwarded to every worker.
        extensions: collection of lower-case extensions (no dot) to process.
        ctx: object whose `obj` mapping provides "verbose" and, optionally,
            "workers".
        in_dir: directory that is walked recursively for inputs.
        out_dir: directory receiving outputs and verbatim copies.
        loader: accepted for interface compatibility; workers are given
            `loader_info=None` rather than this callable.
        writer: accepted for interface compatibility; workers are given
            `writer_info=None` rather than this callable.
        console: rich console used for output unless verbosity is above 1.
        **kwargs: forwarded unchanged to `_worker_task`. "data" (path to a
            text file listing one URL per line) and "num_workers" are also
            read here.

    Raises:
        ValueError: an input needs audio conversion but ffmpeg is not on PATH.
    """
    C = console

    files, outputs = _gather_dispatch_inputs(in_dir, out_dir, extensions,
                                             kwargs.get("data"))

    # At verbosity > 1 the regular console output is sent to a scratch file;
    # tracebacks are still printed through a fresh Console() in the report.
    scratch_log = None
    if ctx.obj["verbose"] > 1:
        scratch_log = tempfile.NamedTemporaryFile(delete=True, mode='w')
        C = Console(file=scratch_log)

    try:
        # process largest inputs first to avoid late stragglers
        file_pairs = list(zip(files, outputs))
        file_pairs.sort(key=lambda fo: _dispatch_input_size(fo[0]), reverse=True)
        files, outputs = zip(*file_pairs) if file_pairs else ([], [])

        C.print(f"\nMode: [blue]{command}[/blue]; got [bold cyan]{len(files)}[/bold cyan] transcript{'s' if len(files) != 1 else ''} to process from {in_dir}:\n")

        # Determine number of workers
        num_workers = kwargs.get("num_workers", ctx.obj.get("workers", os.cpu_count()))
        if num_workers is None:
            # os.cpu_count() may return None; min(None, 2) below would raise.
            num_workers = os.cpu_count() or 1

        # Pre-download stanza resources if needed to avoid interleaved downloads in workers
        if command in ["morphotag", "utseg", "coref"]:
            try:
                import stanza
                stanza.download_resources_json()
            except Exception:
                # Deliberately best-effort: a failure here is ignored and the
                # workers are started regardless.
                pass

        # For some commands or environments, we might want to limit this
        if command in ["transcribe", "transcribe_s"]:
            num_workers = min(num_workers, 2)  # GPU memory limits

        C.print(f"Using [bold]{num_workers}[/bold] worker processes.\n")

        errors = _run_dispatch_pool(C, files, outputs, num_workers,
                                    ctx.obj["verbose"],
                                    command, lang, num_speakers, kwargs)

        _report_dispatch_results(C, errors, ctx.obj["verbose"], in_dir, out_dir)

        if ctx.obj["verbose"] > 1:
            C.end_capture()
    finally:
        # Close the scratch file on every exit path, not only on success.
        if scratch_log is not None:
            scratch_log.close()


def _dispatch_entry_label(entry):
    """Return a work item as a plain string (local path, or URL for parsed URLs)."""
    return entry if isinstance(entry, str) else entry.geturl()


def _dispatch_input_size(entry):
    """Return the on-disk size of a local input, or 0 for URLs / missing files."""
    if not isinstance(entry, str):
        return 0
    try:
        return os.path.getsize(entry)
    except OSError:
        return 0


def _gather_dispatch_inputs(in_dir, out_dir, extensions, manifest=None):
    """Collect work items and mirror non-processed files into `out_dir`.

    Returns a `(files, outputs)` pair of equally long lists. `files` holds
    local path strings and, for entries read from `manifest`, `urlparse`
    results; `outputs` holds the matching output paths.
    """
    from batchalign.constants import FORCED_CONVERSION

    files = []
    outputs = []

    # Optional manifest: one URL per non-blank line.
    if manifest:
        with open(manifest.strip()) as handle:
            entries = [line.strip() for line in handle if line.strip() != ""]
        for entry in entries:
            url = urlparse(entry)
            if url.scheme == "":
                url = url._replace(scheme="http")
            files.append(url)
            outputs.append(os.path.join(out_dir, os.path.basename(url.path)))

    for basedir, _, names in os.walk(in_dir):
        for name in names:
            path = Path(os.path.join(basedir, name))
            ext = path.suffix.strip(".").strip().lower()

            # calculate input path, convert if needed
            inp_path = str(path)
            if ext in FORCED_CONVERSION:
                # check for ffmpeg
                if not shutil.which("ffmpeg"):
                    raise ValueError(f"ffmpeg not found in Path! Cannot load input media at {inp_path}.\nHint: Please convert your input audio sample to .wav before proceeding with Batchalign, or install ffmpeg (https://ffmpeg.org/download.html)")
                # convert
                from pydub import AudioSegment
                # Swap only the real suffix. A textual replace of the
                # lower-cased extension is a no-op for e.g. "x.MP4", which
                # made the export overwrite the source file itself.
                wav_path = str(path.with_suffix(".wav"))
                seg = AudioSegment.from_file(inp_path, ext)
                seg.export(wav_path, format="wav")
                inp_path = wav_path

            # repath the file to the output
            rel = os.path.relpath(inp_path, in_dir)
            repathed = Path(os.path.join(out_dir, rel))
            # make the repathed dir, if it doesn't exist
            os.makedirs(repathed.parent.absolute(), exist_ok=True)

            # HACK check for @Options:\tdummy in the file
            # and simply copy it
            if ext == "cha":
                with open(inp_path, 'r', encoding="utf-8") as df:
                    contents = df.read()
                if "@Options:\tdummy" in contents:
                    shutil.copy2(inp_path, str(repathed))
                    continue
                elif "This is a dummy file to permit playback from the TalkBank browser" in contents:
                    shutil.copy2(inp_path, str(repathed))
                    continue

            # if the file needs to get processed, append it to the list
            # to be processed and compute the output
            if ext in extensions:
                # A manifest URL with the same stem as this local file is a
                # duplicate: drop the URL entry and keep the local file.
                for indx, queued in enumerate(files):
                    if (not isinstance(queued, str) and
                            Path(queued.geturl()).stem == Path(inp_path).stem):
                        files.pop(indx)
                        outputs.pop(indx)
                        break  # safe: iteration stops right after the pop

                files.append(inp_path)
                outputs.append(str(repathed))
            # otherwise just copy the file
            else:
                shutil.copy2(inp_path, str(repathed))

    return files, outputs


def _run_dispatch_pool(console, files, outputs, num_workers, verbose,
                       command, lang, num_speakers, worker_kwargs):
    """Run `_worker_task` for every (file, output) pair and collect results.

    Returns a list of `(file, traceback_text, exception_or_None, captured)`
    tuples: one per failed file and, when `verbose >= 1`, one per successful
    file whose captured worker output is non-blank.
    """
    from batchalign.document import TaskFriendlyName

    def render_stage(stage_tasks):
        # Human-readable label for the stage(s) a worker reports.
        if not stage_tasks:
            return "Processing..."
        if not isinstance(stage_tasks, (list, tuple)):
            stage_tasks = [stage_tasks]
        return ", ".join(TaskFriendlyName.get(task, str(task)) for task in stage_tasks)

    # The manager (and its queue) is only created when there is work to do.
    manager = multiprocessing.Manager() if files else None
    progress_queue = manager.Queue() if manager else None
    errors = []

    try:
        # create the spinner
        prog = Progress(SpinnerColumn(), *Progress.get_default_columns()[:-1],
                        TimeElapsedColumn(),
                        TextColumn("[cyan]{task.fields[processor]}[/cyan]"), console=console)

        with prog:
            tasks = {}
            task_totals = {}

            for f in files:
                tasks[f] = prog.add_task(Path(_dispatch_entry_label(f)).name,
                                         start=False, total=1, processor="Waiting...")
                task_totals[f] = 1

            def drain_progress_queue():
                # Apply every progress message currently queued by the workers.
                if progress_queue is None:
                    return
                while True:
                    try:
                        file, completed, total, stage_tasks = progress_queue.get_nowait()
                    except Exception:
                        # Empty queue or any other queue failure ends this
                        # drain pass; progress display is best-effort.
                        break
                    if file not in tasks:
                        continue
                    task_total = max(int(total) if total else task_totals.get(file, 1), 1)
                    task_totals[file] = task_total
                    prog.update(tasks[file],
                                total=task_total,
                                completed=min(int(completed), task_total),
                                processor=render_stage(stage_tasks))

            def finish(file, markup):
                # Fill the bar for `file` and show its final status text.
                final_total = max(task_totals.get(file, 1), 1)
                prog.update(tasks[file], total=final_total, completed=final_total, processor=markup)

            with concurrent.futures.ProcessPoolExecutor(max_workers=num_workers) as executor:
                worker_func = partial(_worker_task,
                                      command=command,
                                      lang=lang,
                                      num_speakers=num_speakers,
                                      loader_info=None,
                                      writer_info=None,
                                      progress_queue=progress_queue,
                                      **worker_kwargs)

                # Each worker receives the (input, output) pair as one argument.
                future_to_file = {executor.submit(worker_func, (f, o)): f
                                  for f, o in zip(files, outputs)}

                for f in files:
                    prog.start_task(tasks[f])
                    prog.update(tasks[f], processor="Processing...")

                pending = set(future_to_file.keys())
                while pending:
                    # Short timeout so the progress queue is polled regularly
                    # even while no future completes.
                    done, pending = concurrent.futures.wait(
                        pending,
                        timeout=0.1,
                        return_when=concurrent.futures.FIRST_COMPLETED,
                    )
                    drain_progress_queue()

                    for future in done:
                        file = future_to_file[future]
                        try:
                            res_file, trcbk, err, captured = future.result()
                            if err:
                                finish(file, "[bold red]FAIL[/bold red]")
                                errors.append((res_file, trcbk, err, captured))
                            else:
                                finish(file, "[bold green]DONE[/bold green]")
                                if verbose >= 1 and captured.strip():
                                    errors.append((res_file, "Logs only (Success)", None, captured))
                        except Exception as exc:
                            finish(file, "[bold red]FAIL[/bold red]")
                            errors.append((file, traceback.format_exc(), exc, ""))

                drain_progress_queue()
    finally:
        if manager:
            manager.shutdown()

    return errors


def _report_dispatch_results(console, errors, verbose, in_dir, out_dir):
    """Print the per-file error / captured-output summary, or the success line."""
    if len(errors) == 0:
        console.print(f"\nAll done. Results saved to {out_dir}!\n")
        return

    console.print()
    for file, trcbk, err, captured in errors:
        if isinstance(file, str):
            rel_path = os.path.relpath(str(Path(file).absolute()), in_dir)
        else:
            # Parsed URL entries have no location relative to in_dir.
            rel_path = file.geturl()

        if err:
            console.print(f"[bold red]ERROR[/bold red] on file [italic]{rel_path}[/italic]: {escape(str(err))}\n")
            if captured.strip():
                console.print(f"[dim]Captured Worker Output:[/dim]\n{escape(captured.strip())}\n")
            if verbose == 1:
                console.print(escape(str(trcbk)))
            elif verbose > 1:
                # `console` writes to the scratch file at this verbosity, so
                # the traceback goes through a fresh default Console instead.
                Console().print(escape(str(trcbk)))
        elif captured.strip():
            console.print(f"[bold blue]INFO[/bold blue] on file [italic]{rel_path}[/italic]:\n")
            console.print(f"{escape(captured.strip())}\n")
|
|
@@ -42,10 +42,11 @@ def generate_chat_utterance(utterance: Utterance, special_mor=False, write_wor=T
|
|
|
42
42
|
main_line = re.sub(r"(?:[a-z]) ?\(([a-z]+) ?\)", r"(\1)", main_line)
|
|
43
43
|
main_line = re.sub(r"([a-z]) _", r"\1_", main_line)
|
|
44
44
|
main_line = re.sub(r" ", r" ", main_line)
|
|
45
|
+
main_line = re.sub(r"^,", "", main_line.strip()) # remove initial commas
|
|
45
46
|
main_line = re.sub(r"«", "“", main_line)
|
|
46
47
|
main_line = re.sub(r"»", "”", main_line)
|
|
47
48
|
main_line = re.sub(r"—", "-", main_line)
|
|
48
|
-
main_line = re.sub(r"–", "-", main_line)
|
|
49
|
+
main_line = re.sub(r"–", "-", main_line).strip()
|
|
49
50
|
tier = utterance.tier
|
|
50
51
|
|
|
51
52
|
mors = []
|