BatchalignHK 0.7.17.post17__tar.gz → 0.7.17.post19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/PKG-INFO +2 -3
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/PKG-INFO +2 -3
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/cli/dispatch.py +10 -6
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/cantonese_infer.py +15 -12
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/ud.py +4 -4
- batchalignhk-0.7.17.post19/batchalign/version +3 -0
- batchalignhk-0.7.17.post17/batchalign/version +0 -3
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/SOURCES.txt +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/dependency_links.txt +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/entry_points.txt +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/requires.txt +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/top_level.txt +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/LICENSE +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/MANIFEST.in +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/README.md +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/__main__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/cli/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/cli/cli.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/constants.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/document.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/errors.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/base.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/file.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/generator.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/lexer.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/parser.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/file.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/generator.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/parser.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/resolve.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/config.yaml +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/infer.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/run.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/dataset.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/execute.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/infer.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/prep.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/train.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/wave2vec/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/wave2vec/infer_fa.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/infer_asr.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/infer_fa.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/analysis/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/analysis/eval.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/num2chinese.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/rev.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/tencent.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/whisper.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/whisperx.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/base.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/cleanup.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/parse_support.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/retrace.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/support/test.test +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/dispatch.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/whisper_fa.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/coref.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/pipeline.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/speaker/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/translate/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/translate/seamless.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/rev_utr.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/whisper_utr.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utterance/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/conftest.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/fixures.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/test_pipeline.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/test_document.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/__init__.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/config.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/dp.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/utils/utils.py +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/setup.cfg +0 -0
- {batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/setup.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: BatchalignHK
|
|
3
|
-
Version: 0.7.17.post17
|
|
3
|
+
Version: 0.7.17.post19
|
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
|
@@ -48,7 +48,6 @@ Dynamic: author-email
|
|
|
48
48
|
Dynamic: classifier
|
|
49
49
|
Dynamic: description
|
|
50
50
|
Dynamic: description-content-type
|
|
51
|
-
Dynamic: license-file
|
|
52
51
|
Dynamic: provides-extra
|
|
53
52
|
Dynamic: requires-dist
|
|
54
53
|
Dynamic: summary
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.2
|
|
2
2
|
Name: BatchalignHK
|
|
3
|
-
Version: 0.7.17.post17
|
|
3
|
+
Version: 0.7.17.post19
|
|
4
4
|
Summary: Python Speech Language Sample Analysis
|
|
5
5
|
Author: Brian MacWhinney, Houjun Liu
|
|
6
6
|
Author-email: macw@cmu.edu, houjun@cmu.edu
|
|
@@ -48,7 +48,6 @@ Dynamic: author-email
|
|
|
48
48
|
Dynamic: classifier
|
|
49
49
|
Dynamic: description
|
|
50
50
|
Dynamic: description-content-type
|
|
51
|
-
Dynamic: license-file
|
|
52
51
|
Dynamic: provides-extra
|
|
53
52
|
Dynamic: requires-dist
|
|
54
53
|
Dynamic: summary
|
|
@@ -66,12 +66,16 @@ def _dispatch(command, lang, num_speakers,
|
|
|
66
66
|
|
|
67
67
|
if kwargs.get("data"):
|
|
68
68
|
url = kwargs.get("data")
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
69
|
+
with open(url.strip()) as data:
|
|
70
|
+
data = data.readlines()
|
|
71
|
+
data = [i.strip() for i in data if i.strip() != ""]
|
|
72
|
+
for url in data:
|
|
73
|
+
url = urlparse(url)
|
|
74
|
+
if url.scheme == "":
|
|
75
|
+
url = url._replace(scheme="http")
|
|
76
|
+
base = os.path.basename(url.path)
|
|
77
|
+
files.append(url)
|
|
78
|
+
outputs.append(os.path.join(out_dir, base))
|
|
75
79
|
|
|
76
80
|
for basedir, _, fs in os.walk(in_dir):
|
|
77
81
|
for f in fs:
|
|
@@ -19,6 +19,9 @@ from transformers import DataCollatorForTokenClassification
|
|
|
19
19
|
# tqdm
|
|
20
20
|
from tqdm import tqdm
|
|
21
21
|
|
|
22
|
+
import logging
|
|
23
|
+
L = logging.getLogger("batchalign")
|
|
24
|
+
|
|
22
25
|
# seed device and tokens
|
|
23
26
|
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
|
|
24
27
|
|
|
@@ -34,8 +37,8 @@ class BertCantoneseUtteranceModel(object):
|
|
|
34
37
|
|
|
35
38
|
# eval mode
|
|
36
39
|
self.model.eval()
|
|
37
|
-
|
|
38
|
-
|
|
40
|
+
L.debug(f"Model and tokenizer initialized on device: {DEVICE}")
|
|
41
|
+
L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
|
|
39
42
|
|
|
40
43
|
def __call__(self, passage):
|
|
41
44
|
# Step 1: Clean up passage
|
|
@@ -79,14 +82,14 @@ class BertCantoneseUtteranceModel(object):
|
|
|
79
82
|
break
|
|
80
83
|
|
|
81
84
|
# Debugging: Print number of chunks and their content
|
|
82
|
-
|
|
85
|
+
L.debug(f"Created {len(chunks)} chunks based on keywords.")
|
|
83
86
|
for i, chunk in enumerate(chunks):
|
|
84
|
-
|
|
87
|
+
L.debug(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
|
|
85
88
|
|
|
86
89
|
# Step 3: Process each chunk and restore punctuation
|
|
87
90
|
final_passage = []
|
|
88
91
|
for chunk_index, chunk in enumerate(chunks):
|
|
89
|
-
|
|
92
|
+
L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
|
|
90
93
|
|
|
91
94
|
# Step 3.1: Split chunk by characters (Chinese tokenization)
|
|
92
95
|
tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
|
|
@@ -103,7 +106,7 @@ class BertCantoneseUtteranceModel(object):
|
|
|
103
106
|
# Pass it through the model
|
|
104
107
|
res = self.model(**tokd).logits
|
|
105
108
|
except Exception as e:
|
|
106
|
-
|
|
109
|
+
L.error(f"Error during model inference: {e}")
|
|
107
110
|
return []
|
|
108
111
|
|
|
109
112
|
# Argmax for classification
|
|
@@ -152,7 +155,7 @@ class BertCantoneseUtteranceModel(object):
|
|
|
152
155
|
# Step 4: Join processed chunks together into the final passage
|
|
153
156
|
final_passage = ' '.join(final_passage)
|
|
154
157
|
|
|
155
|
-
|
|
158
|
+
L.info("Text processing completed. Generating final output...")
|
|
156
159
|
|
|
157
160
|
# Optionally, tokenize the final text into sentences based on punctuation
|
|
158
161
|
def custom_sent_tokenize(text):
|
|
@@ -164,12 +167,12 @@ class BertCantoneseUtteranceModel(object):
|
|
|
164
167
|
parts = re.split(sentence_endings, text)
|
|
165
168
|
|
|
166
169
|
# Debug: Output the parts after splitting
|
|
167
|
-
|
|
170
|
+
L.debug(f"Parts after splitting: {parts}")
|
|
168
171
|
|
|
169
172
|
# Combine parts and punctuation together
|
|
170
173
|
for i in range(0, len(parts) - 1, 2):
|
|
171
174
|
sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
|
|
172
|
-
|
|
175
|
+
L.debug(f"Sentence formed: {sentence}") # Debug: Output the current sentence
|
|
173
176
|
|
|
174
177
|
if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
|
|
175
178
|
split_passage.append(sentence)
|
|
@@ -177,18 +180,18 @@ class BertCantoneseUtteranceModel(object):
|
|
|
177
180
|
# If the last part doesn't have punctuation, we handle it here
|
|
178
181
|
if len(parts) % 2 != 0: # If there's no punctuation at the end
|
|
179
182
|
last_part = parts[-1].strip()
|
|
180
|
-
|
|
183
|
+
L.debug(f"Last part without punctuation: {last_part}") # Debug: Output the last part
|
|
181
184
|
|
|
182
185
|
if last_part: # Only add non-empty sentences
|
|
183
186
|
split_passage.append(last_part)
|
|
184
187
|
|
|
185
188
|
# Final output
|
|
186
|
-
|
|
189
|
+
L.debug(f"Final split passage: {split_passage}")
|
|
187
190
|
return split_passage
|
|
188
191
|
|
|
189
192
|
split_passage = custom_sent_tokenize(final_passage)
|
|
190
193
|
|
|
191
194
|
# Debugging: Output the sentences after splitting
|
|
192
|
-
|
|
195
|
+
L.debug(f"Final sentences: {split_passage}")
|
|
193
196
|
|
|
194
197
|
return split_passage
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/ud.py
RENAMED
|
@@ -462,7 +462,7 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
|
|
|
462
462
|
# specivl forms: recall the special form marker is xbxxx
|
|
463
463
|
if "xbxxx" in word.text.strip():
|
|
464
464
|
form = special_forms.pop(0)
|
|
465
|
-
mor.append(f"
|
|
465
|
+
mor.append(f"{form[1].strip()}|{form[0].strip().replace(',', 'cm')}")
|
|
466
466
|
special_form_ids.append(word.id)
|
|
467
467
|
else:
|
|
468
468
|
mor.append(mor_word)
|
|
@@ -555,7 +555,6 @@ def parse_sentence(sentence, delimiter=".", special_forms=[], lang="$nospecial$"
|
|
|
555
555
|
# add a deliminator
|
|
556
556
|
mor_str = mor_str + " " + delimiter
|
|
557
557
|
|
|
558
|
-
|
|
559
558
|
mor_str = mor_str.replace("<UNK>", "")
|
|
560
559
|
gra_str = gra_str.replace("<UNK>", "")
|
|
561
560
|
|
|
@@ -843,7 +842,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
|
843
842
|
special_forms_cleaned = []
|
|
844
843
|
for form in special_forms:
|
|
845
844
|
line_cut = line_cut.replace(form, "xbxxx")
|
|
846
|
-
special_forms_cleaned.append(
|
|
845
|
+
special_forms_cleaned.append(form.split("@"))
|
|
847
846
|
|
|
848
847
|
# if line cut is still nothing, we get very angry
|
|
849
848
|
if line_cut == "":
|
|
@@ -942,7 +941,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
|
942
941
|
if ut[i.payload].text != ",":
|
|
943
942
|
ut[i.payload].morphology = [Morphology(
|
|
944
943
|
lemma = sents[0].tokens[i.payload].text if len(sents) > 0 and len(sents[0].tokens) > i.payload and sents[0].tokens[i.payload].text != "xbxxx" else ut[i.payload].text,
|
|
945
|
-
pos = "x",
|
|
944
|
+
pos = ut[i.payload].morphology[0].pos if (ut[i.payload].morphology and len(ut[i.payload].morphology) > 0) else "x",
|
|
946
945
|
feats = ""
|
|
947
946
|
)]
|
|
948
947
|
poses = [i.morphology[0].pos.upper() for i in ut
|
|
@@ -999,6 +998,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
|
|
|
999
998
|
content.dependency = form.dependency
|
|
1000
999
|
|
|
1001
1000
|
except Exception as e:
|
|
1001
|
+
raise e
|
|
1002
1002
|
pass
|
|
1003
1003
|
# warnings.warn(f"Utterance failed parsing, skipping ud tagging... line='{line}', error='{e}'.\n")
|
|
1004
1004
|
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/dependency_links.txt
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/entry_points.txt
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/requires.txt
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/BatchalignHK.egg-info/top_level.txt
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/chat/generator.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/file.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/generator.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/formats/textgrid/parser.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/config.yaml
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/infer.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/speaker/utils.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/training/utils.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/dataset.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/execute.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/infer.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/prep.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/utterance/train.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/wave2vec/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/wave2vec/infer_fa.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/infer_asr.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/models/whisper/infer_fa.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/analysis/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/analysis/eval.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/num2chinese.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/tencent.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/whisper.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/asr/whisperx.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/cleanup.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/cleanup/retrace.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/wave2vec_fa.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/fa/whisper_fa.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/morphosyntax/coref.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/speaker/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/translate/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/translate/seamless.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/__init__.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/rev_utr.py
RENAMED
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utr/whisper_utr.py
RENAMED
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/pipelines/utterance/__init__.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{batchalignhk-0.7.17.post17 → batchalignhk-0.7.17.post19}/batchalign/tests/pipelines/fixures.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|