BatchalignHK 0.7.17.post16__tar.gz → 0.7.17.post18__tar.gz

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/PKG-INFO +4 -2
  2. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/requires.txt +1 -0
  3. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/PKG-INFO +4 -2
  4. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/cli/dispatch.py +10 -6
  5. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/cantonese_infer.py +15 -12
  6. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/tencent.py +4 -1
  7. batchalignhk-0.7.17.post18/batchalign/version +3 -0
  8. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/setup.py +1 -0
  9. batchalignhk-0.7.17.post16/batchalign/version +0 -3
  10. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/SOURCES.txt +0 -0
  11. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/dependency_links.txt +0 -0
  12. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/entry_points.txt +0 -0
  13. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/BatchalignHK.egg-info/top_level.txt +0 -0
  14. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/LICENSE +0 -0
  15. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/MANIFEST.in +0 -0
  16. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/README.md +0 -0
  17. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/__init__.py +0 -0
  18. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/__main__.py +0 -0
  19. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/cli/__init__.py +0 -0
  20. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/cli/cli.py +0 -0
  21. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/constants.py +0 -0
  22. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/document.py +0 -0
  23. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/errors.py +0 -0
  24. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/__init__.py +0 -0
  25. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/base.py +0 -0
  26. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/__init__.py +0 -0
  27. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/file.py +0 -0
  28. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/generator.py +0 -0
  29. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/lexer.py +0 -0
  30. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/parser.py +0 -0
  31. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/chat/utils.py +0 -0
  32. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/__init__.py +0 -0
  33. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/file.py +0 -0
  34. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/generator.py +0 -0
  35. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/formats/textgrid/parser.py +0 -0
  36. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/__init__.py +0 -0
  37. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/resolve.py +0 -0
  38. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/__init__.py +0 -0
  39. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/config.yaml +0 -0
  40. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/infer.py +0 -0
  41. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/speaker/utils.py +0 -0
  42. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/__init__.py +0 -0
  43. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/run.py +0 -0
  44. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/training/utils.py +0 -0
  45. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utils.py +0 -0
  46. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/__init__.py +0 -0
  47. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/dataset.py +0 -0
  48. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/execute.py +0 -0
  49. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/infer.py +0 -0
  50. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/prep.py +0 -0
  51. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/utterance/train.py +0 -0
  52. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/wave2vec/__init__.py +0 -0
  53. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/wave2vec/infer_fa.py +0 -0
  54. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/__init__.py +0 -0
  55. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/infer_asr.py +0 -0
  56. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/models/whisper/infer_fa.py +0 -0
  57. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/__init__.py +0 -0
  58. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/analysis/__init__.py +0 -0
  59. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/analysis/eval.py +0 -0
  60. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/__init__.py +0 -0
  61. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/num2chinese.py +0 -0
  62. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/rev.py +0 -0
  63. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/utils.py +0 -0
  64. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/whisper.py +0 -0
  65. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/asr/whisperx.py +0 -0
  66. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/base.py +0 -0
  67. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/__init__.py +0 -0
  68. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  69. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  70. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  71. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/retrace.py +0 -0
  72. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  73. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  74. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/cleanup/support/test.test +0 -0
  75. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/dispatch.py +0 -0
  76. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/__init__.py +0 -0
  77. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  78. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  79. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  80. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  81. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  82. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  83. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  84. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  85. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  86. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/morphosyntax/ud.py +0 -0
  87. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/pipeline.py +0 -0
  88. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/speaker/__init__.py +0 -0
  89. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  90. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/translate/__init__.py +0 -0
  91. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/translate/seamless.py +0 -0
  92. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/__init__.py +0 -0
  93. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/rev_utr.py +0 -0
  94. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/utils.py +0 -0
  95. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  96. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utterance/__init__.py +0 -0
  97. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  98. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/__init__.py +0 -0
  99. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/conftest.py +0 -0
  100. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  101. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  102. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  103. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  104. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  105. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  106. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  107. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  108. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  109. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  110. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  111. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  112. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/fixures.py +0 -0
  113. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  114. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  115. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/tests/test_document.py +0 -0
  116. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/__init__.py +0 -0
  117. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/config.py +0 -0
  118. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/dp.py +0 -0
  119. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/batchalign/utils/utils.py +0 -0
  120. {batchalignhk-0.7.17.post16 → batchalignhk-0.7.17.post18}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: BatchalignHK
3
- Version: 0.7.17.post16
3
+ Version: 0.7.17.post18
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -15,6 +15,7 @@ Requires-Dist: torch>=2.6.0
15
15
  Requires-Dist: torchaudio
16
16
  Requires-Dist: hmmlearn==0.3.0
17
17
  Requires-Dist: eyed3
18
+ Requires-Dist: opencc-python-reimplemented
18
19
  Requires-Dist: pydub
19
20
  Requires-Dist: imblearn
20
21
  Requires-Dist: plotly>=5.3.0
@@ -47,6 +48,7 @@ Dynamic: author-email
47
48
  Dynamic: classifier
48
49
  Dynamic: description
49
50
  Dynamic: description-content-type
51
+ Dynamic: license-file
50
52
  Dynamic: provides-extra
51
53
  Dynamic: requires-dist
52
54
  Dynamic: summary
@@ -5,6 +5,7 @@ torch>=2.6.0
5
5
  torchaudio
6
6
  hmmlearn==0.3.0
7
7
  eyed3
8
+ opencc-python-reimplemented
8
9
  pydub
9
10
  imblearn
10
11
  plotly>=5.3.0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: BatchalignHK
3
- Version: 0.7.17.post16
3
+ Version: 0.7.17.post18
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -15,6 +15,7 @@ Requires-Dist: torch>=2.6.0
15
15
  Requires-Dist: torchaudio
16
16
  Requires-Dist: hmmlearn==0.3.0
17
17
  Requires-Dist: eyed3
18
+ Requires-Dist: opencc-python-reimplemented
18
19
  Requires-Dist: pydub
19
20
  Requires-Dist: imblearn
20
21
  Requires-Dist: plotly>=5.3.0
@@ -47,6 +48,7 @@ Dynamic: author-email
47
48
  Dynamic: classifier
48
49
  Dynamic: description
49
50
  Dynamic: description-content-type
51
+ Dynamic: license-file
50
52
  Dynamic: provides-extra
51
53
  Dynamic: requires-dist
52
54
  Dynamic: summary
@@ -66,12 +66,16 @@ def _dispatch(command, lang, num_speakers,
66
66
 
67
67
  if kwargs.get("data"):
68
68
  url = kwargs.get("data")
69
- url = urlparse(url)
70
- if url.scheme == "":
71
- url = url._replace(scheme="http")
72
- base = os.path.basename(url.path)
73
- files.append(url)
74
- outputs.append(os.path.join(out_dir, base))
69
+ with open(url.strip()) as data:
70
+ data = data.readlines()
71
+ data = [i.strip() for i in data if i.strip() != ""]
72
+ for url in data:
73
+ url = urlparse(url)
74
+ if url.scheme == "":
75
+ url = url._replace(scheme="http")
76
+ base = os.path.basename(url.path)
77
+ files.append(url)
78
+ outputs.append(os.path.join(out_dir, base))
75
79
 
76
80
  for basedir, _, fs in os.walk(in_dir):
77
81
  for f in fs:
@@ -19,6 +19,9 @@ from transformers import DataCollatorForTokenClassification
19
19
  # tqdm
20
20
  from tqdm import tqdm
21
21
 
22
+ import logging
23
+ L = logging.getLogger("batchalign")
24
+
22
25
  # seed device and tokens
23
26
  DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
24
27
 
@@ -34,8 +37,8 @@ class BertCantoneseUtteranceModel(object):
34
37
 
35
38
  # eval mode
36
39
  self.model.eval()
37
- print(f"Model and tokenizer initialized on device: {DEVICE}")
38
- print(f"Max length set to {self.max_length} with overlap of {self.overlap}")
40
+ L.debug(f"Model and tokenizer initialized on device: {DEVICE}")
41
+ L.debug(f"Max length set to {self.max_length} with overlap of {self.overlap}")
39
42
 
40
43
  def __call__(self, passage):
41
44
  # Step 1: Clean up passage
@@ -79,14 +82,14 @@ class BertCantoneseUtteranceModel(object):
79
82
  break
80
83
 
81
84
  # Debugging: Print number of chunks and their content
82
- print(f"Created {len(chunks)} chunks based on keywords.")
85
+ L.debug(f"Created {len(chunks)} chunks based on keywords.")
83
86
  for i, chunk in enumerate(chunks):
84
- print(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
87
+ L.debug(f"Chunk {i + 1}: {chunk[:100]}...") # Print the first 100 characters of each chunk
85
88
 
86
89
  # Step 3: Process each chunk and restore punctuation
87
90
  final_passage = []
88
91
  for chunk_index, chunk in enumerate(chunks):
89
- print(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
92
+ L.debug(f"Processing chunk {chunk_index + 1}/{len(chunks)}...")
90
93
 
91
94
  # Step 3.1: Split chunk by characters (Chinese tokenization)
92
95
  tokenized_chunk = list(chunk) # Simply split by characters for Chinese text
@@ -103,7 +106,7 @@ class BertCantoneseUtteranceModel(object):
103
106
  # Pass it through the model
104
107
  res = self.model(**tokd).logits
105
108
  except Exception as e:
106
- print(f"Error during model inference: {e}")
109
+ L.error(f"Error during model inference: {e}")
107
110
  return []
108
111
 
109
112
  # Argmax for classification
@@ -152,7 +155,7 @@ class BertCantoneseUtteranceModel(object):
152
155
  # Step 4: Join processed chunks together into the final passage
153
156
  final_passage = ' '.join(final_passage)
154
157
 
155
- print("Text processing completed. Generating final output...")
158
+ L.info("Text processing completed. Generating final output...")
156
159
 
157
160
  # Optionally, tokenize the final text into sentences based on punctuation
158
161
  def custom_sent_tokenize(text):
@@ -164,12 +167,12 @@ class BertCantoneseUtteranceModel(object):
164
167
  parts = re.split(sentence_endings, text)
165
168
 
166
169
  # Debug: Output the parts after splitting
167
- print(f"Parts after splitting: {parts}")
170
+ L.debug(f"Parts after splitting: {parts}")
168
171
 
169
172
  # Combine parts and punctuation together
170
173
  for i in range(0, len(parts) - 1, 2):
171
174
  sentence = parts[i] + parts[i + 1] # Join sentence with punctuation
172
- print(f"Sentence formed: {sentence}") # Debug: Output the current sentence
175
+ L.debug(f"Sentence formed: {sentence}") # Debug: Output the current sentence
173
176
 
174
177
  if sentence.strip(): # Only add non-empty sentences (check for non-whitespace content)
175
178
  split_passage.append(sentence)
@@ -177,18 +180,18 @@ class BertCantoneseUtteranceModel(object):
177
180
  # If the last part doesn't have punctuation, we handle it here
178
181
  if len(parts) % 2 != 0: # If there's no punctuation at the end
179
182
  last_part = parts[-1].strip()
180
- print(f"Last part without punctuation: {last_part}") # Debug: Output the last part
183
+ L.debug(f"Last part without punctuation: {last_part}") # Debug: Output the last part
181
184
 
182
185
  if last_part: # Only add non-empty sentences
183
186
  split_passage.append(last_part)
184
187
 
185
188
  # Final output
186
- print(f"Final split passage: {split_passage}")
189
+ L.debug(f"Final split passage: {split_passage}")
187
190
  return split_passage
188
191
 
189
192
  split_passage = custom_sent_tokenize(final_passage)
190
193
 
191
194
  # Debugging: Output the sentences after splitting
192
- print(f"Final sentences: {split_passage}")
195
+ L.debug(f"Final sentences: {split_passage}")
193
196
 
194
197
  return split_passage
@@ -12,6 +12,9 @@ from batchalign.errors import *
12
12
 
13
13
  from batchalign.models import BertUtteranceModel, BertCantoneseUtteranceModel, resolve
14
14
 
15
+ from opencc import OpenCC
16
+ cc = OpenCC('s2hk')
17
+
15
18
  import time
16
19
  import pathlib
17
20
  import pycountry
@@ -113,7 +116,7 @@ class TencentEngine(BatchalignEngine):
113
116
  "type": "text",
114
117
  "ts": (j.OffsetStartMs+start)/1000,
115
118
  "end_ts": (j.OffsetEndMs+start)/1000,
116
- "value": j.Word
119
+ "value": cc.convert(j.Word)
117
120
  })
118
121
  turns.append({
119
122
  "elements": turn,
@@ -0,0 +1,3 @@
1
+ 0.7.17-post.18
2
+ March 26th, 2025
3
+ cantonese ASR with general fixes for inputting a list
@@ -34,6 +34,7 @@ setup(
34
34
  # "pyAudioAnalysis",
35
35
  "hmmlearn==0.3.0",
36
36
  "eyed3",
37
+ "opencc-python-reimplemented",
37
38
  "pydub",
38
39
  "imblearn",
39
40
  "plotly>=5.3.0",
@@ -1,3 +0,0 @@
1
- 0.7.17-post.16
2
- March 26th, 2025
3
- better tencent ASR