batchalign 0.7.12__tar.gz → 0.7.13.post1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (116) hide show
  1. {batchalign-0.7.12/batchalign.egg-info → batchalign-0.7.13.post1}/PKG-INFO +1 -1
  2. batchalign-0.7.13.post1/batchalign/pipelines/asr/num2chinese.py +88 -0
  3. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/asr/utils.py +12 -1
  4. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/ud.py +1 -0
  5. batchalign-0.7.13.post1/batchalign/version +3 -0
  6. {batchalign-0.7.12 → batchalign-0.7.13.post1/batchalign.egg-info}/PKG-INFO +1 -1
  7. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign.egg-info/SOURCES.txt +1 -0
  8. batchalign-0.7.12/batchalign/version +0 -3
  9. {batchalign-0.7.12 → batchalign-0.7.13.post1}/LICENSE +0 -0
  10. {batchalign-0.7.12 → batchalign-0.7.13.post1}/MANIFEST.in +0 -0
  11. {batchalign-0.7.12 → batchalign-0.7.13.post1}/README.md +0 -0
  12. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/__init__.py +0 -0
  13. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/__main__.py +0 -0
  14. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/cli/__init__.py +0 -0
  15. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/cli/cli.py +0 -0
  16. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/cli/dispatch.py +0 -0
  17. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/constants.py +0 -0
  18. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/document.py +0 -0
  19. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/errors.py +0 -0
  20. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/__init__.py +0 -0
  21. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/base.py +0 -0
  22. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/chat/__init__.py +0 -0
  23. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/chat/file.py +0 -0
  24. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/chat/generator.py +0 -0
  25. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/chat/lexer.py +0 -0
  26. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/chat/parser.py +0 -0
  27. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/chat/utils.py +0 -0
  28. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/textgrid/__init__.py +0 -0
  29. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/textgrid/file.py +0 -0
  30. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/textgrid/generator.py +0 -0
  31. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/formats/textgrid/parser.py +0 -0
  32. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/__init__.py +0 -0
  33. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/resolve.py +0 -0
  34. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/speaker/__init__.py +0 -0
  35. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/speaker/config.yaml +0 -0
  36. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/speaker/infer.py +0 -0
  37. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/speaker/utils.py +0 -0
  38. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/training/__init__.py +0 -0
  39. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/training/run.py +0 -0
  40. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/training/utils.py +0 -0
  41. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utils.py +0 -0
  42. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utterance/__init__.py +0 -0
  43. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utterance/dataset.py +0 -0
  44. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utterance/execute.py +0 -0
  45. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utterance/infer.py +0 -0
  46. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utterance/prep.py +0 -0
  47. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/utterance/train.py +0 -0
  48. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/wave2vec/__init__.py +0 -0
  49. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/wave2vec/infer_fa.py +0 -0
  50. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/whisper/__init__.py +0 -0
  51. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/whisper/infer_asr.py +0 -0
  52. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/models/whisper/infer_fa.py +0 -0
  53. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/__init__.py +0 -0
  54. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/analysis/__init__.py +0 -0
  55. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/analysis/eval.py +0 -0
  56. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/asr/__init__.py +0 -0
  57. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/asr/rev.py +0 -0
  58. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/asr/whisper.py +0 -0
  59. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/asr/whisperx.py +0 -0
  60. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/base.py +0 -0
  61. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/__init__.py +0 -0
  62. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  63. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  64. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  65. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/retrace.py +0 -0
  66. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  67. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  68. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/cleanup/support/test.test +0 -0
  69. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/dispatch.py +0 -0
  70. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/fa/__init__.py +0 -0
  71. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  72. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  73. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  74. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  75. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  76. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  77. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  78. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  79. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  80. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/pipeline.py +0 -0
  81. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/speaker/__init__.py +0 -0
  82. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  83. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/utr/__init__.py +0 -0
  84. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/utr/rev_utr.py +0 -0
  85. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/utr/utils.py +0 -0
  86. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  87. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/utterance/__init__.py +0 -0
  88. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  89. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/__init__.py +0 -0
  90. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/conftest.py +0 -0
  91. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  92. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  93. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  94. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  95. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  96. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  97. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  98. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  99. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  100. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  101. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  102. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  103. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/fixures.py +0 -0
  104. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  105. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  106. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/tests/test_document.py +0 -0
  107. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/utils/__init__.py +0 -0
  108. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/utils/config.py +0 -0
  109. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/utils/dp.py +0 -0
  110. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign/utils/utils.py +0 -0
  111. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign.egg-info/dependency_links.txt +0 -0
  112. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign.egg-info/entry_points.txt +0 -0
  113. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign.egg-info/requires.txt +0 -0
  114. {batchalign-0.7.12 → batchalign-0.7.13.post1}/batchalign.egg-info/top_level.txt +0 -0
  115. {batchalign-0.7.12 → batchalign-0.7.13.post1}/setup.cfg +0 -0
  116. {batchalign-0.7.12 → batchalign-0.7.13.post1}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.12
3
+ Version: 0.7.13.post1
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -0,0 +1,88 @@
1
+ import itertools
2
+
3
def num2chinese(num, big=False, simp=True, o=False, twoalt=False):
    """
    Converts numbers to Chinese representations.

    `num`   : an int, float or numeric string (optionally signed, optionally
              with a decimal point); its `str()` form is what gets converted.
    `big`   : use financial characters.
    `simp`  : use simplified characters instead of traditional characters.
    `o`     : use 〇 for zero.
    `twoalt`: use 两/兩 for two when appropriate.

    Note that `o` and `twoalt` are ignored when `big` is used,
    and `twoalt` is ignored when `o` is used for formal representations.

    Returns the Chinese numeral as a string.

    Raises ValueError when the value cannot be parsed as a number, when its
    magnitude is 1e48 or more, or when its text form is in scientific
    notation.
    """
    import itertools

    # check num first; float() also rejects text that is not a number
    nd = str(num)
    if abs(float(nd)) >= 1e48:
        raise ValueError('number out of range')
    elif 'e' in nd:
        raise ValueError('scientific notation is not supported')

    c_symbol = '正负点' if simp else '正負點'
    if o:  # formal
        twoalt = False
    if big:
        c_basic = '零壹贰叁肆伍陆柒捌玖' if simp else '零壹貳參肆伍陸柒捌玖'
        c_unit1 = '拾佰仟'
        c_twoalt = '贰' if simp else '貳'
    else:
        c_basic = '〇一二三四五六七八九' if o else '零一二三四五六七八九'
        c_unit1 = '十百千'
        if twoalt:
            c_twoalt = '两' if simp else '兩'
        else:
            c_twoalt = '二'
    c_unit2 = '万亿兆京垓秭穰沟涧正载' if simp else '萬億兆京垓秭穰溝澗正載'
    zero = c_basic[0]

    def squeeze_zero(text):
        # Collapse every run of the zero character into a single one.
        # Working on characters (not list elements) also catches a zero
        # group that sits next to a group starting with a zero marker.
        return ''.join(
            key if key == zero else ''.join(run)
            for key, run in itertools.groupby(text)
        )

    result = []
    if nd[0] == '+':
        result.append(c_symbol[0])
    elif nd[0] == '-':
        result.append(c_symbol[1])

    if '.' in nd:
        integer, remainder = nd.lstrip('+-').split('.')
    else:
        integer, remainder = nd.lstrip('+-'), None

    # An empty integer part (e.g. ".5") is read as zero.
    if integer and int(integer):
        # Groups of four digits, least-significant group first.
        splitted = [integer[max(i - 4, 0):i]
                    for i in range(len(integer), 0, -4)]
        intresult = []
        for nu, unit in enumerate(splitted):
            # special cases
            if int(unit) == 0:  # 0000
                intresult.append(zero)
                continue
            elif nu > 0 and int(unit) == 2:  # 0002
                # Keep the leading zero marker that every other padded
                # group gets, so 1,0002,0000 reads like 1,0003,0000.
                # A surplus leading zero is stripped at the very end.
                intresult.append(zero + c_twoalt + c_unit2[nu - 1])
                continue
            ulist = []
            unit = unit.zfill(4)
            for nc, ch in enumerate(reversed(unit)):
                if ch == '0':
                    if ulist:  # ???0 - trailing zeros are not spoken
                        ulist.append(zero)
                elif nc == 0:
                    ulist.append(c_basic[int(ch)])
                elif nc == 1 and ch == '1' and unit[1] == '0':
                    # special case for tens
                    # edit the 'elif' if you don't like
                    # 十四, 三千零十四, 三千三百一十四
                    ulist.append(c_unit1[0])
                elif nc > 1 and ch == '2':
                    ulist.append(c_twoalt + c_unit1[nc - 1])
                else:
                    ulist.append(c_basic[int(ch)] + c_unit1[nc - 1])
            ustr = squeeze_zero(''.join(reversed(ulist)))
            if nu == 0:
                intresult.append(ustr)
            else:
                intresult.append(ustr + c_unit2[nu - 1])
        joined = squeeze_zero(''.join(reversed(intresult)))
        result.append(joined.strip(zero))
    else:
        result.append(zero)

    if remainder:
        result.append(c_symbol[2])
        result.append(''.join(c_basic[int(ch)] for ch in remainder))
    return ''.join(result)
@@ -3,6 +3,7 @@ from batchalign.document import *
3
3
  from batchalign.utils import *
4
4
 
5
5
  from batchalign.constants import ENDING_PUNCT
6
+ from batchalign.pipelines.asr.num2chinese import num2chinese
6
7
 
7
8
  from num2words import num2words
8
9
  import pycountry
@@ -165,7 +166,17 @@ def process_generation(output, lang="eng", utterance_engine=None):
165
166
  try:
166
167
  return num2words(i, lang=lang_2)
167
168
  except NotImplementedError:
168
- return i
169
+ try:
170
+ if lang == "zho":
171
+ return num2chinese(i)
172
+ elif lang == "jpn":
173
+ return num2chinese(i, simp=False)
174
+ elif lang == "yue":
175
+ return num2chinese(i, simp=False)
176
+ else:
177
+ return i
178
+ except:
179
+ return i
169
180
  final_words = [[catched_num2words(i), j] for i,j in final_words]
170
181
 
171
182
  # if the final words is > 300, split into n parts
@@ -826,6 +826,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
826
826
  line_cut = line_cut.replace("+//", "")
827
827
  line_cut = line_cut.replace("+...", "")
828
828
  line_cut = line_cut.replace("_", "")
829
+ line_cut = line_cut.replace("#", "")
829
830
 
830
831
  # xbxxx is a special xxx-class token to mark
831
832
  # special form markers, used for processing later
@@ -0,0 +1,3 @@
1
+ 0.7.13-post.1
2
+ February 14th, 2025
3
+ Remove hash sign.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.12
3
+ Version: 0.7.13.post1
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -57,6 +57,7 @@ batchalign/pipelines/pipeline.py
57
57
  batchalign/pipelines/analysis/__init__.py
58
58
  batchalign/pipelines/analysis/eval.py
59
59
  batchalign/pipelines/asr/__init__.py
60
+ batchalign/pipelines/asr/num2chinese.py
60
61
  batchalign/pipelines/asr/rev.py
61
62
  batchalign/pipelines/asr/utils.py
62
63
  batchalign/pipelines/asr/whisper.py
@@ -1,3 +0,0 @@
1
- 0.7.12
2
- Feburary 6nd, 2025
3
- Wav2vec support!
File without changes
File without changes
File without changes
File without changes