batchalign 0.7.20.post11__tar.gz → 0.7.20.post13__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic. Click here for more details.

Files changed (125)
  1. {batchalign-0.7.20.post11/batchalign.egg-info → batchalign-0.7.20.post13}/PKG-INFO +3 -2
  2. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/document.py +5 -2
  3. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/chat/lexer.py +10 -8
  4. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/resolve.py +1 -1
  5. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/whisper/infer_asr.py +2 -2
  6. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/ud.py +5 -1
  7. batchalign-0.7.20.post13/batchalign/version +3 -0
  8. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13/batchalign.egg-info}/PKG-INFO +3 -2
  9. batchalign-0.7.20.post11/batchalign/version +0 -3
  10. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/LICENSE +0 -0
  11. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/MANIFEST.in +0 -0
  12. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/README.md +0 -0
  13. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/__init__.py +0 -0
  14. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/__main__.py +0 -0
  15. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/cli/__init__.py +0 -0
  16. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/cli/cli.py +0 -0
  17. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/cli/dispatch.py +0 -0
  18. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/constants.py +0 -0
  19. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/errors.py +0 -0
  20. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/__init__.py +0 -0
  21. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/base.py +0 -0
  22. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/chat/__init__.py +0 -0
  23. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/chat/file.py +0 -0
  24. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/chat/generator.py +0 -0
  25. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/chat/parser.py +0 -0
  26. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/chat/utils.py +0 -0
  27. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/textgrid/__init__.py +0 -0
  28. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/textgrid/file.py +0 -0
  29. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/textgrid/generator.py +0 -0
  30. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/formats/textgrid/parser.py +0 -0
  31. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/__init__.py +0 -0
  32. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/speaker/__init__.py +0 -0
  33. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/speaker/config.yaml +0 -0
  34. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/speaker/infer.py +0 -0
  35. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/speaker/utils.py +0 -0
  36. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/training/__init__.py +0 -0
  37. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/training/run.py +0 -0
  38. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/training/utils.py +0 -0
  39. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utils.py +0 -0
  40. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/__init__.py +0 -0
  41. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/cantonese_infer.py +0 -0
  42. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/dataset.py +0 -0
  43. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/execute.py +0 -0
  44. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/infer.py +0 -0
  45. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/prep.py +0 -0
  46. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/utterance/train.py +0 -0
  47. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/wave2vec/__init__.py +0 -0
  48. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/wave2vec/infer_fa.py +0 -0
  49. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/whisper/__init__.py +0 -0
  50. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/models/whisper/infer_fa.py +0 -0
  51. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/__init__.py +0 -0
  52. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/analysis/__init__.py +0 -0
  53. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/analysis/eval.py +0 -0
  54. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/__init__.py +0 -0
  55. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/num2chinese.py +0 -0
  56. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/oai_whisper.py +0 -0
  57. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/rev.py +0 -0
  58. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/utils.py +0 -0
  59. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/whisper.py +0 -0
  60. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/asr/whisperx.py +0 -0
  61. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/base.py +0 -0
  62. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/__init__.py +0 -0
  63. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  64. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  65. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  66. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/retrace.py +0 -0
  67. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  68. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  69. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/cleanup/support/test.test +0 -0
  70. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/dispatch.py +0 -0
  71. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/fa/__init__.py +0 -0
  72. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  73. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  74. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  75. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  76. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  77. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  78. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  79. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  80. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  81. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/pipeline.py +0 -0
  82. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/speaker/__init__.py +0 -0
  83. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  84. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/translate/__init__.py +0 -0
  85. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/translate/gtrans.py +0 -0
  86. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/translate/seamless.py +0 -0
  87. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/translate/utils.py +0 -0
  88. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/utr/__init__.py +0 -0
  89. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/utr/rev_utr.py +0 -0
  90. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/utr/utils.py +0 -0
  91. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  92. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/utterance/__init__.py +0 -0
  93. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  94. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/__init__.py +0 -0
  95. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/conftest.py +0 -0
  96. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  97. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  98. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  99. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  100. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  101. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  102. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  103. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  104. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  105. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  106. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  107. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  108. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/fixures.py +0 -0
  109. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  110. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  111. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/tests/test_document.py +0 -0
  112. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/__init__.py +0 -0
  113. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/abbrev.py +0 -0
  114. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/compounds.py +0 -0
  115. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/config.py +0 -0
  116. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/dp.py +0 -0
  117. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/names.py +0 -0
  118. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign/utils/utils.py +0 -0
  119. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign.egg-info/SOURCES.txt +0 -0
  120. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign.egg-info/dependency_links.txt +0 -0
  121. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign.egg-info/entry_points.txt +0 -0
  122. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign.egg-info/requires.txt +0 -0
  123. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/batchalign.egg-info/top_level.txt +0 -0
  124. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/setup.cfg +0 -0
  125. {batchalign-0.7.20.post11 → batchalign-0.7.20.post13}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: batchalign
3
- Version: 0.7.20.post11
3
+ Version: 0.7.20.post13
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -48,6 +48,7 @@ Dynamic: author-email
48
48
  Dynamic: classifier
49
49
  Dynamic: description
50
50
  Dynamic: description-content-type
51
+ Dynamic: license-file
51
52
  Dynamic: provides-extra
52
53
  Dynamic: requires-dist
53
54
  Dynamic: summary
@@ -206,8 +206,8 @@ class Utterance(BaseModel):
206
206
  def __len__(self):
207
207
  return len(self.content)
208
208
 
209
- def __str__(self):
210
- if self.text != None:
209
+ def tostring(self, always_detokenize=False):
210
+ if self.text != None and not always_detokenize:
211
211
  t = self.text
212
212
  else:
213
213
  t = self._detokenize()
@@ -231,6 +231,9 @@ class Utterance(BaseModel):
231
231
 
232
232
  return t
233
233
 
234
+ def __str__(self):
235
+ return self.tostring()
236
+
234
237
  def __repr__(self):
235
238
  return str(self)
236
239
 
@@ -79,14 +79,16 @@ class UtteranceLexer:
79
79
  # self.__clauses.append((form.strip(), TokenType.FEAT))
80
80
  elif form.strip() in NORMAL_GROUP_MARKS:
81
81
  # basically ignore the form
82
- popped = self.__clauses.pop(-1)[0]
83
- if not isinstance(popped, str):
84
- for i in popped:
85
- if i[0] not in CHAT_IGNORE and i[0] != "&":
86
- self.__clauses.append(i)
87
- else:
88
- if popped not in CHAT_IGNORE and popped[0] != "&":
89
- self.__clauses.append((popped, TokenType.REGULAR))
82
+ o = self.__clauses.pop(-1)
83
+ if len(o) <= 1 or o[1] != TokenType.FP:
84
+ popped = o[0]
85
+ if not isinstance(popped, str):
86
+ for i in popped:
87
+ if i[0] not in CHAT_IGNORE and i[0] != "&":
88
+ self.__clauses.append(i)
89
+ else:
90
+ if popped not in CHAT_IGNORE and popped[0] != "&":
91
+ self.__clauses.append((popped, TokenType.REGULAR))
90
92
  # if isinstance(popped, str) and :
91
93
  # pass
92
94
  # self.__clauses.append((form.strip(), TokenType.FEAT))
@@ -11,7 +11,7 @@ resolver = {
11
11
  "yue": "PolyU-AngelChanLab/Cantonese-Utterance-Segmentation",
12
12
  },
13
13
  "whisper": {
14
- 'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
14
+ 'eng': ("talkbank/CHATWhisper-en", "openai/whisper-large-v2"),
15
15
  'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
16
16
  "heb": ("ivrit-ai/whisper-large-v3", "ivrit-ai/whisper-large-v3")
17
17
  }
@@ -93,7 +93,7 @@ class WhisperASRModel(object):
93
93
  stride_length_s=3,
94
94
  device=DEVICE,
95
95
  torch_dtype=torch.bfloat16,
96
- return_timestamps="word",
96
+ return_timestamps=True,
97
97
  )
98
98
  except TypeError:
99
99
  self.pipe = pipeline(
@@ -104,7 +104,7 @@ class WhisperASRModel(object):
104
104
  stride_length_s=3,
105
105
  device=DEVICE,
106
106
  torch_dtype=torch.float16,
107
- return_timestamps="word",
107
+ return_timestamps=True,
108
108
  )
109
109
  L.debug("Done, initalizing processor and config...")
110
110
  processor = WhisperProcessor.from_pretrained(base)
@@ -729,7 +729,7 @@ def morphoanalyze(doc: Document, retokenize:bool, skipmultilang:bool, status_hoo
729
729
  pass
730
730
 
731
731
 
732
- # pycountry.languages.get(alpha_3=i).alpha_2 for i in lang
732
+ # pycountry.languages.get(alpha_3=i).alpha_2 for i in lang
733
733
 
734
734
  config = {"processors": {"tokenize": "default",
735
735
  "pos": "default",
@@ -813,6 +813,9 @@ def morphoanalyze(doc: Document, retokenize:bool, skipmultilang:bool, status_hoo
813
813
  line_cut = i.strip(join_with_spaces=True)
814
814
  else:
815
815
  line_cut = i.strip(join_with_spaces=True)[:-len(ending)].strip()
816
+
817
+ # import ipdb
818
+ # ipdb.set_trace()
816
819
  # ending = ending.replace("+//", "")
817
820
 
818
821
  # if we don't have anything in line cut, just take the original
@@ -881,6 +884,7 @@ def morphoanalyze(doc: Document, retokenize:bool, skipmultilang:bool, status_hoo
881
884
 
882
885
  # parse the stanza output
883
886
  mor, gra = parse_sentence(sents[0], ending, special_forms_cleaned, lang[0])
887
+ mor = mor.replace("~part|s verb|", "~aux|is verb|")
884
888
  # breakpoint()
885
889
 
886
890
  if mor.strip() == "" or mor.strip() in ENDING_PUNCT:
@@ -0,0 +1,3 @@
1
+ 0.7.20-post.13
2
+ July 30th, 2025
3
+ even more parsing patches?
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.2
1
+ Metadata-Version: 2.4
2
2
  Name: batchalign
3
- Version: 0.7.20.post11
3
+ Version: 0.7.20.post13
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -48,6 +48,7 @@ Dynamic: author-email
48
48
  Dynamic: classifier
49
49
  Dynamic: description
50
50
  Dynamic: description-content-type
51
+ Dynamic: license-file
51
52
  Dynamic: provides-extra
52
53
  Dynamic: requires-dist
53
54
  Dynamic: summary
@@ -1,3 +0,0 @@
1
- 0.7.20-post.11
2
- July 30th, 2025
3
- even more parsing patches?