batchalign 0.7.1b8__tar.gz → 0.7.1b10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. {batchalign-0.7.1b8/batchalign.egg-info → batchalign-0.7.1b10}/PKG-INFO +1 -1
  2. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/cli/cli.py +15 -12
  3. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/document.py +7 -3
  4. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/utils.py +1 -1
  5. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/morphosyntax/ud.py +3 -0
  6. batchalign-0.7.1b10/batchalign/version +3 -0
  7. {batchalign-0.7.1b8 → batchalign-0.7.1b10/batchalign.egg-info}/PKG-INFO +1 -1
  8. batchalign-0.7.1b8/batchalign/version +0 -3
  9. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/LICENSE +0 -0
  10. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/MANIFEST.in +0 -0
  11. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/README.md +0 -0
  12. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/__init__.py +0 -0
  13. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/__main__.py +0 -0
  14. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/cli/__init__.py +0 -0
  15. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/cli/dispatch.py +0 -0
  16. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/constants.py +0 -0
  17. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/errors.py +0 -0
  18. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/__init__.py +0 -0
  19. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/base.py +0 -0
  20. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/__init__.py +0 -0
  21. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/file.py +0 -0
  22. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/generator.py +0 -0
  23. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/lexer.py +0 -0
  24. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/parser.py +0 -0
  25. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/chat/utils.py +0 -0
  26. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/__init__.py +0 -0
  27. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/file.py +0 -0
  28. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/generator.py +0 -0
  29. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/formats/textgrid/parser.py +0 -0
  30. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/__init__.py +0 -0
  31. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/resolve.py +0 -0
  32. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/__init__.py +0 -0
  33. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/config.yaml +0 -0
  34. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/infer.py +0 -0
  35. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/speaker/utils.py +0 -0
  36. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/training/__init__.py +0 -0
  37. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/training/run.py +0 -0
  38. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/training/utils.py +0 -0
  39. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utils.py +0 -0
  40. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/__init__.py +0 -0
  41. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/dataset.py +0 -0
  42. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/execute.py +0 -0
  43. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/infer.py +0 -0
  44. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/prep.py +0 -0
  45. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/utterance/train.py +0 -0
  46. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/whisper/__init__.py +0 -0
  47. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/whisper/infer_asr.py +0 -0
  48. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/models/whisper/infer_fa.py +0 -0
  49. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/__init__.py +0 -0
  50. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/analysis/__init__.py +0 -0
  51. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/analysis/eval.py +0 -0
  52. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/__init__.py +0 -0
  53. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/rev.py +0 -0
  54. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/whisper.py +0 -0
  55. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/asr/whisperx.py +0 -0
  56. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/base.py +0 -0
  57. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/__init__.py +0 -0
  58. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  59. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  60. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  61. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/retrace.py +0 -0
  62. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  63. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  64. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/cleanup/support/test.test +0 -0
  65. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/dispatch.py +0 -0
  66. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/fa/__init__.py +0 -0
  67. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  68. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  69. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  70. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/pipeline.py +0 -0
  71. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/speaker/__init__.py +0 -0
  72. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  73. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/__init__.py +0 -0
  74. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/rev_utr.py +0 -0
  75. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/utils.py +0 -0
  76. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  77. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utterance/__init__.py +0 -0
  78. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  79. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/__init__.py +0 -0
  80. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/conftest.py +0 -0
  81. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  82. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  83. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  84. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  85. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  86. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  87. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  88. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  89. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  90. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  91. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  92. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  93. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/fixures.py +0 -0
  94. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  95. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  96. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/tests/test_document.py +0 -0
  97. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/__init__.py +0 -0
  98. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/config.py +0 -0
  99. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/dp.py +0 -0
  100. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign/utils/utils.py +0 -0
  101. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/SOURCES.txt +0 -0
  102. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/dependency_links.txt +0 -0
  103. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/entry_points.txt +0 -0
  104. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/requires.txt +0 -0
  105. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/batchalign.egg-info/top_level.txt +0 -0
  106. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/setup.cfg +0 -0
  107. {batchalign-0.7.1b8 → batchalign-0.7.1b10}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.1b8
3
+ Version: 0.7.1b10
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -119,10 +119,10 @@ batchalign.add_command(train, "models")
119
119
  def align(ctx, in_dir, out_dir, lang, num_speakers, whisper, **kwargs):
120
120
  """Align transcripts against corresponding media files."""
121
121
  def loader(file):
122
- return CHATFile(path=os.path.abspath(file), special_mor_=True).doc
122
+ return CHATFile(path=os.path.abspath(file)).doc
123
123
 
124
124
  def writer(doc, output):
125
- CHATFile(doc=doc, special_mor_=True).write(output)
125
+ CHATFile(doc=doc).write(output)
126
126
 
127
127
  _dispatch("align", lang, num_speakers,
128
128
  ["cha"], ctx,
@@ -159,11 +159,11 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
159
159
  def writer(doc, output):
160
160
  doc.content.insert(0, CustomLine(id="Comment", type=CustomLineType.INDEPENDENT,
161
161
  content=f"Batchalign {VERSION_NUMBER.strip()}, ASR Engine {asr}"))
162
- CHATFile(doc=doc, special_mor_=True).write(output
163
- .replace(".wav", ".cha")
164
- .replace(".mp4", ".cha")
165
- .replace(".mp3", ".cha"),
166
- write_wor=kwargs.get("wor", False))
162
+ CHATFile(doc=doc).write(output
163
+ .replace(".wav", ".cha")
164
+ .replace(".mp4", ".cha")
165
+ .replace(".mp3", ".cha"),
166
+ write_wor=kwargs.get("wor", False))
167
167
 
168
168
  if kwargs.get("diarize"):
169
169
  _dispatch("transcribe_s",
@@ -192,7 +192,6 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
192
192
  def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
193
193
  """Perform morphosyntactic analysis on transcripts."""
194
194
 
195
-
196
195
  def loader(file):
197
196
  mwt = {}
198
197
  if kwargs.get("lexicon") != None and kwargs.get("lexicon", "").strip() != "":
@@ -202,13 +201,17 @@ def morphotag(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
202
201
  raw = [i for i in csv.reader(df)]
203
202
  for i in raw:
204
203
  mwt[i[0]] = tuple(i[1:])
204
+ cf = CHATFile(path=os.path.abspath(file), special_mor_=True)
205
+ doc = cf.doc
206
+ if str(cf).count("%mor") > 0:
207
+ doc.ba_special_["special_mor_notation"] = True
205
208
  return (
206
- CHATFile(path=os.path.abspath(file), special_mor_=True).doc,
209
+ doc,
207
210
  {"retokenize": kwargs["retokenize"], "mwt": mwt}
208
211
  )
209
212
 
210
213
  def writer(doc, output):
211
- CHATFile(doc=doc, special_mor_=True).write(output)
214
+ CHATFile(doc=doc, special_mor_=doc.ba_special_.get("special_mor_notation", False)).write(output)
212
215
 
213
216
  _dispatch("morphotag", lang, num_speakers, ["cha"], ctx,
214
217
  in_dir, out_dir,
@@ -224,10 +227,10 @@ def utseg(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
224
227
  """Perform morphosyntactic analysis on transcripts."""
225
228
 
226
229
  def loader(file):
227
- return CHATFile(path=os.path.abspath(file), special_mor_=True).doc
230
+ return CHATFile(path=os.path.abspath(file)).doc
228
231
 
229
232
  def writer(doc, output):
230
- CHATFile(doc=doc, special_mor_=True).write(output)
233
+ CHATFile(doc=doc).write(output)
231
234
 
232
235
  _dispatch("utseg", lang, num_speakers, ["cha"], ctx,
233
236
  in_dir, out_dir,
@@ -1,5 +1,5 @@
1
1
  from enum import Enum, IntEnum
2
- from typing import Optional, List, Tuple, Union
2
+ from typing import Optional, List, Tuple, Union, Any, Dict
3
3
  from typing_extensions import Annotated
4
4
 
5
5
  from pydantic import BaseModel, Field, computed_field
@@ -193,9 +193,12 @@ class Utterance(BaseModel):
193
193
 
194
194
  def __str__(self):
195
195
  if self.text != None:
196
- return self.text
196
+ t = self.text
197
197
  else:
198
- return self._detokenize()
198
+ t = self._detokenize()
199
+
200
+ t = t.replace(". . .", "+...")
201
+ return t
199
202
 
200
203
  def __repr__(self):
201
204
  return str(self)
@@ -331,6 +334,7 @@ class Document(BaseModel):
331
334
  langs: List[str] = Field(default=["eng"])
332
335
  # persistent digital identifier
333
336
  pid: Optional[str] = Field(default=None)
337
+ ba_special_: Optional[Dict] = Field(default={})
334
338
 
335
339
  def __repr__(self):
336
340
  return "\n".join(self.transcript())
@@ -188,7 +188,7 @@ def process_generation(output, lang="eng", utterance_engine=None):
188
188
 
189
189
  final_utterances.append(Utterance(
190
190
  tier=participant,
191
- content = words
191
+ content=words
192
192
  ))
193
193
 
194
194
  doc.content = final_utterances
@@ -848,7 +848,10 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
848
848
  for j in i]
849
849
 
850
850
  retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
851
+ retokenized_ut = retokenized_ut.replace("^", "")
851
852
  retokenized_ut = re.sub(r" +", " ", retokenized_ut)
853
+ retokenized_ut = retokenized_ut.replace("+ \"", "+\"")
854
+ retokenized_ut = retokenized_ut.replace(" >", ">")
852
855
  # pray to everyone that it works---this will simply crash and ignore
853
856
  # the utterance if it didn't work, so we are doing this as a sanity
854
857
  # check rather than needing the parsed result
@@ -0,0 +1,3 @@
1
+ 0.7.1-beta.10
2
+ May 21st, 2024
3
+ patch minor ud bugs
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.1b8
3
+ Version: 0.7.1b10
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.1-beta.8
2
- May 21st, 2024
3
- better retokenize algorithm
File without changes
File without changes
File without changes
File without changes
File without changes