batchalign 0.7.1b6__tar.gz → 0.7.1b8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. {batchalign-0.7.1b6/batchalign.egg-info → batchalign-0.7.1b8}/PKG-INFO +1 -1
  2. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/cli/cli.py +9 -6
  3. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/chat/parser.py +1 -1
  4. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/morphosyntax/ud.py +46 -0
  5. batchalign-0.7.1b8/batchalign/version +3 -0
  6. {batchalign-0.7.1b6 → batchalign-0.7.1b8/batchalign.egg-info}/PKG-INFO +1 -1
  7. batchalign-0.7.1b6/batchalign/version +0 -3
  8. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/LICENSE +0 -0
  9. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/MANIFEST.in +0 -0
  10. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/README.md +0 -0
  11. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/__init__.py +0 -0
  12. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/__main__.py +0 -0
  13. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/cli/__init__.py +0 -0
  14. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/cli/dispatch.py +0 -0
  15. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/constants.py +0 -0
  16. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/document.py +0 -0
  17. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/errors.py +0 -0
  18. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/__init__.py +0 -0
  19. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/base.py +0 -0
  20. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/chat/__init__.py +0 -0
  21. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/chat/file.py +0 -0
  22. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/chat/generator.py +0 -0
  23. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/chat/lexer.py +0 -0
  24. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/chat/utils.py +0 -0
  25. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/textgrid/__init__.py +0 -0
  26. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/textgrid/file.py +0 -0
  27. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/textgrid/generator.py +0 -0
  28. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/formats/textgrid/parser.py +0 -0
  29. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/__init__.py +0 -0
  30. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/resolve.py +0 -0
  31. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/speaker/__init__.py +0 -0
  32. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/speaker/config.yaml +0 -0
  33. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/speaker/infer.py +0 -0
  34. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/speaker/utils.py +0 -0
  35. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/training/__init__.py +0 -0
  36. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/training/run.py +0 -0
  37. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/training/utils.py +0 -0
  38. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utils.py +0 -0
  39. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utterance/__init__.py +0 -0
  40. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utterance/dataset.py +0 -0
  41. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utterance/execute.py +0 -0
  42. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utterance/infer.py +0 -0
  43. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utterance/prep.py +0 -0
  44. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/utterance/train.py +0 -0
  45. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/whisper/__init__.py +0 -0
  46. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/whisper/infer_asr.py +0 -0
  47. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/models/whisper/infer_fa.py +0 -0
  48. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/__init__.py +0 -0
  49. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/analysis/__init__.py +0 -0
  50. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/analysis/eval.py +0 -0
  51. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/asr/__init__.py +0 -0
  52. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/asr/rev.py +0 -0
  53. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/asr/utils.py +0 -0
  54. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/asr/whisper.py +0 -0
  55. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/asr/whisperx.py +0 -0
  56. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/base.py +0 -0
  57. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/__init__.py +0 -0
  58. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  59. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  60. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  61. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/retrace.py +0 -0
  62. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  63. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  64. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/cleanup/support/test.test +0 -0
  65. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/dispatch.py +0 -0
  66. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/fa/__init__.py +0 -0
  67. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  68. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  69. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  70. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/pipeline.py +0 -0
  71. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/speaker/__init__.py +0 -0
  72. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  73. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/utr/__init__.py +0 -0
  74. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/utr/rev_utr.py +0 -0
  75. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/utr/utils.py +0 -0
  76. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  77. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/utterance/__init__.py +0 -0
  78. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  79. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/__init__.py +0 -0
  80. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/conftest.py +0 -0
  81. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  82. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  83. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  84. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  85. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  86. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  87. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  88. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  89. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  90. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  91. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  92. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  93. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/fixures.py +0 -0
  94. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  95. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  96. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/tests/test_document.py +0 -0
  97. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/utils/__init__.py +0 -0
  98. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/utils/config.py +0 -0
  99. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/utils/dp.py +0 -0
  100. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign/utils/utils.py +0 -0
  101. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign.egg-info/SOURCES.txt +0 -0
  102. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign.egg-info/dependency_links.txt +0 -0
  103. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign.egg-info/entry_points.txt +0 -0
  104. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign.egg-info/requires.txt +0 -0
  105. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/batchalign.egg-info/top_level.txt +0 -0
  106. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/setup.cfg +0 -0
  107. {batchalign-0.7.1b6 → batchalign-0.7.1b8}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.1b6
3
+ Version: 0.7.1b8
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -149,19 +149,22 @@ def transcribe(ctx, in_dir, out_dir, lang, num_speakers, **kwargs):
149
149
  def loader(file):
150
150
  return file
151
151
 
152
+ asr = "rev"
153
+ if kwargs["whisper"]:
154
+ asr = "whisper"
155
+ if kwargs["whisperx"]:
156
+ asr = "whisperx"
157
+
158
+
152
159
  def writer(doc, output):
160
+ doc.content.insert(0, CustomLine(id="Comment", type=CustomLineType.INDEPENDENT,
161
+ content=f"Batchalign {VERSION_NUMBER.strip()}, ASR Engine {asr}"))
153
162
  CHATFile(doc=doc, special_mor_=True).write(output
154
163
  .replace(".wav", ".cha")
155
164
  .replace(".mp4", ".cha")
156
165
  .replace(".mp3", ".cha"),
157
166
  write_wor=kwargs.get("wor", False))
158
167
 
159
- asr = "rev"
160
- if kwargs["whisper"]:
161
- asr = "whisper"
162
- if kwargs["whisperx"]:
163
- asr = "whisperx"
164
-
165
168
  if kwargs.get("diarize"):
166
169
  _dispatch("transcribe_s",
167
170
  lang, num_speakers, ["mp3", "mp4", "wav"], ctx,
@@ -222,7 +222,7 @@ def chat_parse_doc(lines, special_mor=False):
222
222
  continue
223
223
  # we split because there are multiple languages possible
224
224
  elif "@Languages" in line.strip():
225
- results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().split(",")]
225
+ results["langs"] = [i.strip() for i in line.strip("@Languages:").strip().replace(" ", ",").strip().split(",") if i.strip() != ""]
226
226
  if len(results["langs"]) > 0 and results["langs"][0] == "eng" and special_mor:
227
227
  use_special_mor = True
228
228
  # parse participants; the number of | delinates the metedata field
@@ -37,6 +37,7 @@ repath_file = lambda file_path, new_dir: os.path.join(new_dir, pathlib.Path(file
37
37
 
38
38
 
39
39
  from batchalign.document import *
40
+ from batchalign.constants import *
40
41
  from batchalign.pipelines.base import *
41
42
  from batchalign.formats.chat.parser import chat_parse_utterance
42
43
 
@@ -808,7 +809,52 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
808
809
  ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
809
810
  mor, gra,
810
811
  None, None)
812
+ # split the text up into previous chunks
813
+ chunks = list(enumerate(doc.content[indx].text.split(" ")))
814
+ # filter out everything that could not possibly align
815
+ chunks_align = [(i,j) for i,j in chunks
816
+ if len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
817
+ and (len(j) <= 2 or (j[-2] not in "@"))
818
+ and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"]]
819
+ # hollow out anything we are trying to align, and leave everything else
820
+ chunks_backplate = [[j]
821
+ if not (len(j) != 0 and (j[0] not in ["<", "[", "&", "\x15"])
822
+ and (len(j) <= 2 or (j[-2] not in "@"))
823
+ and j.strip() not in ENDING_PUNCT + MOR_PUNCT + CHAT_IGNORE + ["++"])
824
+ else
825
+ []
826
+ for i,j in chunks]
827
+ # render each into a list
828
+ chunks_chars = []
829
+ for i,j in chunks_align:
830
+ for k in j:
831
+ chunks_chars.append(PayloadTarget(k, payload=i))
832
+ ud_chars = []
833
+ for i,j in enumerate(ut):
834
+ for k in j.text:
835
+ ud_chars.append(ReferenceTarget(k, payload=i))
836
+ # brrr
837
+ aligned = align(chunks_chars, ud_chars, tqdm=False)
838
+ for i in aligned:
839
+ if isinstance(i, Match):
840
+ if i.reference_payload not in chunks_backplate[i.payload]:
841
+ chunks_backplate[i.payload].append(i.reference_payload)
842
+ elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
843
+ # just put it back
844
+ chunks_backplate[i.payload].append(i.key)
845
+ # resolve all the numbers and flatten
846
+ chunks_backplate = [j if isinstance(j, str) else ut[j].text
847
+ for i in chunks_backplate
848
+ for j in i]
849
+
850
+ retokenized_ut = " ".join(i for i in chunks_backplate if i.strip() not in ["(", ")"])
851
+ retokenized_ut = re.sub(r" +", " ", retokenized_ut)
852
+ # pray to everyone that it works---this will simply crash and ignore
853
+ # the utterance if it didn't work, so we are doing this as a sanity
854
+ # check rather than needing the parsed result
855
+ _1, _2 = chat_parse_utterance(retokenized_ut, mor, gra, None, None)
811
856
  doc.content[indx] = Utterance(content=ut,
857
+ text=retokenized_ut,
812
858
  tier=doc.content[indx].tier,
813
859
  time=doc.content[indx].time,
814
860
  custom_dependencies=doc.content[indx].custom_dependencies)
@@ -0,0 +1,3 @@
1
+ 0.7.1-beta.8
2
+ May 21st, 2024
3
+ better retokenize algorithm
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.1b6
3
+ Version: 0.7.1b8
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.1-beta.6
2
- May 17th, 2024
3
- remove Hebrew for UD
File without changes
File without changes
File without changes
File without changes
File without changes