batchalign 0.7.6a12__tar.gz → 0.7.6a14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109) hide show
  1. {batchalign-0.7.6a12/batchalign.egg-info → batchalign-0.7.6a14}/PKG-INFO +1 -1
  2. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/document.py +1 -2
  3. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/utils.py +1 -1
  4. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/ud.py +24 -7
  5. batchalign-0.7.6a14/batchalign/version +3 -0
  6. {batchalign-0.7.6a12 → batchalign-0.7.6a14/batchalign.egg-info}/PKG-INFO +1 -1
  7. batchalign-0.7.6a12/batchalign/version +0 -3
  8. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/LICENSE +0 -0
  9. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/MANIFEST.in +0 -0
  10. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/README.md +0 -0
  11. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/__init__.py +0 -0
  12. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/__main__.py +0 -0
  13. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/cli/__init__.py +0 -0
  14. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/cli/cli.py +0 -0
  15. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/cli/dispatch.py +0 -0
  16. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/constants.py +0 -0
  17. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/errors.py +0 -0
  18. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/__init__.py +0 -0
  19. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/base.py +0 -0
  20. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/__init__.py +0 -0
  21. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/file.py +0 -0
  22. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/generator.py +0 -0
  23. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/lexer.py +0 -0
  24. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/chat/parser.py +0 -0
  25. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/__init__.py +0 -0
  26. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/file.py +0 -0
  27. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/generator.py +0 -0
  28. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/formats/textgrid/parser.py +0 -0
  29. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/__init__.py +0 -0
  30. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/resolve.py +0 -0
  31. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/__init__.py +0 -0
  32. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/config.yaml +0 -0
  33. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/infer.py +0 -0
  34. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/speaker/utils.py +0 -0
  35. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/training/__init__.py +0 -0
  36. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/training/run.py +0 -0
  37. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/training/utils.py +0 -0
  38. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utils.py +0 -0
  39. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/__init__.py +0 -0
  40. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/dataset.py +0 -0
  41. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/execute.py +0 -0
  42. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/infer.py +0 -0
  43. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/prep.py +0 -0
  44. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/utterance/train.py +0 -0
  45. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/whisper/__init__.py +0 -0
  46. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/whisper/infer_asr.py +0 -0
  47. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/models/whisper/infer_fa.py +0 -0
  48. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/__init__.py +0 -0
  49. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/analysis/__init__.py +0 -0
  50. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/analysis/eval.py +0 -0
  51. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/__init__.py +0 -0
  52. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/rev.py +0 -0
  53. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/utils.py +0 -0
  54. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/whisper.py +0 -0
  55. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/asr/whisperx.py +0 -0
  56. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/base.py +0 -0
  57. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/__init__.py +0 -0
  58. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  59. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  60. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  61. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/retrace.py +0 -0
  62. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  63. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  64. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/cleanup/support/test.test +0 -0
  65. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/dispatch.py +0 -0
  66. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/fa/__init__.py +0 -0
  67. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  68. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  69. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  70. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  71. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  72. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/pipeline.py +0 -0
  73. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/speaker/__init__.py +0 -0
  74. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  75. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/__init__.py +0 -0
  76. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/rev_utr.py +0 -0
  77. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/utils.py +0 -0
  78. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  79. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utterance/__init__.py +0 -0
  80. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  81. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/__init__.py +0 -0
  82. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/conftest.py +0 -0
  83. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  84. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  85. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  86. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  87. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  88. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  89. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  90. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  91. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  92. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  93. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  94. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  95. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/fixures.py +0 -0
  96. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  97. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  98. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/tests/test_document.py +0 -0
  99. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/__init__.py +0 -0
  100. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/config.py +0 -0
  101. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/dp.py +0 -0
  102. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign/utils/utils.py +0 -0
  103. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/SOURCES.txt +0 -0
  104. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/dependency_links.txt +0 -0
  105. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/entry_points.txt +0 -0
  106. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/requires.txt +0 -0
  107. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/batchalign.egg-info/top_level.txt +0 -0
  108. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/setup.cfg +0 -0
  109. {batchalign-0.7.6a12 → batchalign-0.7.6a14}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.6a12
3
+ Version: 0.7.6a14
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -324,8 +324,7 @@ class Utterance(BaseModel):
324
324
  to_include.append(TokenType.RETRACE)
325
325
  if include_fp:
326
326
  to_include.append(TokenType.FP)
327
- filtered = filter(lambda x:x.type in to_include,
328
- self.content)
327
+ filtered = filter(lambda x:x.type in to_include, self.content)
329
328
  # chain them together
330
329
  if join_with_spaces:
331
330
  return " ".join([i.text for i in filtered])
@@ -146,7 +146,7 @@ def annotation_clean(content, special=False):
146
146
  cleaned_word = cleaned_word.replace("~","").replace("&~","")
147
147
  cleaned_word = cleaned_word.replace(">","").replace("<","")
148
148
  cleaned_word = cleaned_word.replace("〕","").replace("//","").replace(";","")
149
- cleaned_word = re.sub(r"@[^abcefpoqs]", '', cleaned_word)
149
+ cleaned_word = re.sub(r"@[^abcefpoqsw]", '', cleaned_word)
150
150
  cleaned_word = re.sub(r"&.", '', cleaned_word)
151
151
 
152
152
  return cleaned_word
@@ -837,11 +837,17 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
837
837
  L.debug(f"Encountered an utterance that's likely devoid of morphological information; skipping... utterance='{doc.content[indx]}'")
838
838
  continue
839
839
 
840
+
840
841
  if retokenize:
841
842
  # rewrite the sentence with our desired tokenizations
842
843
  ut, end = chat_parse_utterance(" ".join([i.text for i in sents[0].tokens])+" "+ending,
843
844
  mor, gra,
844
845
  None, None)
846
+ # fix xbxxx
847
+ for i in ut:
848
+ if i.text == "xbxxx" and len(i.morphology) > 0:
849
+ i.text = i.morphology[0].lemma
850
+
845
851
  # split the text up into previous chunks
846
852
  chunks = list(enumerate(doc.content[indx].text.split(" ")))
847
853
  # filter out everything that could not possibly align
@@ -866,6 +872,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
866
872
  for i,j in enumerate(ut):
867
873
  for k in j.text:
868
874
  ud_chars.append(ReferenceTarget(k, payload=i))
875
+ creaky = False
876
+ collected = ""
869
877
  # brrr
870
878
  aligned = align(chunks_chars, ud_chars, tqdm=False)
871
879
  for i in aligned:
@@ -873,16 +881,23 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
873
881
  if i.reference_payload not in chunks_backplate[i.payload]:
874
882
  chunks_backplate[i.payload].append(i.reference_payload)
875
883
  elif isinstance(i, Extra) and i.extra_type == ExtraType.PAYLOAD:
876
- # just put it back
877
- chunks_backplate[i.payload].append(i.key)
884
+ if i.key == "*":
885
+ creaky = not creaky
886
+ chunks_backplate[i.payload].append("*"+collected+"*")
887
+ collected = ""
888
+ elif creaky:
889
+ collected += i.key
890
+ elif not creaky:
891
+ chunks_backplate[i.payload].append(i.key)
878
892
  # we want to replace the morphology of forms that are not actually
879
893
  # supposed to be analyzed
880
894
  elif isinstance(i, Extra) and i.extra_type == ExtraType.REFERENCE:
881
- ut[i.payload].morphology = [Morphology(
882
- lemma = sents[0].tokens[i.payload].text,
883
- pos = "x",
884
- feats = ""
885
- )]
895
+ if ut[i.payload].text != ",":
896
+ ut[i.payload].morphology = [Morphology(
897
+ lemma = sents[0].tokens[i.payload].text if len(sents) > 0 and len(sents[0].tokens) > i.payload and sents[0].tokens[i.payload].text != "xbxxx" else ut[i.payload].text,
898
+ pos = "x",
899
+ feats = ""
900
+ )]
886
901
 
887
902
  poses = [i.morphology[0].pos.upper() for i in ut
888
903
  if i.morphology
@@ -908,6 +923,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
908
923
  retokenized_ut = retokenized_ut.replace(" ↑", "↑")
909
924
  retokenized_ut = re.sub(r"@ ?w ?p", "@wp", retokenized_ut)
910
925
  retokenized_ut = retokenized_ut.replace(" @", "@")
926
+ retokenized_ut = re.sub(r"\*[* ]*", "*", retokenized_ut)
927
+ retokenized_ut = re.sub(r"\*(.*?)\*", r"*\1* ", retokenized_ut)
911
928
  # pray to everyone that it works---this will simply crash and ignore
912
929
  # the utterance if it didn't work, so we are doing this as a sanity
913
930
  # check rather than needing the parsed result
@@ -0,0 +1,3 @@
1
+ 0.7.6-alpha.14
2
+ October 14, 2024
3
+ creaky
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.6a12
3
+ Version: 0.7.6a14
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.6-alpha.12
2
- October 10, 2024
3
- patch bug regarding comma structure
File without changes
File without changes
File without changes
File without changes
File without changes