batchalign 0.7.6a33__tar.gz → 0.7.7a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112) hide show
  1. {batchalign-0.7.6a33/batchalign.egg-info → batchalign-0.7.7a2}/PKG-INFO +1 -1
  2. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/chat/utils.py +1 -1
  3. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/analysis/eval.py +44 -6
  4. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/ud.py +2 -1
  5. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/utterance/ud_utterance.py +2 -1
  6. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/utils/utils.py +3 -0
  7. batchalign-0.7.7a2/batchalign/version +3 -0
  8. {batchalign-0.7.6a33 → batchalign-0.7.7a2/batchalign.egg-info}/PKG-INFO +1 -1
  9. batchalign-0.7.6a33/batchalign/version +0 -3
  10. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/LICENSE +0 -0
  11. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/MANIFEST.in +0 -0
  12. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/README.md +0 -0
  13. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/__init__.py +0 -0
  14. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/__main__.py +0 -0
  15. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/cli/__init__.py +0 -0
  16. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/cli/cli.py +0 -0
  17. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/cli/dispatch.py +0 -0
  18. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/constants.py +0 -0
  19. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/document.py +0 -0
  20. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/errors.py +0 -0
  21. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/__init__.py +0 -0
  22. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/base.py +0 -0
  23. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/chat/__init__.py +0 -0
  24. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/chat/file.py +0 -0
  25. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/chat/generator.py +0 -0
  26. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/chat/lexer.py +0 -0
  27. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/chat/parser.py +0 -0
  28. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/textgrid/__init__.py +0 -0
  29. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/textgrid/file.py +0 -0
  30. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/textgrid/generator.py +0 -0
  31. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/formats/textgrid/parser.py +0 -0
  32. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/__init__.py +0 -0
  33. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/resolve.py +0 -0
  34. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/speaker/__init__.py +0 -0
  35. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/speaker/config.yaml +0 -0
  36. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/speaker/infer.py +0 -0
  37. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/speaker/utils.py +0 -0
  38. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/training/__init__.py +0 -0
  39. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/training/run.py +0 -0
  40. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/training/utils.py +0 -0
  41. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utils.py +0 -0
  42. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utterance/__init__.py +0 -0
  43. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utterance/dataset.py +0 -0
  44. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utterance/execute.py +0 -0
  45. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utterance/infer.py +0 -0
  46. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utterance/prep.py +0 -0
  47. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/utterance/train.py +0 -0
  48. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/whisper/__init__.py +0 -0
  49. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/whisper/infer_asr.py +0 -0
  50. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/models/whisper/infer_fa.py +0 -0
  51. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/__init__.py +0 -0
  52. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/analysis/__init__.py +0 -0
  53. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/asr/__init__.py +0 -0
  54. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/asr/rev.py +0 -0
  55. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/asr/utils.py +0 -0
  56. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/asr/whisper.py +0 -0
  57. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/asr/whisperx.py +0 -0
  58. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/base.py +0 -0
  59. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/__init__.py +0 -0
  60. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  61. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  62. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  63. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/retrace.py +0 -0
  64. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  65. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  66. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/cleanup/support/test.test +0 -0
  67. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/dispatch.py +0 -0
  68. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/fa/__init__.py +0 -0
  69. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  70. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  71. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  72. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  73. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  74. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  75. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  76. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  77. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/pipeline.py +0 -0
  78. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/speaker/__init__.py +0 -0
  79. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  80. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/utr/__init__.py +0 -0
  81. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/utr/rev_utr.py +0 -0
  82. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/utr/utils.py +0 -0
  83. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  84. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/pipelines/utterance/__init__.py +0 -0
  85. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/__init__.py +0 -0
  86. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/conftest.py +0 -0
  87. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  88. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  89. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  90. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  91. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  92. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  93. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  94. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  95. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  96. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  97. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  98. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  99. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/fixures.py +0 -0
  100. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  101. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  102. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/tests/test_document.py +0 -0
  103. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/utils/__init__.py +0 -0
  104. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/utils/config.py +0 -0
  105. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign/utils/dp.py +0 -0
  106. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign.egg-info/SOURCES.txt +0 -0
  107. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign.egg-info/dependency_links.txt +0 -0
  108. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign.egg-info/entry_points.txt +0 -0
  109. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign.egg-info/requires.txt +0 -0
  110. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/batchalign.egg-info/top_level.txt +0 -0
  111. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/setup.cfg +0 -0
  112. {batchalign-0.7.6a33 → batchalign-0.7.7a2}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.6a33
3
+ Version: 0.7.7a2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -108,7 +108,7 @@ def annotation_clean(content, special=False):
108
108
  cleaned_word = re.sub(r"\x15\d+_\d+\x15", '', cleaned_word)
109
109
  if not special:
110
110
  cleaned_word = re.sub(r"&~\w+", '', cleaned_word)
111
- cleaned_word = cleaned_word.replace("(","").replace(")","")
111
+ # cleaned_word = cleaned_word.replace("(","").replace(")","")
112
112
  cleaned_word = cleaned_word.replace("[","").replace("]","")
113
113
  cleaned_word = cleaned_word.replace("<","").replace(">","")
114
114
  cleaned_word = cleaned_word.replace("“","").replace("”","")
@@ -3,6 +3,7 @@ eval.py
3
3
  Engines for transcript evaluation
4
4
  """
5
5
 
6
+ import re
6
7
  from batchalign.document import *
7
8
  from batchalign.pipelines.base import *
8
9
  from batchalign.pipelines.asr.utils import *
@@ -22,11 +23,34 @@ class EvaluationEngine(BatchalignEngine):
22
23
  forms = [ j.text.lower() for i in doc.content for j in i.content if isinstance(i, Utterance)]
23
24
  gold_forms = [ j.text.lower() for i in gold.content for j in i.content if isinstance(i, Utterance)]
24
25
 
25
- forms = [i for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
26
- gold_forms = [i for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
26
+ forms = [i.replace("-", "") for i in forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
27
+ gold_forms = [i.replace("-", "") for i in gold_forms if i.strip() not in MOR_PUNCT+ENDING_PUNCT]
28
+
29
+ forms = [re.sub(r"\((.*)\)",r"", i) for i in forms]
30
+ gold_forms = [re.sub(r"\((.*)\)",r"", i) for i in gold_forms]
31
+
32
+ # if there are single letter frames, we combine them together
33
+ # until the utterance is done or there isn't any left
34
+ forms_finished = []
35
+
36
+ single_sticky = ""
37
+ is_single = False
38
+
39
+ for i in forms:
40
+ if len(i) == 1:
41
+ single_sticky += i
42
+ else:
43
+ if single_sticky != "":
44
+ forms_finished.append(single_sticky)
45
+ single_sticky = ""
46
+ forms_finished.append(i)
47
+
48
+ if single_sticky != "":
49
+ forms_finished.append(single_sticky)
50
+ single_sticky = ""
27
51
 
28
52
  # dp!
29
- alignment = align(forms, gold_forms, False)
53
+ alignment = align(forms_finished, gold_forms, False)
30
54
 
31
55
  # calculate each type of error
32
56
  sub = 0
@@ -39,14 +63,28 @@ class EvaluationEngine(BatchalignEngine):
39
63
  # but if we have <extra.reference> <extra.reference> this is 2 insertions
40
64
 
41
65
  cleaned_alignment = []
66
+ # whether we had a "firstname" in reference document and hence are
67
+ # anticipating a payload for it (the actual name) in the next entry in the
68
+ # alignment
69
+ anticipating_payload = False
42
70
 
43
71
  for i in alignment:
44
72
 
45
73
  if isinstance(i, Extra):
46
- if len(cleaned_alignment) > 0 and i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
47
- cleaned_alignment.pop(-1)
74
+
75
+ if i.extra_type == ExtraType.REFERENCE and "name" in i.key and i.key[:4] != "name":
76
+ if (isinstance(cleaned_alignment[-1], Extra) and
77
+ cleaned_alignment[-1].extra_type == ExtraType.PAYLOAD and
78
+ len(cleaned_alignment) > 0):
79
+ cleaned_alignment.pop(-1)
80
+ else:
81
+ anticipating_payload = True
48
82
  cleaned_alignment.append(Match(i.key, None, None))
49
83
  continue
84
+ elif i.extra_type == ExtraType.PAYLOAD and anticipating_payload:
85
+ anticipating_payload = False
86
+ continue
87
+
50
88
 
51
89
  if prev_error != None and prev_error != i.extra_type:
52
90
  # this is a substitution: we have different "extra"s in
@@ -75,7 +113,7 @@ class EvaluationEngine(BatchalignEngine):
75
113
  cleaned_alignment.append(i)
76
114
 
77
115
  diff = []
78
- for i in alignment:
116
+ for i in cleaned_alignment:
79
117
  if isinstance(i, Extra):
80
118
  diff.append(f"{'+' if i.extra_type == ExtraType.REFERENCE else '-'} {i.key}")
81
119
  else:
@@ -115,6 +115,7 @@ def handler(word, lang=None):
115
115
  target = target.replace('/100', '')
116
116
  target = target.replace('/r', '')
117
117
  target = target.replace('(', '')
118
+ target = target.replace("(","").replace(")","")
118
119
 
119
120
  # remove attachments
120
121
  if "|" in target:
@@ -848,7 +849,7 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
848
849
  inputs.append(line_cut)
849
850
 
850
851
  try:
851
- sents = nlp(line_cut.strip()).sentences
852
+ sents = nlp(line_cut.replace("(","").replace(")","").strip()).sentences
852
853
 
853
854
  if len(sents) == 0:
854
855
  continue
@@ -84,6 +84,7 @@ def parse_tree(subtree):
84
84
  for i in stack]
85
85
 
86
86
  def process_ut(ut, nlp):
87
+
87
88
  # remove punct
88
89
  if (ut.content[-1].type == TokenType.PUNCT or
89
90
  ut.content[-1].text in ENDING_PUNCT):
@@ -142,7 +143,7 @@ def process_ut(ut, nlp):
142
143
  if isinstance(i, Match):
143
144
  matches.append(i)
144
145
  elif i.extra_type == ExtraType.REFERENCE:
145
- new_refs.append(ReferenceTarget(key=i.key, payload=i.payload))
146
+ new_refs.append(ReferenceTarget(key=i.key, payload=i.payload if i.payload else -1))
146
147
 
147
148
  # we now sort the references based on their original utterance order
148
149
  matches = matches + new_refs
@@ -29,6 +29,7 @@ def word_tokenize(str):
29
29
  return tmp.tokenize(str)
30
30
  except LookupError:
31
31
  nltk.download("punkt")
32
+ nltk.download("punkt_tab")
32
33
  return tmp.tokenize(str)
33
34
 
34
35
  def sent_tokenize(str):
@@ -49,6 +50,7 @@ def sent_tokenize(str):
49
50
  return ST(str)
50
51
  except LookupError:
51
52
  nltk.download("punkt")
53
+ nltk.download("punkt_tab")
52
54
  return ST(str)
53
55
 
54
56
  def detokenize(tokens):
@@ -69,6 +71,7 @@ def detokenize(tokens):
69
71
  return TreebankWordDetokenizer().detokenize(tokens)
70
72
  except LookupError:
71
73
  nltk.download("punkt")
74
+ nltk.download("punkt_tab")
72
75
  return TreebankWordDetokenizer().detokenize(tokens)
73
76
 
74
77
  def correct_timing(doc):
@@ -0,0 +1,3 @@
1
+ 0.7.7-alpha.2
2
+ December 06, 2024
3
+ DP logic
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: batchalign
3
- Version: 0.7.6a33
3
+ Version: 0.7.7a2
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.6-alpha.33
2
- November 26, 2024
3
- French APM (minor)
File without changes
File without changes
File without changes
File without changes
File without changes