batchalign 0.7.18.post7__tar.gz → 0.7.18.post9__tar.gz

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of batchalign might be problematic. Click here for more details.

Files changed (121)
  1. {batchalign-0.7.18.post7/batchalign.egg-info → batchalign-0.7.18.post9}/PKG-INFO +1 -1
  2. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/cli/cli.py +5 -1
  3. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/document.py +1 -0
  4. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/chat/parser.py +3 -1
  5. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/resolve.py +1 -0
  6. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/ud.py +4 -2
  7. batchalign-0.7.18.post9/batchalign/version +3 -0
  8. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9/batchalign.egg-info}/PKG-INFO +1 -1
  9. batchalign-0.7.18.post7/batchalign/version +0 -3
  10. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/LICENSE +0 -0
  11. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/MANIFEST.in +0 -0
  12. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/README.md +0 -0
  13. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/__init__.py +0 -0
  14. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/__main__.py +0 -0
  15. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/cli/__init__.py +0 -0
  16. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/cli/dispatch.py +0 -0
  17. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/constants.py +0 -0
  18. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/errors.py +0 -0
  19. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/__init__.py +0 -0
  20. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/base.py +0 -0
  21. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/chat/__init__.py +0 -0
  22. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/chat/file.py +0 -0
  23. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/chat/generator.py +0 -0
  24. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/chat/lexer.py +0 -0
  25. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/chat/utils.py +0 -0
  26. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/textgrid/__init__.py +0 -0
  27. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/textgrid/file.py +0 -0
  28. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/textgrid/generator.py +0 -0
  29. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/formats/textgrid/parser.py +0 -0
  30. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/__init__.py +0 -0
  31. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/speaker/__init__.py +0 -0
  32. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/speaker/config.yaml +0 -0
  33. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/speaker/infer.py +0 -0
  34. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/speaker/utils.py +0 -0
  35. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/training/__init__.py +0 -0
  36. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/training/run.py +0 -0
  37. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/training/utils.py +0 -0
  38. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utils.py +0 -0
  39. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/__init__.py +0 -0
  40. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/cantonese_infer.py +0 -0
  41. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/dataset.py +0 -0
  42. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/execute.py +0 -0
  43. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/infer.py +0 -0
  44. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/prep.py +0 -0
  45. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/utterance/train.py +0 -0
  46. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/wave2vec/__init__.py +0 -0
  47. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/wave2vec/infer_fa.py +0 -0
  48. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/whisper/__init__.py +0 -0
  49. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/whisper/infer_asr.py +0 -0
  50. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/models/whisper/infer_fa.py +0 -0
  51. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/__init__.py +0 -0
  52. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/analysis/__init__.py +0 -0
  53. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/analysis/eval.py +0 -0
  54. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/asr/__init__.py +0 -0
  55. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/asr/num2chinese.py +0 -0
  56. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/asr/rev.py +0 -0
  57. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/asr/utils.py +0 -0
  58. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/asr/whisper.py +0 -0
  59. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/asr/whisperx.py +0 -0
  60. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/base.py +0 -0
  61. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/__init__.py +0 -0
  62. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/cleanup.py +0 -0
  63. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/disfluencies.py +0 -0
  64. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/parse_support.py +0 -0
  65. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/retrace.py +0 -0
  66. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/support/filled_pauses.eng +0 -0
  67. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/support/replacements.eng +0 -0
  68. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/cleanup/support/test.test +0 -0
  69. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/dispatch.py +0 -0
  70. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/fa/__init__.py +0 -0
  71. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/fa/wave2vec_fa.py +0 -0
  72. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/fa/whisper_fa.py +0 -0
  73. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/__init__.py +0 -0
  74. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/coref.py +0 -0
  75. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/en/irr.py +0 -0
  76. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/fr/apm.py +0 -0
  77. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/fr/apmn.py +0 -0
  78. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/fr/case.py +0 -0
  79. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/morphosyntax/ja/verbforms.py +0 -0
  80. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/pipeline.py +0 -0
  81. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/speaker/__init__.py +0 -0
  82. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/speaker/nemo_speaker.py +0 -0
  83. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/translate/__init__.py +0 -0
  84. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/translate/gtrans.py +0 -0
  85. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/translate/seamless.py +0 -0
  86. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/translate/utils.py +0 -0
  87. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/utr/__init__.py +0 -0
  88. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/utr/rev_utr.py +0 -0
  89. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/utr/utils.py +0 -0
  90. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/utr/whisper_utr.py +0 -0
  91. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/utterance/__init__.py +0 -0
  92. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/pipelines/utterance/ud_utterance.py +0 -0
  93. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/__init__.py +0 -0
  94. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/conftest.py +0 -0
  95. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/formats/chat/test_chat_file.py +0 -0
  96. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/formats/chat/test_chat_generator.py +0 -0
  97. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/formats/chat/test_chat_lexer.py +0 -0
  98. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/formats/chat/test_chat_parser.py +0 -0
  99. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/formats/chat/test_chat_utils.py +0 -0
  100. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/formats/textgrid/test_textgrid.py +0 -0
  101. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/analysis/test_eval.py +0 -0
  102. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/asr/test_asr_pipeline.py +0 -0
  103. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/asr/test_asr_utils.py +0 -0
  104. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/cleanup/test_disfluency.py +0 -0
  105. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/cleanup/test_parse_support.py +0 -0
  106. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/fa/test_fa_pipeline.py +0 -0
  107. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/fixures.py +0 -0
  108. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/test_pipeline.py +0 -0
  109. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/pipelines/test_pipeline_models.py +0 -0
  110. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/tests/test_document.py +0 -0
  111. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/utils/__init__.py +0 -0
  112. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/utils/config.py +0 -0
  113. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/utils/dp.py +0 -0
  114. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign/utils/utils.py +0 -0
  115. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign.egg-info/SOURCES.txt +0 -0
  116. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign.egg-info/dependency_links.txt +0 -0
  117. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign.egg-info/entry_points.txt +0 -0
  118. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign.egg-info/requires.txt +0 -0
  119. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/batchalign.egg-info/top_level.txt +0 -0
  120. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/setup.cfg +0 -0
  121. {batchalign-0.7.18.post7 → batchalign-0.7.18.post9}/setup.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: batchalign
3
- Version: 0.7.18.post7
3
+ Version: 0.7.18.post9
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -224,6 +224,8 @@ def translate(ctx, in_dir, out_dir, **kwargs):
224
224
  @common_options
225
225
  @click.option("--retokenize/--keeptokens",
226
226
  default=False, help="Retokenize the main line to fit the UD tokenizations.")
227
+ @click.option("--skipmultilang/--multilang",
228
+ default=False, help="skip code switching")
227
229
  @click.option("--lexicon",
228
230
  type=click.Path(exists=True,
229
231
  file_okay=True, dir_okay=False),
@@ -247,7 +249,9 @@ def morphotag(ctx, in_dir, out_dir, **kwargs):
247
249
  doc.ba_special_["special_mor_notation"] = True
248
250
  return (
249
251
  doc,
250
- {"retokenize": kwargs["retokenize"], "mwt": mwt}
252
+ {"retokenize": kwargs["retokenize"],
253
+ "skipmultilang": kwargs["skipmultilang"],
254
+ "mwt": mwt}
251
255
  )
252
256
 
253
257
  def writer(doc, output):
@@ -153,6 +153,7 @@ class Utterance(BaseModel):
153
153
  tier: Tier = Field(default=Tier())
154
154
  content: Sentence
155
155
  text: Optional[str] = Field(default=None)
156
+ override_lang: Optional[str] = Field(default=None)
156
157
  translation: Optional[str] = Field(default=None)
157
158
  time: Optional[Tuple[int,int]] = Field(default=None)
158
159
  custom_dependencies: List[CustomLine] = Field(default=[])
@@ -300,6 +300,7 @@ def chat_parse_doc(lines, special_mor=False):
300
300
  content=line.strip()))
301
301
 
302
302
  # parse the actual utterance
303
+ multilingual = re.findall(r"^\[- (\w+)\]", text)
303
304
  parsed, delim = chat_parse_utterance(text, mor, gra, wor, additional)
304
305
 
305
306
  # get the timing of the utterance
@@ -313,7 +314,8 @@ def chat_parse_doc(lines, special_mor=False):
313
314
  "text": text,
314
315
  "delim": delim,
315
316
  "custom_dependencies": additional,
316
- "translation": translation
317
+ "translation": translation,
318
+ "override_lang": None if len(multilingual) == 0 else multilingual[0]
317
319
  })
318
320
 
319
321
  timing = re.findall(rf"\x15(\d+)_(\d+)\x15", text)
@@ -13,6 +13,7 @@ resolver = {
13
13
  "whisper": {
14
14
  'eng': ("talkbank/CHATWhisper-en-large-v1", "openai/whisper-large-v2"),
15
15
  'yue': ("alvanlii/whisper-small-cantonese", "alvanlii/whisper-small-cantonese"),
16
+ "heb": ("ivrit-ai/whisper-large-v3", "ivrit-ai/whisper-large-v3")
16
17
  }
17
18
  }
18
19
 
@@ -710,7 +710,7 @@ def adlist_postprocessor(i, lang, adlist):
710
710
  return cpy
711
711
 
712
712
  ######
713
- def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, **kwargs):
713
+ def morphoanalyze(doc: Document, retokenize:bool, skipmultilang:bool, status_hook:callable = None, **kwargs):
714
714
  L.debug("Starting Stanza...")
715
715
  inputs = []
716
716
 
@@ -788,6 +788,8 @@ def morphoanalyze(doc: Document, retokenize:bool, status_hook:callable = None, *
788
788
  L.info(f"Stanza processing utterance {indx+1}/{len(doc.content)}")
789
789
  if not isinstance(i, Utterance):
790
790
  continue
791
+ if i.override_lang and skipmultilang:
792
+ continue
791
793
 
792
794
  # generate simplified version of the line
793
795
  line = str(i)
@@ -1014,4 +1016,4 @@ class StanzaEngine(BatchalignEngine):
1014
1016
  self.status_hook = status_hook
1015
1017
 
1016
1018
  def process(self, doc, **kwargs):
1017
- return morphoanalyze(doc, retokenize=kwargs.get("retokenize", False), status_hook=self.status_hook, mwt=kwargs.get("mwt", {}))
1019
+ return morphoanalyze(doc, retokenize=kwargs.get("retokenize", False), skipmultilang=kwargs.get("skipmultilang", False), status_hook=self.status_hook, mwt=kwargs.get("mwt", {}))
@@ -0,0 +1,3 @@
1
+ 0.7.18-post.9
2
+ April 27th, 2025
3
+ temporary hebrew resolution
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: batchalign
3
- Version: 0.7.18.post7
3
+ Version: 0.7.18.post9
4
4
  Summary: Python Speech Language Sample Analysis
5
5
  Author: Brian MacWhinney, Houjun Liu
6
6
  Author-email: macw@cmu.edu, houjun@cmu.edu
@@ -1,3 +0,0 @@
1
- 0.7.18-post.7
2
- April 26th, 2025
3
- l2 for @s